In [2]:
import numpy as np
import pandas as pd
import re


In [3]:
o_train = pd.read_csv("train.csv")
o_test = pd.read_csv("test_pred.csv")

In [4]:
o_train.sample(10)
o_test.sample(10)

Unnamed: 0,text,sentiment
2068,r u serious min delay with americanairlines aa...,negative
728,hours on hold and issue still not resolved hop...,negative
3255,this is all i gotta say to yall amp your staff...,positive
1120,some of my kids are anxious husband and i cant...,negative
2099,thanks so much,positive
4155,all flights cancelled flighted tomorrow and th...,negative
3533,umm hello,positive
251,how do you manage to place a family of into a ...,negative
1280,its just a very bad customer service experienc...,negative
1449,do families no longer get early boarding with ...,negative


In [5]:
print(o_train.shape)
print(o_test.shape)

(10000, 2)
(4640, 2)


In [6]:
train1 = o_train
test1 = o_test

In [7]:
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces
    return text

train1["text"] = train1["text"].apply(preprocess_text)

test1["text"] = o_test["text"].apply(preprocess_text)

from sklearn.preprocessing import OrdinalEncoder

en = OrdinalEncoder()
train1["sentiment"] = en.fit_transform(train1[["sentiment"]])
test1["sentiment"] = en.transform(test1[["sentiment"]])

In [8]:
print(train1.sample(10))

from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(train1, test_size = 0.2, random_state = 42)

train_set["sentiment"].value_counts()


                                                   text  sentiment
8809  my luggage is gone ive filed my paperwork prom...        0.0
9282  holy high speed internet batman speeds at unit...        2.0
1047  wow awesome videos guys great work bluemanity ...        2.0
8724  marks th year with new bluemanity plane design...        2.0
2648  if someone had bothered to inform us that the ...        0.0
6946                                 just did thank you        2.0
9427      loving the free wifi and legroom seattlebound        2.0
4378  their names are both angel seriously how cool ...        2.0
1268  only happened because u couldnt get us home th...        0.0
7028  still havent left maybe by the time im suppose...        0.0


sentiment
0.0    4609
1.0    1925
2.0    1466
Name: count, dtype: int64

In [9]:
train1["sentiment"].value_counts()

sentiment
0.0    5759
1.0    2408
2.0    1833
Name: count, dtype: int64

In [10]:
print(train1.loc[1, ["text"]].iloc[0])

delayed twice now cancelled flightedsent complaint email yet no response need an explanation pls notsatisfied smh unhappy


In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
vectorizer.fit(train_set.loc[:,"text"])
X_train = vectorizer.transform(train_set["text"])
X_test = vectorizer.transform(test_set["text"])
print(vectorizer.get_feature_names_out())
print(len(vectorizer.get_feature_names_out()))




['aa' 'aaba' 'aampc' ... 'zoom' 'zrh' 'zurich']
9119


In [12]:

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

full_model3 = Pipeline([("vectorizer", TfidfVectorizer()),("clf", LogisticRegression(max_iter = 1000))])
y_train = train_set["sentiment"]
y_test = test_set["sentiment"]

full_model3.fit(train_set["text"], y_train)
y_pred = full_model3.predict(test_set["text"])

In [13]:
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score

conf_mat = confusion_matrix(test_set["sentiment"], y_pred)
print("precision: ", precision_score(test_set["sentiment"], y_pred, average="micro"))
print("recall: ", recall_score(test_set["sentiment"], y_pred, average="micro"))
print("confusion_mat")
print(conf_mat)

print(en.categories_)

precision:  0.76
recall:  0.76
confusion_mat
[[1065   67   18]
 [ 203  240   40]
 [ 109   43  215]]
[array(['negative', 'neutral', 'positive'], dtype=object)]


In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

full_model2 = Pipeline([("vectorizer", TfidfVectorizer(stop_words = "english")), ("clf", MultinomialNB())])


full_model2.fit(train_set["text"], y_train)
y_bayes_pred = full_model2.predict(test_set["text"])
conf_mat_bayes = confusion_matrix(test_set["sentiment"], y_bayes_pred)
print("precision: ", precision_score(test_set["sentiment"], y_bayes_pred, average="micro"))
print("recall: ", recall_score(test_set["sentiment"], y_bayes_pred, average="micro"))
print("confusion_mat")
print(conf_mat)

precision:  0.657
recall:  0.657
confusion_mat
[[1065   67   18]
 [ 203  240   40]
 [ 109   43  215]]


In [15]:
from gensim.models import KeyedVectors, Word2Vec

w2v_model = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

In [16]:
train_set.iloc[2, 0].strip().split()
w2v_model.vector_size

300

In [27]:
#create a training data from original with word2vec google pretrained and tfidf to create a average vector sentence
from sklearn.base import BaseEstimator, TransformerMixin
class tfidf_w2v(BaseEstimator, TransformerMixin):
    def __init__(self, vectorizer_ = None, embedding_ = None):
        self.vectorizer_ = vectorizer_
        self.embedding_ = embedding_
        self.embedding_dim_ = len(embedding_["hello"])

    #preprocessor
    def _preprocess_text(self, text):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'http\S+', '', text)  # Remove URLs
        text = re.sub(r'@\w+', '', text)  # Remove mentions
        text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespaces
        return text
        
    #fit
    def fit(self, X,y = None):
        self.vectorizer_.fit(X)
        return self
    def transform(self, X):
        feats = self.vectorizer_.get_feature_names_out()
        idf_ = self.vectorizer_.idf_
        idf_dict = dict(zip(feats, idf_))

        X_out = []
        
        for i in range(len(X)):
            vec = np.zeros(self.embedding_dim_)
            
            sentence = X.iloc[i] if hasattr(X, "iloc") else X[i]
            sentence_tokens = sentence.strip().split() #0 is the index of text
            for token in sentence_tokens:
                if token in self.embedding_ and token in idf_dict:
                    vec += (self.embedding_[token])*idf_dict[token]
                    
                if token not in self.embedding_ and token in idf_dict:
                    vec += np.ones(self.embedding_dim_) * idf_dict[token]
                    
           
            X_out.append(vec)


        return np.array(X_out)
    
             

In [28]:
from sklearn.pipeline import Pipeline 
full_model1 = Pipeline([("vectorizer", tfidf_w2v(vectorizer_ = TfidfVectorizer(),embedding_=w2v_model)), ("clf", LogisticRegression(max_iter = 2000))])

full_model1.fit(train_set["text"], y_train)

y_pred_log3 = full_model1.predict(test_set["text"])

conf_mat = confusion_matrix(test_set["sentiment"], y_pred_log3)
print("precision: ", precision_score(test_set["sentiment"], y_pred_log3, average="micro"))
print("recall: ", recall_score(test_set["sentiment"], y_pred_log3, average="micro"))
print("confusion_mat")
print(conf_mat)

precision:  0.748
recall:  0.748
confusion_mat
[[974 129  47]
 [142 282  59]
 [ 73  54 240]]


In [29]:
from sklearn.ensemble import VotingClassifier

vot_clf = VotingClassifier(estimators = [("model1", full_model1),  ("model3", full_model3), ], voting="soft")


In [30]:
vot_clf.fit(train_set["text"], y_train)

0,1,2
,estimators,"[('model1', ...), ('model3', ...)]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,vectorizer_,TfidfVectorizer()
,embedding_,<gensim.model...0022FA5AC06D0>

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,2000

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [31]:
y_pred = vot_clf.predict(test_set["text"])

conf_mat = confusion_matrix(test_set["sentiment"], y_pred)
print("precision: ", precision_score(test_set["sentiment"], y_pred, average="micro"))
print("recall: ", recall_score(test_set["sentiment"], y_pred, average="micro"))
print("confusion_mat")
print(conf_mat)

precision:  0.7765
recall:  0.7765
confusion_mat
[[1042   78   30]
 [ 165  267   51]
 [  89   34  244]]


In [33]:
from sklearn.base import clone
final_model = clone(vot_clf)
final_model.fit(train1["text"], train1["sentiment"])


0,1,2
,estimators,"[('model1', ...), ('model3', ...)]"
,voting,'soft'
,weights,
,n_jobs,
,flatten_transform,True
,verbose,False

0,1,2
,vectorizer_,TfidfVectorizer()
,embedding_,<gensim.model...0022FD56BD950>

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,2000

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [34]:
y_pred_final = final_model.predict(test1["text"])

In [35]:
conf_mat = confusion_matrix(test1["sentiment"], y_pred_final)
print("precision: ", precision_score(test1["sentiment"], y_pred_final, average="micro"))
print("recall: ", recall_score(test1["sentiment"], y_pred_final, average="micro"))
print("confusion_mat")
print(conf_mat)

precision:  0.8174568965517242
recall:  0.8174568965517242
confusion_mat
[[2980   92   28]
 [ 375  407   32]
 [ 239   81  406]]
