# Import Libraries

In [1]:
# feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# models
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# evaluation metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# text preprocessing
from gensim.models.word2vec import Word2Vec
import nltk
import re 

# others
import pandas as pd
import numpy as np
import pickle as pk

# Read Dataset

In [2]:
training = pd.read_csv('train.csv', encoding="utf-8").iloc[:,-2:]
training.head()

Unnamed: 0,text,Sentiment
0,I've been here many many times and have never ...,positive
1,"I was actually really impressed, even though I...",positive
2,Excellent. Can't say enough about the sampler...,positive
3,"This was my first time here, a fellow yelper r...",positive
4,I went to BJ's when I lived in California. Th...,positive


In [3]:
testing  = pd.read_csv('test.csv', encoding="utf-8").iloc[:,-2:]
testing.head()

Unnamed: 0,text,Sentiment
0,"This place = Failtown, USA.\n\n\n\nMy friends ...",negative
1,"Yes, as someone stated before, this place make...",negative
2,Ahhh the infamous Heart Attack Grill. \n\nList...,negative
3,One of the WORST experiences of my life. My f...,negative
4,I saw the Heart Attack Grill on television and...,negative


# Preprocessing

In [4]:
# Lowercase Text
training['text'] = training['text'].apply(lambda x:x.lower())
testing['text'] = testing['text'].apply(lambda x:x.lower())

In [5]:
# Label Encoding
map = {'positive':1, 'negative':-1}

training['Sentiment'] = training['Sentiment'].replace(map)
testing['Sentiment'] = testing['Sentiment'].replace(map)

In [6]:
training.head()

Unnamed: 0,text,Sentiment
0,i've been here many many times and have never ...,1
1,"i was actually really impressed, even though i...",1
2,excellent. can't say enough about the sampler...,1
3,"this was my first time here, a fellow yelper r...",1
4,i went to bj's when i lived in california. th...,1


In [7]:
testing.head()

Unnamed: 0,text,Sentiment
0,"this place = failtown, usa.\n\n\n\nmy friends ...",-1
1,"yes, as someone stated before, this place make...",-1
2,ahhh the infamous heart attack grill. \n\nlist...,-1
3,one of the worst experiences of my life. my f...,-1
4,i saw the heart attack grill on television and...,-1


# Train-test split

In [8]:
X_train = training.iloc[:,0]
y_train = training.iloc[:,1]

X_test = testing.iloc[:,0]
y_test = testing.iloc[:,1]

# Required Functions

In [9]:
def modeling(train_vectors,test_vectors):
  names = [#'Naive Bayes model',
           'Max Entropy/Logistic Regression model',
           'KNN model',
           'SVM model']

  models = [#MultinomialNB(),
            LogisticRegression(random_state=0, solver='lbfgs'),
            KNeighborsClassifier(n_neighbors = 3, weights = 'distance', metric = 'cosine', algorithm = 'brute'),
            LinearSVC(C=1)]

  for model,name in zip(models,names):
    print(name)
    print('-'*30)
    model = model.fit(train_vectors, y_train)
    y_pred = model.predict(test_vectors)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('*'*100)

In [10]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        if len(word2vec)>0:
            self.dim=len(word2vec[next(iter(glove_small))])
        else:
            self.dim=0
            
    def fit(self, X,y):
        return self 

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec] 
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

# Feature Extraction

### **PART 1 - TFIDF**

In [11]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

In [12]:
train_vectors.shape,test_vectors.shape

((17933, 549262), (3797, 549262))

### **PART 2 - Negation**

* ADD Negation Features - word appearing after negative words will be negative.
* By doing this, we will have even bigger feature set.
* Now sentences like "I like that place" and "I don't like that place" can be differenciated by the model.

In [13]:
def nega_tag(text):
    transformed = re.sub(r"\b(?:never|nothing|nowhere|noone|none|not|haven't|hasn't|hasnt|hadn't|hadnt|can't|cant|couldn't|couldnt|shouldn't|shouldnt|won't|wont|wouldn't|wouldnt|don't|dont|doesn't|doesnt|didn't|didnt|isnt|isn't|aren't|arent|aint|ain't|hardly|seldom)\b[\w\s]+[^\w\s]", lambda match: re.sub(r'(\s+)(\w+)', r'\1NEG_\2', match.group(0)), text, flags=re.IGNORECASE)
    return(transformed)

text = "I don't like that place , you keep calling awesome."
print(nega_tag(text))

I don't NEG_like NEG_that NEG_place , you keep calling awesome.


In [14]:
X_nega_train = [nega_tag(text) for text in X_train]
X_nega_test = [nega_tag(text) for text in X_test]

In [15]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
train_nega_vectors = vectorizer.fit_transform(X_nega_train)
test_nega_vectors = vectorizer.transform(X_nega_test)

In [16]:
train_nega_vectors.shape, test_nega_vectors.shape

((17933, 610107), (3797, 610107))

In [17]:
ch21 = SelectKBest(chi2, k=200)
train_Kbest_vectors = ch21.fit_transform(train_nega_vectors, y_train)
test_Kbest_vectors = ch21.transform(test_nega_vectors)
train_Kbest_vectors.shape,test_Kbest_vectors.shape

((17933, 200), (3797, 200))

### **PART 3 - GloVe**

In [18]:
GLOVE_6B_100D_PATH = '/content/drive/MyDrive/glove.6B.100d.txt'

In [19]:
X = list(X_train)+list(X_test)
X = [i.split() for i in X]
all_words = set([item for sublist in X for item in sublist])

print(len(list(all_words)))

95586


In [20]:
glove_small = {}
encoding = "utf-8"
with open(GLOVE_6B_100D_PATH, "rb") as infile:
  for line in infile:
    parts = line.split()
    word = parts[0].decode(encoding)
    if (word in all_words):
      nums=np.array(parts[1:], dtype=np.float32)
      glove_small[word] = nums
      
print(len(glove_small))     

25804


In [21]:
glove_Embedding = MeanEmbeddingVectorizer(glove_small)
X_glove_train = glove_Embedding.transform(X_train)
X_glove_test = glove_Embedding.transform(X_test)
X_glove_train.shape,X_glove_test.shape

((17933, 100), (3797, 100))

### **PART 4 - Custom Word2Vec**

In [22]:
model = Word2Vec(X, size=100, window=5, min_count=2, workers=2)
w2v = {w: vec for w, vec in zip(model.wv.index2word, model.wv.syn0)}
print(len(w2v))

41892


  


In [23]:
w2v_Embedding = MeanEmbeddingVectorizer(w2v)
X_w2v_train = w2v_Embedding.transform(X_train)
X_w2v_test = w2v_Embedding.transform(X_test)
X_w2v_train.shape,X_w2v_test.shape

((17933, 100), (3797, 100))

### **PART 5 - Combination**

In [24]:
final_train_vectors = np.c_[train_Kbest_vectors.toarray(), X_glove_train, X_w2v_train]
final_train_vectors.shape

(17933, 400)

In [25]:
final_test_vectors = np.c_[test_Kbest_vectors.toarray(), X_glove_test, X_w2v_test]
final_test_vectors.shape

(3797, 400)

In [26]:
pd.DataFrame(final_train_vectors).head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,360,361,362,363,364,365,366,367,368,369,370,371,372,373,374,375,376,377,378,379,380,381,382,383,384,385,386,387,388,389,390,391,392,393,394,395,396,397,398,399
0,0.042516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.414581,0.129526,-0.128546,0.328955,0.081987,-0.18151,-0.134133,-0.31843,0.202392,0.221848,-0.054815,0.15523,0.00837,0.402443,0.312877,-0.098172,-0.15984,-0.018605,-0.284337,0.32796,-0.35703,-0.222494,0.105351,0.102534,-0.013161,-0.143034,0.166261,-0.395807,0.019706,-0.225649,-0.039584,-0.007226,-0.084747,-0.062607,0.263656,-0.413847,0.269195,-0.245728,-0.132381,0.092718
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.076947,0.0,0.0,0.0,0.046826,0.0,0.033856,0.0,0.0,0.023531,0.0,0.0,0.0,0.0,0.0,0.0,0.02671,0.0,0.053223,0.0,0.047246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.383468,0.15802,-0.134451,0.289637,0.120853,-0.175064,-0.099977,-0.337176,0.153808,0.234126,-0.069888,0.131131,0.061211,0.401538,0.317273,-0.071053,-0.165611,-0.060746,-0.340492,0.279385,-0.289519,-0.228546,0.101558,0.062334,0.051547,-0.072952,0.167329,-0.3384,0.009493,-0.233202,-0.044903,0.054212,-0.129078,-0.074598,0.230258,-0.382317,0.245321,-0.245538,-0.133054,0.068455
2,0.066658,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.359137,0.107554,-0.090144,0.321489,0.013575,-0.161295,-0.10806,-0.136238,0.231315,0.121339,-0.017557,0.240014,-0.035096,0.346115,0.179862,-0.143813,-0.114764,0.034514,-0.165792,0.374699,-0.44236,-0.22607,0.083444,0.086444,-0.05013,-0.194649,0.179441,-0.36918,0.01347,-0.160008,-0.035327,-0.118149,-0.00787,-0.116531,0.188944,-0.392074,0.278408,-0.24951,-0.164445,0.103521
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.352697,0.193896,-0.154121,0.278736,0.172439,-0.176957,-0.059545,-0.286216,0.104446,0.258245,-0.067973,0.139081,0.093881,0.371881,0.300299,-0.051293,-0.120979,-0.073201,-0.369405,0.227905,-0.291198,-0.219209,0.078696,0.034917,0.131794,-0.025111,0.193814,-0.338763,0.009995,-0.209654,-0.015053,0.088545,-0.129436,-0.078541,0.167488,-0.391524,0.247235,-0.272961,-0.104942,0.040901
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.415879,0.124788,-0.147817,0.317775,0.11619,-0.19289,-0.11349,-0.359569,0.183749,0.23754,-0.058266,0.123283,0.035277,0.426068,0.340481,-0.090936,-0.175943,-0.06244,-0.319929,0.301626,-0.304787,-0.229555,0.103516,0.088747,0.030072,-0.117174,0.17206,-0.395337,-0.010396,-0.238733,-0.029245,0.053632,-0.115299,-0.049734,0.245004,-0.413453,0.285686,-0.248716,-0.103051,0.06695


In [27]:
y_train[:5]

0    1
1    1
2    1
3    1
4    1
Name: Sentiment, dtype: int64

# Modeling

In [28]:
modeling(final_train_vectors,final_test_vectors)

Max Entropy/Logistic Regression model
------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[[1600  197]
 [ 519 1481]]
              precision    recall  f1-score   support

          -1       0.76      0.89      0.82      1797
           1       0.88      0.74      0.81      2000

    accuracy                           0.81      3797
   macro avg       0.82      0.82      0.81      3797
weighted avg       0.82      0.81      0.81      3797

****************************************************************************************************
KNN model
------------------------------
[[1347  450]
 [ 757 1243]]
              precision    recall  f1-score   support

          -1       0.64      0.75      0.69      1797
           1       0.73      0.62      0.67      2000

    accuracy                           0.68      3797
   macro avg       0.69      0.69      0.68      3797
weighted avg       0.69      0.68      0.68      3797

****************************************************************************************************
SVM model
------------------------------
[[1571  

