# Import Libraries

In [1]:
# feature extraction
from sklearn.feature_extraction.text import TfidfVectorizer

# feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# models
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

# evaluation metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# others
import nltk
import re 
import pandas as pd
import numpy as np
import pickle as pk

# Read Dataset

In [2]:
training = pd.read_csv('train.csv', encoding="utf-8").iloc[:,-2:]
training.head()

Unnamed: 0,text,Sentiment
0,I've been here many many times and have never ...,positive
1,"I was actually really impressed, even though I...",positive
2,Excellent. Can't say enough about the sampler...,positive
3,"This was my first time here, a fellow yelper r...",positive
4,I went to BJ's when I lived in California. Th...,positive


In [3]:
testing  = pd.read_csv('test.csv', encoding="utf-8").iloc[:,-2:]
testing.head()

Unnamed: 0,text,Sentiment
0,"This place = Failtown, USA.\n\n\n\nMy friends ...",negative
1,"Yes, as someone stated before, this place make...",negative
2,Ahhh the infamous Heart Attack Grill. \n\nList...,negative
3,One of the WORST experiences of my life. My f...,negative
4,I saw the Heart Attack Grill on television and...,negative


# Lowercase Text

In [4]:
training['text'] = training['text'].apply(lambda x:x.lower())
testing['text'] = testing['text'].apply(lambda x:x.lower())

# Label Encoding
Positive to 1,Negative to -1 

In [5]:
map = {'positive':1, 'negative':-1}

training['Sentiment'] = training['Sentiment'].replace(map)
testing['Sentiment'] = testing['Sentiment'].replace(map)

In [6]:
training.head()

Unnamed: 0,text,Sentiment
0,i've been here many many times and have never ...,1
1,"i was actually really impressed, even though i...",1
2,excellent. can't say enough about the sampler...,1
3,"this was my first time here, a fellow yelper r...",1
4,i went to bj's when i lived in california. th...,1


In [7]:
testing.head()

Unnamed: 0,text,Sentiment
0,"this place = failtown, usa.\n\n\n\nmy friends ...",-1
1,"yes, as someone stated before, this place make...",-1
2,ahhh the infamous heart attack grill. \n\nlist...,-1
3,one of the worst experiences of my life. my f...,-1
4,i saw the heart attack grill on television and...,-1


# Train-test split

In [8]:
X_train = training.iloc[:,0]
y_train = training.iloc[:,1]

X_test = testing.iloc[:,0]
y_test = testing.iloc[:,1]

# **PART 1**

### Feature Extraction

In [9]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)

In [10]:
train_vectors.shape,test_vectors.shape

((17933, 549262), (3797, 549262))

### Modeling

In [11]:
def modeling(train_vectors,test_vectors):
  names = ['Naive Bayes model',
           'Max Entropy/Logistic Regression model',
           'KNN model',
           'SVM model']

  models = [MultinomialNB(),
            LogisticRegression(random_state=0, solver='lbfgs'),
            KNeighborsClassifier(n_neighbors = 3, weights = 'distance', metric = 'cosine', algorithm = 'brute'),
            LinearSVC(C=100)]

  for model,name in zip(models,names):
    print(name)
    print('-'*30)
    model = model.fit(train_vectors, y_train)
    y_pred = model.predict(test_vectors)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    print('*'*100)

In [12]:
modeling(train_vectors,test_vectors)

Naive Bayes model
------------------------------
[[1758   39]
 [1237  763]]
              precision    recall  f1-score   support

          -1       0.59      0.98      0.73      1797
           1       0.95      0.38      0.54      2000

    accuracy                           0.66      3797
   macro avg       0.77      0.68      0.64      3797
weighted avg       0.78      0.66      0.63      3797

****************************************************************************************************
Max Entropy/Logistic Regression model
------------------------------
[[1589  208]
 [ 350 1650]]
              precision    recall  f1-score   support

          -1       0.82      0.88      0.85      1797
           1       0.89      0.82      0.86      2000

    accuracy                           0.85      3797
   macro avg       0.85      0.85      0.85      3797
weighted avg       0.86      0.85      0.85      3797

*************************************************************************



# **PART 2**

### Feature Extraction

* ADD Negation Features - word appearing after negative words will be negative.
* By doing this, we will have even bigger feature set.
* Now sentences like "I like that place" and "I don't like that place" can be differenciated by the model.

In [13]:
def nega_tag(text):
    transformed = re.sub(r"\b(?:never|nothing|nowhere|noone|none|not|haven't|hasn't|hasnt|hadn't|hadnt|can't|cant|couldn't|couldnt|shouldn't|shouldnt|won't|wont|wouldn't|wouldnt|don't|dont|doesn't|doesnt|didn't|didnt|isnt|isn't|aren't|arent|aint|ain't|hardly|seldom)\b[\w\s]+[^\w\s]", lambda match: re.sub(r'(\s+)(\w+)', r'\1NEG_\2', match.group(0)), text, flags=re.IGNORECASE)
    return(transformed)

text = "I don't like that place , you keep calling awesome."
print(nega_tag(text))

I don't NEG_like NEG_that NEG_place , you keep calling awesome.


In [14]:
X_nega_train = [nega_tag(text) for text in X_train]
X_nega_test = [nega_tag(text) for text in X_test]

In [16]:
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
train_nega_vectors = vectorizer.fit_transform(X_nega_train)
test_nega_vectors = vectorizer.transform(X_nega_test)

In [17]:
train_nega_vectors.shape, test_nega_vectors.shape

((17933, 610107), (3797, 610107))

### Modeling

In [18]:
modeling(train_nega_vectors,test_nega_vectors)

Naive Bayes model
------------------------------
[[1755   42]
 [1128  872]]
              precision    recall  f1-score   support

          -1       0.61      0.98      0.75      1797
           1       0.95      0.44      0.60      2000

    accuracy                           0.69      3797
   macro avg       0.78      0.71      0.67      3797
weighted avg       0.79      0.69      0.67      3797

****************************************************************************************************
Max Entropy/Logistic Regression model
------------------------------
[[1590  207]
 [ 340 1660]]
              precision    recall  f1-score   support

          -1       0.82      0.88      0.85      1797
           1       0.89      0.83      0.86      2000

    accuracy                           0.86      3797
   macro avg       0.86      0.86      0.86      3797
weighted avg       0.86      0.86      0.86      3797

*************************************************************************



# **PART 3**

### Feature Selection

In [19]:
ch21 = SelectKBest(chi2, k=600)
train_Kbest_vectors = ch21.fit_transform(train_nega_vectors, y_train)
test_Kbest_vectors = ch21.transform(test_nega_vectors)
train_Kbest_vectors.shape

(17933, 600)

### Modeling

In [20]:
modeling(train_Kbest_vectors,test_Kbest_vectors)

Naive Bayes model
------------------------------
[[1700   97]
 [ 829 1171]]
              precision    recall  f1-score   support

          -1       0.67      0.95      0.79      1797
           1       0.92      0.59      0.72      2000

    accuracy                           0.76      3797
   macro avg       0.80      0.77      0.75      3797
weighted avg       0.80      0.76      0.75      3797

****************************************************************************************************
Max Entropy/Logistic Regression model
------------------------------
[[1598  199]
 [ 478 1522]]
              precision    recall  f1-score   support

          -1       0.77      0.89      0.83      1797
           1       0.88      0.76      0.82      2000

    accuracy                           0.82      3797
   macro avg       0.83      0.83      0.82      3797
weighted avg       0.83      0.82      0.82      3797

*************************************************************************

