In [1]:
#read from files and create a dataframe of data

import glob
import pandas as pd
import string
import re
import os
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import NearestCentroid
from nltk import word_tokenize

neg_list = glob.glob("./data/neg/*.txt")
pos_list = glob.glob("./data/pos/*.txt")

stop_words = set(stopwords.words('english'))

doc_list = []
filename_list = []

#reading the data
for file in neg_list:
    file1 = open(file,"r").read()
    doc_list.append([file1, 0])
    filename_list.append(os.path.basename(file))

for file in pos_list:
    file1 = open(file,"r").read()
    doc_list.append([file1, 1])
    filename_list.append(os.path.basename(file))
    
total_avg_sentences = 0
total_sentences = 0
total_tokens = 0
min_sentence = 10000
max_sentence = 0
min_tokens = 10000
max_tokens = 0

for x in range(len(filename_list)):
    sentences = doc_list[x][0].split('\n')
    tokens = word_tokenize(doc_list[x][0])
    
    total_avg_sentences += len(tokens) / len(sentences)
    
    total_sentences += len(sentences) 
    total_tokens += len(tokens)
    
    if (len(sentences) > max_sentence):
        max_sentence = len(sentences)
    if (len(sentences) < min_sentence):
        min_sentence = len(sentences)
        
    if (len(tokens) > max_tokens):
        max_tokens = len(tokens)
    if (len(tokens) < min_tokens):
        min_tokens = len(tokens)

print("document length in sentences", "min:", min_sentence, "max:", max_sentence, "average:", total_sentences/len(filename_list))
print("document length in tokens", "min:", min_tokens, "max:", max_tokens, "average:", total_tokens/len(filename_list))
print("document sentence lengths in collection", total_avg_sentences/len(filename_list))
    
data = pd.DataFrame(doc_list, columns = ['text' , 'sentiment'])

document length in sentences min: 2 max: 113 average: 33.36
document length in tokens min: 18 max: 2753 average: 762.5195
document sentence lengths in collection 23.37124054883429


In [2]:
#data set cleaner which removes punctuation, and converts everything to lower case

tokenizer = WordPunctTokenizer()

def clean_dataset(text):
    lower_case = text.lower()
    letters_only = re.sub("[^a-zA-Z]", " ", lower_case)
    tokens = tokenizer.tokenize(letters_only)
    return (" ".join(tokens)).strip()

In [3]:
tqdm.pandas(desc="progress-bar")

def post_process(data, n=1000000):
    data = data.head(n)
    data['text'] = data['text'].progress_map(clean_dataset)  
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data

data = post_process(data)

progress-bar: 100%|██████████████████████████████████████████████████████████████| 2000/2000 [00:01<00:00, 1597.08it/s]


In [4]:
#splitting our model into test_set and validation set
from sklearn.model_selection import train_test_split
SEED = 1234

x_train, x_validate, y_train, y_validate = train_test_split(data.text, data.sentiment, test_size=0.15, random_state=SEED)

In [5]:
#feature selecting using Count Vectorizer

cv = CountVectorizer()
cv.set_params(stop_words=stop_words, max_features=2000)

#building our pipeline for count vectorizer and logistic regression
lg_cv_pipeline = Pipeline([('vectorizer', cv), ('classifier', LogisticRegression(solver='liblinear'))])

#feature selecting using Tf-idf Vectorizer

tfidf = TfidfVectorizer()
tfidf.set_params(stop_words=stop_words, max_features=2000)

#building our pipeline for tfidf and logistic regression
lg_tfidf_pipeline = Pipeline([('vectorizer', tfidf), ('classifier', LogisticRegression(solver='liblinear'))])

In [6]:
#building our pipeline for count vectorizer and Ridge Classifier
rc_cv_pipeline = Pipeline([('vectorizer', cv), ('classifier', RidgeClassifier())])
#building our pipeline for tfidf and Ridge Classifier
rc_tfidf_pipeline = Pipeline([('vectorizer', tfidf), ('classifier', RidgeClassifier())])

In [7]:
#building our pipeline for count vectorizer and Nearest Centroid
nc_cv_pipeline = Pipeline([('vectorizer', cv), ('classifier', NearestCentroid())])
#building our pipeline for tfidf and Nearest Centroid
nc_tfidf_pipeline = Pipeline([('vectorizer', tfidf), ('classifier', NearestCentroid())])

In [8]:
lgcv = cross_val_score(lg_cv_pipeline, x_train, y_train, cv=5)
lg_tfidf= cross_val_score(lg_tfidf_pipeline, x_train, y_train, cv=5)
rccv = cross_val_score(rc_cv_pipeline, x_train, y_train, cv=5)
rctfidf = cross_val_score(rc_tfidf_pipeline, x_train, y_train, cv=5)
nccv = cross_val_score(nc_cv_pipeline, x_train, y_train, cv=5)
nctfidf = cross_val_score(nc_tfidf_pipeline, x_train, y_train, cv=5)

In [9]:
print(lgcv.mean(), lg_tfidf.mean(), rccv.mean(), rctfidf.mean(), nccv.mean(), nctfidf.mean())

0.8164705882352941 0.8288235294117646 0.7323529411764707 0.8299999999999998 0.6717647058823529 0.8011764705882353


In [10]:
for max_features in [500, 1000, 1500, 2000, 2500, 3000]:
    cv.set_params(stop_words=stop_words, max_features=max_features)
    tfidf.set_params(stop_words=stop_words, max_features=max_features)
    lg_cv_pipeline = Pipeline([('vectorizer', cv), ('classifier', LogisticRegression(solver='liblinear'))])
    lg_tfidf_pipeline = Pipeline([('vectorizer', tfidf), ('classifier', LogisticRegression(solver='liblinear'))])
    rc_cv_pipeline = Pipeline([('vectorizer', cv), ('classifier', RidgeClassifier())])
    rc_tfidf_pipeline = Pipeline([('vectorizer', tfidf), ('classifier', RidgeClassifier())])
    lgcv = cross_val_score(lg_cv_pipeline, x_train, y_train, cv=10)
    lg_tfidf= cross_val_score(lg_tfidf_pipeline, x_train, y_train, cv=10)
    rccv = cross_val_score(rc_cv_pipeline, x_train, y_train, cv=10)
    rctfidf = cross_val_score(rc_tfidf_pipeline, x_train, y_train, cv=10)
    nccv = cross_val_score(nc_cv_pipeline, x_train, y_train, cv=10)
    nctfidf = cross_val_score(nc_tfidf_pipeline, x_train, y_train, cv=10)
    print(max_features, lgcv.mean(), lg_tfidf.mean(), rccv.mean(), rctfidf.mean(), nccv.mean(), nctfidf.mean())
    

500 0.741764705882353 0.7858823529411765 0.7405882352941177 0.7688235294117647 0.6535294117647059 0.7623529411764706
1000 0.7911764705882354 0.8170588235294117 0.6941176470588235 0.8152941176470587 0.6623529411764706 0.7817647058823529
1500 0.7988235294117647 0.8400000000000001 0.6841176470588236 0.8305882352941175 0.6641176470588236 0.7970588235294118
2000 0.8135294117647058 0.8470588235294118 0.7305882352941176 0.8358823529411765 0.666470588235294 0.8070588235294117
2500 0.8164705882352941 0.8441176470588235 0.7476470588235294 0.8382352941176471 0.6688235294117646 0.8047058823529412
3000 0.8123529411764705 0.8405882352941175 0.7641176470588235 0.8358823529411763 0.6729411764705882 0.8017647058823529


**500** &ensp;0.741764705882353 0.7858823529411765 0.74 0.7688235294117647 0.6535294117647059 0.7623529411764706<br>
**1000** 0.7911764705882354 0.8170588235294117 0.6941176470588235 0.8158823529411764 0.6623529411764706 0.7817647058823529<br>
**1500** 0.7988235294117647 0.8400000000000001 0.6847058823529413 0.8305882352941175 0.6641176470588236 0.7970588235294118<br>
**2000** 0.8135294117647058 **0.8470588235294118** 0.7305882352941176 0.8358823529411765 0.666470588235294 0.8070588235294117<br>
**2500** 0.8164705882352941 0.8441176470588235 0.7476470588235294 **0.8388235294117647** 0.6688235294117646 0.8047058823529412<br>
**3000** 0.8123529411764705 0.8405882352941175 0.7652941176470588 0.8358823529411763 0.6729411764705882 0.8017647058823529<br>

In [11]:
print(len(x_train), len(x_validate))

1700 300


## Analysis

Based on the results it becomes very evident that **TFIDF feature selection method** is more accurate than using a count vectorization feature selection method.

Increasing the amount of maximum features selected gradually improves the performance of the models up until the 2000 features mark. Beyond that we see very marginal improvements in performance, and in some cases a decrease in performance. Furthermore, increasing the amount of features we look at also increases the time and complexity of the model overall. Therefore, our most favorable selection would be somewhere between **2000 and 2500 maximum features**. 

Between the 3 models that were used it becomes evident that **Logistic Regression and Ridge Classifiers** both provide very similar performance slightly beating out Nearest Centroid 

Judging from our results above we can see that the logistic regression using the TFIDF feature selection with maximum features of 2000 method yields the best result.

We can also observe the Ridge classifier using the TFIDF feature selection with maximum features of 2500 also yields some good results. 

In [12]:
#testing our first "Good" model

tfidf.set_params(stop_words=stop_words, max_features=2000)
lg_tfidf_pipeline = Pipeline([('vectorizer', tfidf), ('classifier', LogisticRegression(solver='liblinear'))])
model = lg_tfidf_pipeline.fit(x_train, y_train)
result = model.predict(x_validate)
print('The f1 score is', f1_score(y_validate, result, average="macro"))
print('The precision score is', precision_score(y_validate, result, average="macro"))
print('The recal score is', recall_score(y_validate, result, average="macro")) 
print(confusion_matrix(y_validate, result))

The f1 score is 0.8299981110901232
The precision score is 0.8300146673185476
The recal score is 0.8300000000000001
[[124  26]
 [ 25 125]]


In [13]:
#testing our second "Good" model

tfidf.set_params(stop_words=stop_words, max_features=2500)
rc_tfidf_pipeline = Pipeline([('vectorizer', tfidf), ('classifier', RidgeClassifier())])
model = rc_tfidf_pipeline.fit(x_train, y_train)
result = model.predict(x_validate)
print('The f1 score is', f1_score(y_validate, result, average="macro"))
print('The precision score is', precision_score(y_validate, result, average="macro"))
print('The recal score is', recall_score(y_validate, result, average="macro")) 
print(confusion_matrix(y_validate, result))

The f1 score is 0.8399715504978664
The precision score is 0.840241949830991
The recal score is 0.8400000000000001
[[128  22]
 [ 26 124]]


In [14]:
#testing one of our worst model

cv.set_params(stop_words=stop_words, max_features=500)
nc_cv_pipeline = Pipeline([('vectorizer', cv), ('classifier', NearestCentroid())])
model = nc_cv_pipeline.fit(x_train, y_train)
result = model.predict(x_validate)
print('The f1 score is', f1_score(y_validate, result, average="macro"))
print('The precision score is', precision_score(y_validate, result, average="macro"))
print('The recal score is', recall_score(y_validate, result, average="macro")) 
print(confusion_matrix(y_validate, result))

The f1 score is 0.6550324675324675
The precision score is 0.669779286926995
The recal score is 0.66
[[117  33]
 [ 69  81]]
