In [1]:
#read from files and create a dataframe of data

import glob
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import NearestCentroid

# glob.glob returns matches in arbitrary, filesystem-dependent order. Sorting
# fixes the row order of the dataset, so the seeded train/validation split
# further down selects the same reviews on every machine and every run.
neg_list = sorted(glob.glob("./data/neg/*.txt"))
pos_list = sorted(glob.glob("./data/pos/*.txt"))

# English stop words from the NLTK corpus, stored as a set.
stop_words = set(stopwords.words('english'))

doc_list = []

# Read every review into memory as a [text, label] row:
# label 0 for files under ./data/neg, label 1 for files under ./data/pos.
# The `with` block closes each file handle as soon as its contents are read,
# instead of leaving the handles open until garbage collection.
for paths, label in ((neg_list, 0), (pos_list, 1)):
    for path in paths:
        with open(path, "r") as review_file:
            doc_list.append([review_file.read(), label])

# One row per review: raw text plus its 0/1 sentiment label.
data = pd.DataFrame(doc_list, columns = ['text' , 'sentiment'])

In [2]:
# Text cleaner: lower-cases the text and replaces every character that is not
# an ASCII letter (punctuation, digits, symbols) with a space.

tokenizer = WordPunctTokenizer()

def clean_dataset(text):
    """Return `text` lower-cased and reduced to space-separated letter-only words.

    Args:
        text: the raw document as a string.

    Returns:
        A string in which every character outside a-z/A-Z has been replaced by
        a space, the remainder has been tokenized with the module-level
        `tokenizer`, and the tokens have been re-joined with single spaces.
    """
    ascii_letters = re.sub("[^a-zA-Z]", " ", text.lower())
    words = tokenizer.tokenize(ascii_letters)
    rejoined = " ".join(words)
    return rejoined.strip()

In [3]:
# Registers the progress_* methods (progress_map, ...) on pandas objects.
tqdm.pandas(desc="progress-bar")

def post_process(data, n=1000000):
    """Clean the 'text' column of the first `n` rows and return a new frame.

    Args:
        data: DataFrame with a 'text' column of raw documents.
        n: number of leading rows to keep (passed to DataFrame.head).

    Returns:
        A new DataFrame with the same columns, whose 'text' values have been
        passed through clean_dataset and whose index is renumbered from 0.
        The frame passed in is not modified.
    """
    # Work on an explicit copy: assigning a column on the bare result of
    # head() triggers pandas' chained-assignment warning.
    subset = data.head(n).copy()
    subset['text'] = subset['text'].progress_map(clean_dataset)
    # drop=True discards the old index directly. The former reset_index() +
    # drop('index') pair would have removed a genuine column named 'index'.
    return subset.reset_index(drop=True)

data = post_process(data)

  from pandas import Panel
progress-bar: 100%|██████████████████████████████████████████████████████████████| 2000/2000 [00:01<00:00, 1529.79it/s]


In [4]:
# Split the cleaned data into a training set (85%) and a validation set (15%).
# The fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split
SEED = 1234

x_train, x_validate, y_train, y_validate = train_test_split(
    data.text,
    data.sentiment,
    test_size=0.15,
    random_state=SEED,
)

In [5]:
# Feature extraction with CountVectorizer (raw term counts), limited to 2000
# features with the English stop words removed.
# scikit-learn documents `stop_words` as 'english', a list or None; releases
# that validate parameter types reject a set at fit time, so the module-level
# set is converted to a (sorted, hence deterministic) list here.
cv = CountVectorizer(stop_words=sorted(stop_words), max_features=2000)

# Pipeline: count vectorizer -> logistic regression
lg_cv_pipeline = Pipeline([('vectorizer', cv), ('classifier', LogisticRegression(solver='liblinear'))])

# Feature extraction with TfidfVectorizer (TF-IDF weights), same settings as above.
tfidf = TfidfVectorizer(stop_words=sorted(stop_words), max_features=2000)

# Pipeline: TF-IDF vectorizer -> logistic regression
lg_tfidf_pipeline = Pipeline([('vectorizer', tfidf), ('classifier', LogisticRegression(solver='liblinear'))])

In [6]:
# Ridge classifier on top of the count vectorizer `cv`
rc_cv_pipeline = Pipeline(steps=[
    ('vectorizer', cv),
    ('classifier', RidgeClassifier()),
])
# Ridge classifier on top of the TF-IDF vectorizer `tfidf`
rc_tfidf_pipeline = Pipeline(steps=[
    ('vectorizer', tfidf),
    ('classifier', RidgeClassifier()),
])

In [7]:
# Nearest-centroid classifier on top of the count vectorizer `cv`
nc_cv_pipeline = Pipeline(steps=[
    ('vectorizer', cv),
    ('classifier', NearestCentroid()),
])
# Nearest-centroid classifier on top of the TF-IDF vectorizer `tfidf`
nc_tfidf_pipeline = Pipeline(steps=[
    ('vectorizer', tfidf),
    ('classifier', NearestCentroid()),
])

NameError: name 'NearestCentroid' is not defined

In [None]:
# 10-fold cross-validation of every pipeline on the training split. Each name
# receives the array of per-fold scores for the pipeline in the same position.
lgcv, lg_tfidf, rccv, rctfidf, nccv, nctfidf = (
    cross_val_score(pipeline, x_train, y_train, cv=10)
    for pipeline in (
        lg_cv_pipeline,
        lg_tfidf_pipeline,
        rc_cv_pipeline,
        rc_tfidf_pipeline,
        nc_cv_pipeline,
        nc_tfidf_pipeline,
    )
)

In [None]:
# Mean cross-validation score of each pipeline, printed on one line in the
# order: lgcv, lg_tfidf, rccv, rctfidf, nccv, nctfidf.
print(*(scores.mean() for scores in (lgcv, lg_tfidf, rccv, rctfidf, nccv, nctfidf)))

In [None]:
# Sweep the vocabulary size and print, for each value, the mean 10-fold
# cross-validation score of every vectorizer/classifier pair.
#
# scikit-learn documents `stop_words` as 'english', a list or None; releases
# that validate parameter types reject a set at fit time. The conversion does
# not depend on the loop variable, so it is done once up front.
stop_word_list = sorted(stop_words)

for max_features in [500, 1000, 1500, 2000, 2500, 3000]:
    # Reconfigure the two vectorizer objects in place for this vocabulary size.
    cv.set_params(stop_words=stop_word_list, max_features=max_features)
    tfidf.set_params(stop_words=stop_word_list, max_features=max_features)
    lg_cv_pipeline = Pipeline([('vectorizer', cv), ('classifier', LogisticRegression(solver='liblinear'))])
    lg_tfidf_pipeline = Pipeline([('vectorizer', tfidf), ('classifier', LogisticRegression(solver='liblinear'))])
    rc_cv_pipeline = Pipeline([('vectorizer', cv), ('classifier', RidgeClassifier())])
    rc_tfidf_pipeline = Pipeline([('vectorizer', tfidf), ('classifier', RidgeClassifier())])
    # Rebuilt here like the other pipelines, so the sweep no longer depends on
    # pipelines created in an earlier cell happening to hold these very
    # `cv` / `tfidf` objects.
    nc_cv_pipeline = Pipeline([('vectorizer', cv), ('classifier', NearestCentroid())])
    nc_tfidf_pipeline = Pipeline([('vectorizer', tfidf), ('classifier', NearestCentroid())])
    lgcv = cross_val_score(lg_cv_pipeline, x_train, y_train, cv=10)
    lg_tfidf= cross_val_score(lg_tfidf_pipeline, x_train, y_train, cv=10)
    rccv = cross_val_score(rc_cv_pipeline, x_train, y_train, cv=10)
    rctfidf = cross_val_score(rc_tfidf_pipeline, x_train, y_train, cv=10)
    nccv = cross_val_score(nc_cv_pipeline, x_train, y_train, cv=10)
    nctfidf = cross_val_score(nc_tfidf_pipeline, x_train, y_train, cv=10)
    print(max_features, lgcv.mean(), lg_tfidf.mean(), rccv.mean(), rctfidf.mean(), nccv.mean(), nctfidf.mean())
    

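Mean 10-fold cross-validation score for each `max_features` value (columns presumably in print order: LogisticRegression + counts, LogisticRegression + TF-IDF, RidgeClassifier + counts, RidgeClassifier + TF-IDF):<br>
**500** &ensp;0.741764705882353 0.7858823529411765 0.7405882352941177 0.7688235294117647<br>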
**1000** 0.7911764705882354 0.8170588235294117 0.6952941176470588 0.8152941176470587<br>
**1500** 0.7988235294117647 0.8400000000000001 0.6841176470588236 0.8305882352941175<br>
**2000** 0.8135294117647058 0.8470588235294118 0.7311764705882353 0.8364705882352942<br>
**2500** 0.8164705882352941 0.8441176470588235 0.7476470588235294 0.8388235294117647<br>
**3000** 0.8123529411764705 0.8405882352941175 0.7647058823529411 0.8358823529411763<br>