In [57]:
import glob
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer

# Tokenizer shared by the text-cleaning step further down.
tokenizer = WordPunctTokenizer()

# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps the
# row order of `data` (and therefore any seeded split of it) reproducible.
neg_list = sorted(glob.glob("./data/neg/*.txt"))
pos_list = sorted(glob.glob("./data/pos/*.txt"))

stop_words = set(stopwords.words('english'))

# Rows of [review text, label]; label 0 = file from ./data/neg, 1 = ./data/pos.
doc_list = []

#reading the data
# `with` closes each handle as soon as it has been read, instead of leaving
# every opened file to be reclaimed by the garbage collector.
for file in neg_list:
    with open(file, "r") as handle:
        doc_list.append([handle.read(), 0])

for file in pos_list:
    with open(file, "r") as handle:
        doc_list.append([handle.read(), 1])

data = pd.DataFrame(doc_list, columns = ['text' , 'sentiment'])

In [59]:

def clean_dataset(text):
    """Normalise one document into lower-case, space-separated letter tokens.

    The text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, the result is split with the module-level
    ``tokenizer`` and the tokens are re-joined with single spaces.

    Args:
        text: The raw document as a string.

    Returns:
        The cleaned string, without leading or trailing whitespace.
    """
    # Digits, punctuation and any non-ASCII characters all become separators.
    alpha_only = re.sub("[^a-zA-Z]", " ", text.lower())
    words = tokenizer.tokenize(alpha_only)
    cleaned = " ".join(words)
    return cleaned.strip()

In [61]:
from tqdm import tqdm
# Enable the progress_* methods on pandas objects (post_process below relies on
# Series.progress_map); `desc` is the label printed in front of the bar.
tqdm.pandas(desc="progress-bar")

def post_process(data, n=1000000):
    """Clean the ``text`` column of the first ``n`` rows and renumber the index.

    Args:
        data: DataFrame with a ``text`` column of raw document strings.
        n: Maximum number of leading rows to keep (passed to ``DataFrame.head``).

    Returns:
        A new DataFrame holding at most ``n`` rows, with ``text`` replaced by
        ``clean_dataset(text)`` and a fresh 0..len-1 index. The frame passed in
        is left unmodified.

    Note:
        Uses ``Series.progress_map``, so ``tqdm.pandas()`` must have been
        called beforehand.
    """
    # head() hands back a slice of the caller's frame; copying first makes the
    # column assignment below unambiguous and avoids SettingWithCopyWarning.
    data = data.head(n).copy()
    data['text'] = data['text'].progress_map(clean_dataset)
    # drop=True discards the old index directly instead of materialising it as
    # an 'index' column and deleting it again, which breaks when the index has
    # a name or an 'index' column already exists.
    return data.reset_index(drop=True)

# Clean every review in place of the raw text; `n` is left at its default of
# 1000000 rows. Note that this rebinds `data`, so the raw text is no longer
# reachable under that name after this cell runs.
data = post_process(data)


progress-bar:   0%|                                                                           | 0/2000 [00:00<?, ?it/s]
progress-bar:   6%|███▍                                                           | 111/2000 [00:00<00:01, 1070.15it/s]
progress-bar:  12%|███████▌                                                       | 241/2000 [00:00<00:01, 1117.62it/s]
progress-bar:  17%|██████████▋                                                    | 341/2000 [00:00<00:01, 1071.13it/s]
progress-bar:  22%|█████████████▊                                                 | 438/2000 [00:00<00:01, 1026.51it/s]
progress-bar:  27%|█████████████████                                              | 540/2000 [00:00<00:01, 1024.48it/s]
progress-bar:  34%|█████████████████████▏                                         | 671/2000 [00:00<00:01, 1090.75it/s]
progress-bar:  38%|████████████████████████▎                                      | 770/2000 [00:00<00:01, 1041.39it/s]
progress-bar:  45%|████████████████████

In [63]:
# Preview the first rows of the cleaned frame (text plus 0/1 sentiment label).
data.head()

Unnamed: 0,text,sentiment
0,plot two teen couples go to a church party dri...,0
1,the happy bastard s quick movie review damn th...,0
2,it is movies like these that make a jaded movi...,0
3,quest for camelot is warner bros first feature...,0
4,synopsis a mentally unstable man undergoing ps...,0


In [65]:
from sklearn.model_selection import train_test_split
# Fixed seed so the shuffle performed by the split is repeatable.
SEED = 1234

# Hold out 15% of the rows for evaluation; the remainder is used for fitting.
x_train, x_test, y_train, y_test = train_test_split(
    data.text,
    data.sentiment,
    test_size=.15,
    random_state=SEED,
)

# Quick look at the labels (and original row indices) that landed in training.
print(y_train.head())

634     0
1887    1
181     0
1439    1
827     0
Name: sentiment, dtype: int64


In [67]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
import numpy as np
from time import time


def acc_summary(pipeline, x_train, y_train, x_test, y_test):
    """Fit ``pipeline``, score it on the test split and print a short report.

    Args:
        pipeline: Estimator exposing ``fit(X, y)`` (returning the fitted
            estimator) and ``predict(X)``.
        x_train: Training inputs passed to ``fit``.
        y_train: Training labels passed to ``fit``.
        x_test: Inputs passed to ``predict``.
        y_test: True labels compared against the predictions.

    Returns:
        Tuple ``(accuracy, train_test_time)``: the value returned by
        ``accuracy_score`` and the elapsed seconds covering both the fit and
        the predict call.
    """
    started = time()
    predictions = pipeline.fit(x_train, y_train).predict(x_test)
    # Stop the clock before scoring so only fit + predict are timed.
    train_test_time = time() - started

    accuracy = accuracy_score(y_test, predictions)

    report_lines = (
        "accuracy score: {0:.2f}%".format(accuracy*100),
        "train and test time: {0:.2f}s".format(train_test_time),
        "-"*80,
    )
    for line in report_lines:
        print(line)

    return accuracy, train_test_time


from sklearn.feature_extraction.text import TfidfVectorizer
tvec = TfidfVectorizer()

from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import NearestCentroid
from sklearn.feature_selection import SelectFromModel

# Display names, in the same order as `classifiers` below.
names = ["Logistic Regression", "Linear SVC", "LinearSVC with L1-based feature selection","Multinomial NB", 
         "Bernoulli NB", "Ridge Classifier", "AdaBoost", "Perceptron","Passive-Aggresive", "Nearest Centroid"]
classifiers = [
    LogisticRegression(),
    LinearSVC(),
    # Two-stage model: an L1-penalised LinearSVC picks the features through
    # SelectFromModel, then an L2-penalised LinearSVC classifies on them.
    Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
  ('classification', LinearSVC(penalty="l2"))]),
    MultinomialNB(),
    BernoulliNB(),
    RidgeClassifier(),
    AdaBoostClassifier(),
    Perceptron(),
    PassiveAggressiveClassifier(),
    NearestCentroid()
    ]

# zip() stops at the shorter input without complaint, so a missing name or
# classifier would silently drop entries; fail loudly instead.
assert len(names) == len(classifiers), "names and classifiers must line up one-to-one"

# Materialise the pairs as a list. A bare zip object is a one-shot iterator:
# it is bound as the default `classifier` argument of classifier_comparator,
# so after the first call it would be exhausted and every later call would
# evaluate nothing and return an empty result.
zipped_clf = list(zip(names,classifiers))

# Vectorizer used as the default `vectorizer` argument of classifier_comparator.
tvec = TfidfVectorizer()
def classifier_comparator(vectorizer=tvec, n_features=10000, stop_words=None, ngram_range=(1, 1), classifier=zipped_clf):
    """Evaluate several classifiers behind one shared text vectorizer.

    The vectorizer is reconfigured through ``set_params`` and then paired with
    each classifier in a two-step ``Pipeline`` that is fitted and scored by
    ``acc_summary`` on the module-level ``x_train``/``y_train``/``x_test``/
    ``y_test`` split.

    Args:
        vectorizer: Vectorizer instance; its parameters are modified in place.
        n_features: Value assigned to the vectorizer's ``max_features``.
        stop_words: Value assigned to the vectorizer's ``stop_words``.
        ngram_range: Value assigned to the vectorizer's ``ngram_range``.
        classifier: Iterable of ``(name, estimator)`` pairs; it is iterated
            exactly once.

    Returns:
        List of ``(name, accuracy, train_test_time)`` tuples, one per pair, in
        iteration order.
    """
    vectorizer.set_params(
        stop_words=stop_words,
        max_features=n_features,
        ngram_range=ngram_range,
    )

    scores = []
    for label, estimator in classifier:
        steps = [('vectorizer', vectorizer), ('classifier', estimator)]
        candidate = Pipeline(steps)

        print("Validation result for {}".format(label))
        print(estimator)

        accuracy, elapsed = acc_summary(candidate, x_train, y_train, x_test, y_test)
        scores.append((label, accuracy, elapsed))
    return scores

# Run the comparison on 1- to 3-gram features, capping the vectorizer at
# 100000 features; all other arguments keep their defaults.
trigram_result = classifier_comparator(n_features=100000,ngram_range=(1,3))

# Each entry is (classifier name, accuracy, train+test seconds).
print(trigram_result)

Validation result for Logistic Regression
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)




accuracy score: 79.67%
train and test time: 17.24s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)
accuracy score: 86.00%
train and test time: 19.11s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-based feature selection
Pipeline(memory=None,
         steps=[('feature_selection',
                 SelectFromModel(estimator=LinearSVC(C=1.0, class_weight=None,
                                                     dual=False,
                                                     fit_intercept=True,
                                                     intercept_scaling=1,
                                                     loss=