In [None]:
import glob
import pandas as pd
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer

# Tokenizer instance shared by the notebook's later cells.
tokenizer = WordPunctTokenizer()

# glob gives no ordering guarantee (it follows the filesystem), so sort the
# paths: otherwise the row order -- and with it the seeded train/test split
# further down -- can differ from one machine to the next.
neg_list = sorted(glob.glob("./data/neg/*.txt"))
pos_list = sorted(glob.glob("./data/pos/*.txt"))

stop_words = set(stopwords.words('english'))

# One [text, label] entry per file: label 0 for ./data/neg, 1 for ./data/pos.
doc_list = []

#reading the data
for label, paths in ((0, neg_list), (1, pos_list)):
    for file in paths:
        # The context manager closes each handle right after reading;
        # previously every opened file was left for the garbage collector.
        # NOTE(review): no encoding is given, so the platform default is used
        # exactly as before -- confirm the corpus encoding and pin it.
        with open(file, "r") as handle:
            doc_list.append([handle.read(), label])

data = pd.DataFrame(doc_list, columns=['text', 'sentiment'])

In [None]:

def clean_dataset(text):
    """Normalise a raw document into lowercase, space-separated ASCII words.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        `text` lowercased, with every character outside a-z/A-Z turned into
        a separator and the remaining words joined by single spaces. The
        result has no leading or trailing whitespace and is "" when the input
        contains no ASCII letters.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text.lower())
    # Only ASCII letters and spaces are left here, so splitting on whitespace
    # yields the same tokens a word/punctuation tokenizer would. Doing it
    # with str.split() keeps the function self-contained: it no longer
    # depends on a `tokenizer` global created in a different cell.
    return " ".join(letters_only.split())

In [None]:
from tqdm import tqdm
# Register tqdm with pandas; post_process below calls Series.progress_map,
# which is only available after this registration has run.
tqdm.pandas(desc="progress-bar")

def post_process(data, n=1000000):
    """Return the first `n` rows of `data` with the 'text' column cleaned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'text' column of raw strings. The caller's frame is
        left untouched.
    n : int, default 1000000
        Maximum number of leading rows to keep.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed 0..k-1 whose 'text' column holds the output of
        clean_dataset for each row; all other columns are carried over.
    """
    # head() can hand back a slice tied to the caller's frame; assigning a
    # column on it triggers SettingWithCopyWarning. Work on an explicit copy.
    subset = data.head(n).copy()
    subset['text'] = subset['text'].progress_map(clean_dataset)
    # drop=True discards the old index directly. The previous
    # reset_index() + drop('index') pair failed when the frame already had
    # an 'index' column or when the index carried a name.
    return subset.reset_index(drop=True)

# Rebind `data` to the processed frame returned by post_process; the raw
# frame is no longer reachable under this name after this cell runs.
data = post_process(data)

In [None]:
# Display the first rows of `data` as a quick sanity check.
data.head()

In [None]:
from sklearn.model_selection import train_test_split
# Fixed seed so the train/test partition is the same on every run.
SEED = 1234

# Hold out 15% of the rows for evaluation; the rest is used for fitting.
x_train, x_test, y_train, y_test = train_test_split(
    data['text'],
    data['sentiment'],
    test_size=0.15,
    random_state=SEED,
)

print(y_train.head())

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 
import numpy as np
from time import time


def acc_summary(pipeline, x_train, y_train, x_test, y_test):
    """Fit `pipeline`, predict on the test split, and report the outcome.

    Parameters
    ----------
    pipeline : object exposing fit(X, y) whose result exposes predict(X)
        Fitted in place on the training data.
    x_train, y_train
        Training inputs and labels passed to `pipeline.fit`.
    x_test, y_test
        Held-out inputs passed to `predict`, and the labels the predictions
        are scored against.

    Returns
    -------
    tuple
        (accuracy, seconds) where accuracy is the value returned by
        accuracy_score and seconds is the wall-clock time spent on fit plus
        predict. Both figures are also printed, followed by a rule of dashes.
    """
    started = time()
    predictions = pipeline.fit(x_train, y_train).predict(x_test)
    # Stop the clock before scoring so that only fit + predict is timed.
    elapsed = time() - started

    score = accuracy_score(y_test, predictions)

    print("accuracy score: {0:.2f}%".format(score*100))
    print("train and test time: {0:.2f}s".format(elapsed))
    print("-"*80)
    return score, elapsed


from sklearn.feature_extraction.text import TfidfVectorizer
tvec = TfidfVectorizer()

from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.neighbors import NearestCentroid
from sklearn.feature_selection import SelectFromModel

# Display labels, position-matched with `classifiers` below.
names = ["Logistic Regression", "Linear SVC", "LinearSVC with L1-based feature selection","Multinomial NB", 
         "Bernoulli NB", "Ridge Classifier", "AdaBoost", "Perceptron","Passive-Aggresive", "Nearest Centroid"]

# Estimators to benchmark, all with their default hyper-parameters except
# the third entry, which first selects features with an L1-penalised
# LinearSVC and then classifies with an L2-penalised one.
classifiers = [
    LogisticRegression(),
    LinearSVC(),
    Pipeline([
        ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
        ('classification', LinearSVC(penalty="l2")),
    ]),
    MultinomialNB(),
    BernoulliNB(),
    RidgeClassifier(),
    AdaBoostClassifier(),
    Perceptron(),
    PassiveAggressiveClassifier(),
    NearestCentroid(),
]

# Materialise the pairs as a list. A bare zip() is a one-shot iterator: used
# as a default argument it is exhausted by the first loop over it, so any
# later run would silently iterate over nothing and report no results.
zipped_clf = list(zip(names, classifiers))

# Vectorizer instance bound as the default `vectorizer` argument below.
tvec = TfidfVectorizer()
def classifier_comparator(vectorizer=tvec, n_features=10000, stop_words=None, ngram_range=(1, 1), classifier=zipped_clf):
    """Evaluate each (name, estimator) pair behind a shared vectorizer.

    Parameters
    ----------
    vectorizer : estimator exposing set_params
        Reconfigured in place with the three settings below, then used as
        the first step of every pipeline.
    n_features : int
        Forwarded to the vectorizer as `max_features`.
    stop_words
        Forwarded to the vectorizer as `stop_words`.
    ngram_range : tuple
        Forwarded to the vectorizer as `ngram_range`.
    classifier : iterable of (name, estimator)
        Pairs to evaluate. It is iterated once, so a one-shot iterator
        passed here is consumed by the call.

    Returns
    -------
    list of tuple
        One (name, accuracy, seconds) entry per pair, in iteration order.

    Notes
    -----
    Training and test data come from the notebook-level x_train, y_train,
    x_test and y_test variables, not from arguments.
    """
    vectorizer.set_params(stop_words=stop_words, max_features=n_features, ngram_range=ngram_range)

    result = []
    for label, estimator in classifier:
        steps = [('vectorizer', vectorizer), ('classifier', estimator)]
        checker_pipeline = Pipeline(steps)

        print("Validation result for {}".format(label))
        print(estimator)

        clf_acc, tt_time = acc_summary(checker_pipeline, x_train, y_train, x_test, y_test)
        result.append((label, clf_acc, tt_time))
    return result

# Run the comparison with n-grams of length 1 to 3 and the vectorizer
# capped at 100,000 features.
trigram_result = classifier_comparator(ngram_range=(1, 3), n_features=100000)

print(trigram_result)

In [None]:
# NOTE(review): this whole cell is a single triple-quoted string literal, so
# none of the statements inside it execute. It holds a disabled experiment
# that counts token frequencies separately for label 0 and label 1 entries
# of doc_list. If it is ever revived:
#   * `document[1] is 0` tests identity, not equality -- use `== 0`;
#   * the lemmatizer and stop-word set are rebuilt for every document and
#     the lemmatizer is never used.
'''
import string
import re
import heapq
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer 

pos_word_freq = {}
neg_word_freq = {}
for document in doc_list:
    # Init the Wordnet Lemmatizer
    lemmatizer = WordNetLemmatizer()

    stop_words = set(stopwords.words('english')) 
    tokens = word_tokenize(document[0]) 
    tokens = [x for x in tokens if not re.fullmatch('[' + string.punctuation + ']+', x)]
    tokens = [w for w in tokens if not w in stop_words]

    for token in tokens:
        
        if (document[1] is 0):
            if token not in neg_word_freq.keys():
                neg_word_freq[token] = 1
            else:
                neg_word_freq[token] += 1
        else:
            if token not in pos_word_freq.keys():
                pos_word_freq[token] = 1
            else:
                pos_word_freq[token] += 1
            
print(len(neg_word_freq.keys()))
print(neg_word_freq)

print(len(pos_word_freq.keys()))
print(pos_word_freq)

#most_freq = heapq.nlargest(200, word_freq, key=word_freq.get)
#print(most_freq)
'''