In [35]:
import numpy as np
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import collections
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
nltk.download('stopwords')

def preprocess(line):
    """Turn one raw sentence into a list of stemmed tokens.

    The sentence is passed through ``cleanpunc`` (punctuation handling),
    then ``cleanstop`` (tokenising and stop-word filtering), and every
    surviving token is finally run through ``stemming``.

    Parameters
    ----------
    line : str
        Raw sentence text.

    Returns
    -------
    list
        One entry per kept token, each being whatever ``stemming`` returns.
    """
    return [stemming(token) for token in cleanstop(cleanpunc(line))]
    
# Stop-word sets keyed by language, filled on first use so the corpus list is
# turned into a set once instead of once per sentence.
_STOPWORDS_CACHE = {}


def cleanstop(line):
    """Tokenise a sentence on whitespace and drop uninformative tokens.

    A token is kept only if it is purely alphabetic (``str.isalpha``), is
    longer than two characters, and its lower-cased form is not an English
    stop word. Kept tokens are returned lower-cased, in their original order.

    Parameters
    ----------
    line : str
        Sentence text (typically already passed through ``cleanpunc``).

    Returns
    -------
    list of str
        The surviving lower-cased tokens; empty if nothing qualifies.
    """
    stop = _STOPWORDS_CACHE.get('english')
    if stop is None:
        # Built once and reused across calls.
        stop = _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))

    filtered_sentence = []
    for w in line.split():
        # Tokens with digits or leftover punctuation fail isalpha() and are
        # discarded entirely rather than cleaned.
        if w.isalpha() and len(w) > 2:
            w = w.lower()
            if w not in stop:
                filtered_sentence.append(w)
    return filtered_sentence
                
# Stemmer instances keyed by language; created lazily so a single stemmer is
# shared by every call instead of constructing a new one per word.
_STEMMER_CACHE = {}


def stemming(word):
    """Return the English Snowball stem of ``word`` as UTF-8 encoded bytes.

    The word is lower-cased before stemming.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    bytes
        The stem, encoded with ``str.encode('utf8')``.
    """
    snowball = _STEMMER_CACHE.get('english')
    if snowball is None:
        snowball = _STEMMER_CACHE['english'] = nltk.stem.SnowballStemmer('english')
    # NOTE(review): .encode() makes every token a bytes object under Python 3.
    # This looks like a Python 2 leftover, but the count tables are keyed by
    # these values, so the return type is kept as-is -- confirm before changing.
    return snowball.stem(word.lower()).encode('utf8')

def cleanpunc(line):
    """Strip or blank out a fixed set of punctuation characters.

    The characters ``? | ! ' " #`` are deleted outright, while each of
    ``. , ) ( /`` is replaced by a single space. Every other character
    (including the backslash) is left unchanged.

    Parameters
    ----------
    line : str
        Raw sentence text.

    Returns
    -------
    str
        The sentence with the substitutions applied.
    """
    # One translation table covers both rules: None deletes, ' ' replaces.
    table = {ord(ch): None for ch in '?|!\'"#'}
    table.update({ord(ch): ' ' for ch in '.,)(/'})
    return line.translate(table)

def load_data(path, encoding=None):
    """Load a tab-separated ``sentence<TAB>label`` file into a DataFrame.

    Only lines that split into exactly two tab-separated fields with a
    non-empty second field are used; all other lines (blank lines, lines
    without a label, lines with extra tabs) are skipped silently.

    Parameters
    ----------
    path : str
        Path of the text file to read.
    encoding : str, optional
        Text encoding handed to ``open``. The default ``None`` keeps the
        platform-dependent default; pass e.g. ``'utf-8'`` to make decoding
        independent of the machine the notebook runs on.

    Returns
    -------
    pandas.DataFrame
        Column ``'line'`` holds the token list produced by ``preprocess`` for
        each sentence, column ``'label'`` the label converted with ``int``.

    Raises
    ------
    ValueError
        If a label field cannot be converted to ``int``.
    """
    with open(path, "r", encoding=encoding) as text_file:
        raw_lines = text_file.read().split('\n')

    train_sentences = []
    train_labels = []
    for raw in raw_lines:
        # Split once per line and reuse the fields for both the validity
        # check and the extraction.
        fields = raw.split("\t")
        if len(fields) != 2 or fields[1] == '':
            continue
        train_sentences.append(preprocess(fields[0]))
        train_labels.append(int(fields[1]))
    return pd.DataFrame({'line': train_sentences, 'label': train_labels})

def word_count(data, label):
    """Accumulate token frequencies into the module-level count tables.

    For every (token list, class) pair the per-sentence token counts are
    added to ``WORD_COUNTS`` (all classes together) and to
    ``POS_NEG_WORDS[class]`` (that class only). Both globals must already
    exist, and ``POS_NEG_WORDS`` must contain a dict for every class seen.

    Parameters
    ----------
    data : object exposing ``.values``
        Iterable of token lists, one per sentence.
    label : object exposing ``.values``
        Class of each sentence, aligned with ``data``.

    Returns
    -------
    None
        The function works purely through its side effects on the globals.
    """
    for tokens, cls in zip(data.values, label.values):
        per_sentence = collections.Counter(tokens)
        # Sorted iteration fixes the order in which new keys are inserted.
        for word, n_occurrences in sorted(per_sentence.items()):
            WORD_COUNTS[word] = WORD_COUNTS.get(word, 0) + n_occurrences
            class_table = POS_NEG_WORDS[cls]
            class_table[word] = class_table.get(word, 0) + n_occurrences


def prior(y_data, classes):
    """Compute the relative frequency of each class in ``y_data``.

    Parameters
    ----------
    y_data : array-like supporting boolean-mask indexing
        Observed labels (e.g. a NumPy array or pandas Series).
    classes : iterable
        Class values to compute a frequency for.

    Returns
    -------
    numpy.ndarray
        Float array with one entry per element of ``classes``, in the same
        order, holding ``count(y_data == c) / len(y_data)``.
    """
    return np.array(
        [len(y_data[y_data == c]) / len(y_data) for c in classes],
        dtype=float,
    )
def p_x_y(X,y):
    """Score every sentence in ``X`` under class ``y``.

    The total number of token occurrences recorded for class ``y`` in the
    global ``POS_NEG_WORDS`` table is computed once and passed, together
    with each sentence, to ``_p_x_Y``.

    Parameters
    ----------
    X : object exposing ``.values``
        Iterable of token lists, one per sentence.
    y : hashable
        Class key into ``POS_NEG_WORDS``.

    Returns
    -------
    numpy.ndarray
        One score per sentence, in input order.
    """
    label_total = sum(POS_NEG_WORDS[y].values())
    return np.array([_p_x_Y(tokens, y, label_total) for tokens in X.values])

def _p_x_Y(line, label, label_total, smoothing=0.1):
    """Return the summed log-score of one sentence under one class.

    Each token contributes ``log(count / label_total + smoothing)`` when it
    has been seen for ``label`` in the global ``POS_NEG_WORDS`` table, and
    ``log(smoothing)`` otherwise. The per-token value is a smoothed relative
    frequency, not a normalised probability, because the constant is added
    after the division.

    Parameters
    ----------
    line : iterable
        Tokens of the sentence.
    label : hashable
        Class key into ``POS_NEG_WORDS``.
    label_total : int
        Total number of token occurrences recorded for ``label``.
    smoothing : float, optional
        Constant added to every token's relative frequency so that unseen
        tokens do not produce ``log(0)``. Must be positive. Defaults to the
        previously hard-coded ``0.1``.

    Returns
    -------
    float or int
        Sum of the per-token logs; ``0`` for an empty sentence.
    """
    prob = 0
    for w in line:
        seen = POS_NEG_WORDS[label].get(w)
        if seen is None:
            p = smoothing
        else:
            p = (seen / label_total) + smoothing
        prob += np.log(p)
    return prob
        
def fit(X_data, y_data):
    """Predict a class for every sentence in ``X_data``.

    Despite its name this function does not train anything: the word
    statistics must already be present in the global tables (see
    ``word_count``). For each class found in ``y_data`` it adds the log of
    the class prior to the per-sentence log-score from ``p_x_y`` and picks
    the class with the highest total.

    NOTE(review): the priors are estimated from the ``y_data`` passed in, so
    when this is called on a held-out split the priors come from that
    split's own labels.

    Parameters
    ----------
    X_data : object exposing ``.shape`` and ``.values``
        Token lists, one per sentence.
    y_data : array-like
        Labels aligned with ``X_data``; used for the class set and priors.

    Returns
    -------
    numpy.ndarray
        Predicted class label for each sentence.
    """
    # np.unique already returns the distinct values in sorted order.
    classes = np.unique(y_data)
    priors = prior(y_data, classes)
    result = np.zeros((X_data.shape[0], len(classes)))
    for i, y in enumerate(classes):
        # p_x_y is a sum of logs, so the prior must be added in log space too.
        result[:, i] = p_x_y(X_data, y) + np.log(priors[i])
    # Map the winning column back to its class value so the result is correct
    # even when the classes are not exactly 0..k-1.
    return classes[np.argmax(result, axis=1)]
        


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Atlas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [36]:
# Amazon dataset: build the word-count tables from the training split, then
# report overall and per-class accuracy on the test and training splits.
# word_count() fills these module-level dicts and fit() reads them, so they
# are reset here to keep counts from any previously processed dataset out.
POS_NEG_WORDS = {0: dict(), 1: dict()}
WORD_COUNTS = {}
data = load_data('amazon_cells_labelled.txt')
# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(data['line'],data['label'], test_size=0.2, stratify=data['label'] ,random_state=42)
word_count(X_train, y_train)
print('Amazon Dataset')
print("Test Accuracy: ")
# NOTE(review): fit() derives the class priors from the labels it is given,
# so the test-split priors come from y_test itself.
predicted = fit(X_test, y_test)
print('Total: ', accuracy_score(y_test, predicted))
# Per-class accuracy: keep only the rows whose true label is that class.
y_test_0 = y_test[y_test.isin([0])]
y_test_1 = y_test[y_test.isin([1])]
predicted_0 = predicted[y_test.isin([0])]
predicted_1 = predicted[y_test.isin([1])]
print("Class 0 : ", accuracy_score(y_test_0, predicted_0))
print("Class 1 : ", accuracy_score(y_test_1, predicted_1))

print("Train Accuracy: ")
predicted = fit(X_train, y_train)
print('Total: ', accuracy_score(y_train, predicted))
y_train_0 = y_train[y_train.isin([0])]
y_train_1 = y_train[y_train.isin([1])]
predicted_0 = predicted[y_train.isin([0])]
predicted_1 = predicted[y_train.isin([1])]
print("Class 0 : ", accuracy_score(y_train_0, predicted_0))
print("Class 1 : ", accuracy_score(y_train_1, predicted_1))

Amazon Dataset
Test Accuracy: 
Total:  0.705
Class 0 :  0.49
Class 1 :  0.92
Train Accuracy: 
Total:  0.815
Class 0 :  0.6725
Class 1 :  0.9575


In [37]:
# IMDB dataset: same pipeline as the other dataset cells.
# Start from empty count tables, because word_count() accumulates into these
# module-level dicts and fit() reads them.
WORD_COUNTS = {}
POS_NEG_WORDS = {0: dict(), 1: dict()}

data = load_data('imdb_labelled.txt')
# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data['line'],
    data['label'],
    test_size=0.2,
    stratify=data['label'],
    random_state=42,
)
word_count(X_train, y_train)

print('IMDB Dataset')
print("Test Accuracy: ")
predicted = fit(X_test, y_test)
print('Total: ', accuracy_score(y_test, predicted))
# Per-class accuracy: restrict labels and predictions to rows of one true class.
test_is_0 = (y_test == 0).values
test_is_1 = (y_test == 1).values
y_test_0, y_test_1 = y_test[test_is_0], y_test[test_is_1]
predicted_0, predicted_1 = predicted[test_is_0], predicted[test_is_1]
print("Class 0 : ", accuracy_score(y_test_0, predicted_0))
print("Class 1 : ", accuracy_score(y_test_1, predicted_1))

print("Train Accuracy: ")
predicted = fit(X_train, y_train)
print('Total: ', accuracy_score(y_train, predicted))
train_is_0 = (y_train == 0).values
train_is_1 = (y_train == 1).values
y_train_0, y_train_1 = y_train[train_is_0], y_train[train_is_1]
predicted_0, predicted_1 = predicted[train_is_0], predicted[train_is_1]
print("Class 0 : ", accuracy_score(y_train_0, predicted_0))
print("Class 1 : ", accuracy_score(y_train_1, predicted_1))

IMDB Dataset
Test Accuracy: 
Total:  0.715
Class 0 :  0.73
Class 1 :  0.7
Train Accuracy: 
Total:  0.82375
Class 0 :  0.8575
Class 1 :  0.79


In [38]:
# Yelp dataset: same pipeline as the other dataset cells.
# Start from empty count tables, because word_count() accumulates into these
# module-level dicts and fit() reads them.
WORD_COUNTS = {}
POS_NEG_WORDS = {0: dict(), 1: dict()}

data = load_data('yelp_labelled.txt')
# 80/20 split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    data['line'],
    data['label'],
    test_size=0.2,
    stratify=data['label'],
    random_state=42,
)
word_count(X_train, y_train)

print("Yelp Dataset")
print("Test Accuracy: ")
predicted = fit(X_test, y_test)
print('Total: ', accuracy_score(y_test, predicted))
# Per-class accuracy: restrict labels and predictions to rows of one true class.
test_is_0 = (y_test == 0).values
test_is_1 = (y_test == 1).values
y_test_0, y_test_1 = y_test[test_is_0], y_test[test_is_1]
predicted_0, predicted_1 = predicted[test_is_0], predicted[test_is_1]
print("Class 0 : ", accuracy_score(y_test_0, predicted_0))
print("Class 1 : ", accuracy_score(y_test_1, predicted_1))

print("Train Accuracy: ")
predicted = fit(X_train, y_train)
print('Total: ', accuracy_score(y_train, predicted))
train_is_0 = (y_train == 0).values
train_is_1 = (y_train == 1).values
y_train_0, y_train_1 = y_train[train_is_0], y_train[train_is_1]
predicted_0, predicted_1 = predicted[train_is_0], predicted[train_is_1]
print("Class 0 : ", accuracy_score(y_train_0, predicted_0))
print("Class 1 : ", accuracy_score(y_train_1, predicted_1))

Yelp Dataset
Test Accuracy: 
Total:  0.755
Class 0 :  0.67
Class 1 :  0.84
Train Accuracy: 
Total:  0.85625
Class 0 :  0.835
Class 1 :  0.8775
