In [66]:
from collections import defaultdict
import numpy as np
import pandas as pd
import pprint
from nltk.corpus import stopwords
pp = pprint.PrettyPrinter(indent=4)
import warnings
warnings.filterwarnings("ignore")
import re
from sklearn.model_selection import train_test_split
from itertools import islice

In [67]:
data = pd.read_csv('PS1.1A_training_data.txt',sep="\t", names=["ID","TEXT","Sentiment","Topic","Genre"])
data.head(5)
# data = pd.read_csv('train.csv')
# data.head(5)

Unnamed: 0,ID,TEXT,Sentiment,Topic,Genre
0,0,This is definitely a must have if your state d...,POSITIVE,NONE,GENRE_B
1,1,It's a great place and I highly recommend it.,POSITIVE,NONE,GENRE_B
2,2,"I will see the doctors, take some blood tests ...",NEUTRAL,GOING_TO_PLACES,GENRE_A
3,3,I can tell you about having my phone and elect...,NEGATIVE,MONEY_ISSUE,GENRE_A
4,4,Their steaks are 100% recommended!,POSITIVE,NONE,GENRE_B


In [69]:
def clean_TEXT(dataframe):
    dataframe['TEXT'] = dataframe.TEXT.fillna('none')
    #dataframe['Sentiment'] = dataframe.Sentiment.fillna('none')
    #dataframe['Topic'] = dataframe.Topic.fillna('none')
    dataframe['Genre'] = dataframe.Genre.fillna('none')
    dataframe = dataframe[dataframe.Sentiment != 'NEUTRAL']
    dataframe['TEXT'] = dataframe['TEXT'].str.lower()
    STOPWORDS = []
    STOPWORDS = stopwords
    
    columns = list(dataframe.columns)
    
    for column in columns:
        if column != 'ID':
            dataframe[column] = dataframe[column].str.replace(r'\W', ' ').str.replace(r'\s$','')
            
    
    for column in columns:
        dataframe = remove_stop_words(dataframe,column)
    
    for column in columns:
        dataframe = remove_special_characters(dataframe,column)
    
    #dataframe['Text_Genre'] = dataframe['TEXT'] + dataframe['Genre']
    
    return dataframe

        
def remove_stop_words(data_frame, column_name):
    if column_name != 'ID':
        data_frame[column_name] = data_frame[column_name].apply(lambda x: " ".join([i for i in x.lower().split() if i not in STOPWORDS]))
    return data_frame

def remove_special_characters(data_frame, columns):
    data_frame.columns = data_frame.columns.str.replace('[!,@,#,$,%,^,&,*,\",:,;,.]','')
    return data_frame

In [70]:
data = clean_TEXT(data)

In [71]:
data.head()

Unnamed: 0,ID,TEXT,Sentiment,Topic,Genre
0,0,definitely state allow cell phone usage driving,positive,,genre_b
1,1,s great place highly recommend,positive,,genre_b
3,3,tell having phone electricity gas cut couldn t...,negative,money_issue,genre_a
4,4,steaks 100 recommended,positive,,genre_b
5,5,billed thousands dollars said feeling shock un...,negative,money_issue,genre_a


In [72]:
X_train, X_test, y_train, y_test = train_test_split(data['TEXT'], data['Sentiment'],test_size=0.25,random_state=5)

In [73]:
print(X_train.shape), print(y_train.shape)
print(X_test.shape), print(y_test.shape)
#print(X_train.head(5))
print(y_train.unique)

(1765,)
(1765,)
(589,)
(589,)
<bound method Series.unique of 581     negative
246     negative
1844    positive
13      negative
593     positive
          ...   
1121    positive
2309    positive
1543    positive
1869    negative
2451    negative
Name: Sentiment, Length: 1765, dtype: object>


In [74]:
class NaiveBayesClassifier(object):
    def __init__(self, n_gram=1, printing=False):
        self.prior = defaultdict(int)
        self.logprior = {}
        self.bigdoc = defaultdict(list)
        self.loglikelihoods = defaultdict(defaultdict)
        self.V = []
        self.n = n_gram

    def compute_prior_and_bigdoc(self, training_set, training_labels):
        for x, y in zip(training_set, training_labels):
            all_words = x.split(" ")
            if self.n == 1:
                grams = all_words
            else:
                grams = self.words_to_grams(all_words)

            self.prior[y] += len(grams)
            self.bigdoc[y].append(x)

    def compute_vocabulary(self, documents):
        vocabulary = set()

        for doc in documents:
            for word in doc.split(" "):
                vocabulary.add(word.lower())

        return vocabulary

    def count_word_in_classes(self):
        counts = {}
        for c in list(self.bigdoc.keys()):
            docs = self.bigdoc[c]
            counts[c] = defaultdict(int)
            for doc in docs:
                words = doc.split(" ")
                for word in words:
                    counts[c][word] += 1

        return counts

    def train(self, training_set, training_labels, alpha=1):
        # Get number of documents
        N_doc = len(training_set)

        # Get vocabulary used in training set
        self.V = self.compute_vocabulary(training_set)

        # Create bigdoc
        for x, y in zip(training_set, training_labels):
            self.bigdoc[y].append(x)

        # Get set of all classes
        all_classes = set(training_labels)

        # Compute a dictionary with all word counts for each class
        self.word_count = self.count_word_in_classes()

        # For each class
        for c in all_classes:
            # Get number of documents for that class
            N_c = float(sum(training_labels == c))

            # Compute logprior for class
            self.logprior[c] = np.log(N_c / N_doc)

            # Calculate the sum of counts of words in current class
            total_count = 0
            for word in self.V:
                total_count += self.word_count[c][word]

            # For every word, get the count and compute the log-likelihood for this class
            for word in self.V:
                count = self.word_count[c][word]
                self.loglikelihoods[c][word] = np.log((count+ alpha) / (total_count+ alpha))

    def predict(self, test_doc):
        sums = {
            0: 0,
            1: 0,
        }
        for c in self.bigdoc.keys():
            sums[c] = self.logprior[c]
            words = test_doc.split(" ")
            for word in words:
                if word in self.V:
                    sums[c] += self.loglikelihoods[c][word]

        return sums


In [75]:
def evaluate_predictions(validation_set,validation_labels,trained_classifier):
    correct_predictions = 0
    predictions_list = []
    prediction = -1
    
    for dataset,label in zip(validation_set, validation_labels):
        #print(label)
        probabilities = trained_classifier.predict(dataset)
        if probabilities[0] >= probabilities[1]:
            prediction = 'negative'
        elif  probabilities[0] < probabilities[1]:
            prediction = 'positive'
        if prediction == label:
            correct_predictions += 1
            predictions_list.append("+")
        else:
            predictions_list.append("-")
    pp.pprint("Predicted correctly {} out of {} ({}%)".format(correct_predictions,len(validation_labels),round(correct_predictions/len(validation_labels)*100,2)))
    return predictions_list, round(correct_predictions/len(validation_labels)*100)

In [76]:
NBC = NaiveBayesClassifier()
NBC.train(data['TEXT'], data['Sentiment'])


In [77]:
test_df = pd.read_csv('test.csv')
test_df.head(5)

Unnamed: 0,ID,TEXT,Sentiment,Context,Genre
0,0,The reception through this headset is excellent.,POSITIVE,NONE,GENRE_B
1,1,Hands down my favorite Italian restaurant!,POSITIVE,NONE,GENRE_B
2,2,The bathrooms are clean and the place itself i...,POSITIVE,NONE,GENRE_B
3,3,If you haven't gone here GO NOW!,POSITIVE,NONE,GENRE_B
4,4,"Host staff were, for lack of a better word, BI...",NEGATIVE,NONE,GENRE_B


In [78]:
test_df = clean_TEXT(test_df)

In [79]:
test_df.head(5)

Unnamed: 0,ID,TEXT,Sentiment,Context,Genre
0,0,reception headset excellent,positive,,genre_b
1,1,hands favorite italian restaurant,positive,,genre_b
2,2,bathrooms clean place decorated,positive,,genre_b
3,3,haven t gone,positive,,genre_b
4,4,host staff lack better word bitches,negative,,genre_b


In [80]:
result, acc = evaluate_predictions(test_df['TEXT'], test_df['Sentiment'],NBC)
print(result, acc)


'Predicted correctly 1013 out of 2015 (50.27%)'
['-', '-', '-', '-', '+', '+', '-', '+', '+', '-', '-', '-', '+', '-', '-', '+', '+', '+', '+', '-', '+', '+', '+', '+', '-', '+', '+', '+', '+', '-', '-', '-', '-', '+', '+', '-', '-', '+', '+', '-', '+', '+', '+', '-', '+', '+', '-', '-', '+', '+', '+', '+', '+', '-', '-', '-', '-', '+', '+', '+', '-', '-', '-', '+', '-', '+', '-', '-', '-', '+', '+', '-', '-', '+', '-', '-', '+', '+', '-', '-', '-', '-', '-', '+', '+', '-', '+', '-', '+', '-', '+', '+', '-', '-', '+', '-', '-', '-', '+', '+', '+', '-', '-', '+', '+', '+', '-', '+', '-', '-', '-', '+', '+', '-', '+', '+', '-', '-', '+', '-', '+', '-', '+', '+', '-', '-', '+', '+', '+', '-', '+', '-', '+', '-', '-', '+', '-', '+', '-', '-', '-', '+', '+', '+', '-', '-', '-', '+', '+', '+', '+', '+', '-', '-', '+', '+', '+', '-', '-', '-', '-', '+', '-', '+', '+', '-', '+', '-', '-', '-', '-', '-', '-', '-', '+', '-', '+', '-', '-', '+', '+', '-', '+', '-', '-', '-', '-', '+', '-', '-', '