### Machine Learning Spring 2020
### Naive Bayes Classifier
### Ahmed Anwar 20110236

#### 0. Load Dependencies

In [16]:
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm
import random
import re
from random import sample

In [17]:
import nltk
from nltk.corpus import stopwords
stops = stopwords.words('english')

In [18]:
def clean_tweet(review, stops = stops):
    review = review.lower().replace(',','').replace('.','').replace(':','').replace('/><br','').replace('/>','').replace(')','').replace('(','')
    review = review.replace(("\'"),("")).replace(";","").replace('  ',' ').replace('?',' ').replace('"', '').replace('!','').replace('  ', ' ').replace('+','')
    review = " ".join(filter(lambda x:x[0]!='@', review.split())) #removes @
    review = re.sub(r'[0-9]+', '', review) #removes numbers
    review = review.encode('ascii', 'ignore').decode('ascii') #removes emojis
    review = ' '.join([word for word in re.split("\W+", review) if word not in stops])
    return review

In [19]:
tweets = pd.read_csv('tweets.csv')
tweets.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@USAirways Is there a phone line to call into ...
1,positive,@united Bag was finally delivered and intact. ...
2,positive,@usairways Thanks to Kevin and team at F38ish ...
3,negative,"@AmericanAir Yes, talked to them. FLL says is ..."
4,negative,@VirginAmerica and it's a really big bad thing...


### Clean the data

In [20]:
for i in tqdm(range(len(tweets))):
    tweets.text[i] = clean_tweet(tweets.text[i])
    time.sleep(0.001)

HBox(children=(FloatProgress(value=0.0, max=14640.0), HTML(value='')))




### Stratified split 

In [21]:
grp = tweets.groupby('airline_sentiment').apply(lambda x: x.sample(int(0.80*len(x))))
grp = grp.rename(columns = {'airline_sentiment' : 'sentiment'}).reset_index()
training_set = grp.rename(columns = {'level_1':'Document'}).drop(columns=['airline_sentiment'])

In [22]:
total_indices = list(range(0,len(tweets)))
train_indices = training_set.Document.tolist()
test_indices = [x for x in total_indices if x not in train_indices]

In [23]:
train_set = training_set.drop(columns=['Document'])
test_set = tweets.iloc[test_indices,].reset_index(drop=True).rename(columns={'airline_sentiment':'sentiment'})

In [24]:
test_set.sentiment.value_counts()

negative    1836
neutral      620
positive     473
Name: sentiment, dtype: int64

In [25]:
train_set.sentiment.value_counts()

negative    7342
neutral     2479
positive    1890
Name: sentiment, dtype: int64

### Corpus of words for each class

In [26]:
def create_corpus(df, Class):
    corpus = []
    sentences = df[df.sentiment==Class].text
    for sentence in sentences:
        words = sentence.split()
        for word in words:
            corpus.append(word)
    return (corpus, len(sentences)) #returns a tuple

In [29]:
#Each contains corpus of words and frequency of each class
neg_class = create_corpus(train_set,'negative')
pos_class = create_corpus(train_set,'positive')
neu_class = create_corpus(train_set,'neutral')

### Create vocabubulary of unique words in whole training set

In [30]:
Vocab_unique = []
for word in neg_class[0]:
    if word not in Vocab_unique:
        Vocab_unique.append(word)
        
for word in pos_class[0]:
    if word not in Vocab_unique:
        Vocab_unique.append(word)
        
for word in neu_class[0]:
    if word not in Vocab_unique:
        Vocab_unique.append(word)

N = len(Vocab_unique) #Number of unique words

### Calculate Prior Probabilities

In [31]:
p_neg = neg_class[1] / len(train_set)
p_pos = pos_class[1] / len(train_set)
p_neu = neu_class[1] / len(train_set)

priors = (p_neg, p_pos, p_neu)

In [32]:
def remove_unseen(example):
    test_corpus = clean_tweet(example).split()
    for word in test_corpus:
        if word not in Vocab_unique:
            test_corpus.remove(word)
    return test_corpus

### Calculate Posterior for each class

In [33]:
def posterior(example,Class):
    
    class_corpus = Class[0]
    n = len(class_corpus)
    
    test_corpus = remove_unseen(example) # clean and remove unseen
    posterior = 1

    for word in test_corpus:
        posterior = posterior * (class_corpus.count(word) +1 ) / (n+N)
    
    return posterior

### Prediction functions

In [34]:
def predict(example, priors=priors):
    
    neg = np.log(posterior(example, neg_class) * priors[0])
    pos = np.log(posterior(example, pos_class) * priors[1])
    neu = np.log(posterior(example, neu_class) * priors[2])
    
    Argmax = np.array([neg,pos,neu]).argmax()
    
    if Argmax == 0:
        return 'negative'
    elif Argmax == 1:
        return 'positive'
    else:
        return 'neutral'

In [35]:
def predict_on_batch(df):
    predictions = []
    print('Predicting...')
    for i in tqdm(range(len(df))):
        temp = predict(df.text[i])
        predictions.append(temp)
        time.sleep(0.001)
    return np.array(predictions)

### Evaluation Functions

In [36]:
def confusion_matrix(y_pred,y_true):
    cmat = np.zeros((3,3), dtype = int)
    cmat[0][0] = ((y_pred == 'negative') & (y_true == 'negative')).sum()
    cmat[0][1] = ((y_pred == 'negative') & (y_true == 'neutral')).sum()
    cmat[0][2] = ((y_pred == 'negative') & (y_true == 'positive')).sum()

    cmat[1][0] = ((y_pred == 'neutral') & (y_true == 'negative')).sum()
    cmat[1][1] = ((y_pred == 'neutral') & (y_true == 'neutral')).sum()
    cmat[1][2] = ((y_pred == 'neutral') & (y_true == 'positive')).sum()

    cmat[2][0] = ((y_pred == 'positive') & (y_true == 'negative')).sum()
    cmat[2][1] = ((y_pred == 'positive') & (y_true == 'neutral')).sum()
    cmat[2][2] = ((y_pred == 'positive') & (y_true == 'positive')).sum()
    
    print('------True Labels----')
    print('     neg', ' neu', ' pos')
    print('neg', cmat[0])
    print('neu', cmat[1])
    print('pos', cmat[2])
    
    return 

In [37]:
def model_evaluation(y_pred,y_true):

    labels = ['negative','neutral','positive']
    Eval = []
    for label in labels:
        temp = []
        for i in range(len(y_pred)):
            if y_pred[i] == label:
                  output = 'P'
            else:
                  output = 'N'
        
            #classifier predicted positive
            if output == 'P':
                if y_true[i] == label:
                    output = 'T'+ output
                else:
                    output = 'F' + output
        
            #classifier prediced negative
            if output == 'N':
                if y_true[i] == label:
                    output = 'F'+ output
                else:
                    output = 'T' + output
        
            temp.append(output)
        Eval.append(np.unique(np.array(temp), return_counts = True))   
        
    precision, recall, F1 = macro_avg(Eval)
        
    print('\n')
    print('Model Accuracy : ', round(100*np.sum(y_pred==y_true) / len(y_pred),2), '%')   
    print('Precision : ', round(precision,3))
    print('Recall    : ', round(recall,3))
    print('F1 score  : ', round(F1,3))

In [38]:
def macro_avg(Eval):
    Precisions = []
    Recalls = []
    F1s = []
    
    for i in range(3):
        FN = Eval[i][1][0]
        FP = Eval[i][1][1]
        TN = Eval[i][1][2]
        TP = Eval[i][1][3]

        precision = TP/(TP+FP)
        recall =    TP/(TP+FN)
        F1 = (2*precision*recall)/(precision+recall)

        Precisions.append(precision)
        Recalls.append(recall)
        F1s.append(F1)

    macro_precision = np.mean(Precisions)
    macro_recall = np.mean(Recalls)
    macro_F1 = np.mean(F1s)

    return macro_precision, macro_recall, macro_F1

## Evaluation 

In [39]:
predictions = predict_on_batch(test_set)

Predicting...


HBox(children=(FloatProgress(value=0.0, max=2929.0), HTML(value='')))




In [40]:
confusion_matrix(predictions, test_set.sentiment)

------True Labels----
     neg  neu  pos
neg [1740  325  171]
neu [ 74 245  52]
pos [ 22  50 250]


In [41]:
model_evaluation(predictions,test_set.sentiment)



Model Accuracy :  76.31 %
Precision :  0.738
Recall    :  0.624
F1 score  :  0.659
