In [1]:
#separate data into text and labels
import pandas as pd
import numpy as np

# Load the dataset and take the two columns directly. Column access is
# vectorised, whereas DataFrame.iterrows() builds a Series object for every row.
df = pd.read_csv('text.csv')

text = df['text'].tolist()        # values of the 'text' column, in file order
labels = df['label'].to_numpy()   # values of the 'label' column, aligned with `text`

print(labels)

[4 0 4 ... 5 3 5]


In [None]:
#import preprocessing module(s), tokenise etc
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk import WordNetLemmatizer, pos_tag
from nltk.corpus import stopwords, wordnet
import nltk
import string
import pickle

# Tokenising the whole corpus is slow, so the result is cached on disk and only
# rebuilt when the cache file is missing (the rebuild needs `text` from the
# data-loading cell).
# NOTE: pickle.load can execute arbitrary code - only load cache files that were
# produced locally by this notebook.
# NOTE(review): an existing 'tokens' file may have been written by the earlier,
# disabled version of this code, which discarded the punctuation table and removed
# stop words from the list it was iterating over; delete the file to rebuild it.
try:
    with open('tokens', 'rb') as f:
        tokens = pickle.load(f)
except FileNotFoundError:
    strip_punctuation = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))  # built once, not once per token

    tokens = []
    for s in text:
        # str.translate returns the cleaned string; building the table alone
        # leaves `s` untouched
        words = word_tokenize(s.translate(strip_punctuation))
        # Filter into a new list: removing from a list while iterating over it
        # skips the element that follows each removed item
        tokens.append([t for t in words if t not in stop_words])

    with open('tokens', 'wb') as f:
        pickle.dump(tokens, f)

# Show a small sample rather than dumping the entire corpus into the cell output
print(tokens[:5])

In [None]:
def pos_tagger(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: 'J' -> adjective,
    'N' -> noun, 'V' -> verb, 'R' -> adverb.

    Returns:
        The corresponding `wordnet` constant, or None when the tag starts with
        none of those letters.
    """
    prefix_to_constant = (
        ('J', 'ADJ'),
        ('N', 'NOUN'),
        ('V', 'VERB'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_to_constant:
        if tag.startswith(prefix):
            # Looked up lazily so `wordnet` is only touched for a matching tag
            return getattr(wordnet, constant_name)
    return None

lemma = WordNetLemmatizer()

# POS-tagging the corpus is slow, so the tagged sentences are cached on disk and
# only rebuilt (from `tokens`) when the cache file is missing.
# NOTE: pickle.load can execute arbitrary code - only load cache files that were
# produced locally by this notebook.
try:
    with open('wn_tagged', 'rb') as f:
        wordnet_tagged = pickle.load(f)
except FileNotFoundError:
    # One list per sentence of (word, WordNet POS or None) pairs
    wordnet_tagged = [
        [(word, pos_tagger(tag)) for word, tag in pos_tag(sent)]
        for sent in tokens
    ]
    with open('wn_tagged', 'wb') as f:
        pickle.dump(wordnet_tagged, f)

In [2]:
import pickle
# Lemmatisation is cached on disk and only recomputed when the cache file is
# missing. The rebuild path needs `lemma` and `wordnet_tagged` from the POS-tagging
# cell, so that cell must have been run first in that case.
# NOTE: pickle.load can execute arbitrary code - only load cache files that were
# produced locally by this notebook.
try:
    with open('lemmatised', 'rb') as f:
        lemmatised = pickle.load(f)
except FileNotFoundError:
    lemmatised = [
        [
            # Words without a WordNet POS fall back to the lemmatiser's default
            lemma.lemmatize(word) if pos is None else lemma.lemmatize(word, pos)
            for word, pos in sent
        ]
        for sent in wordnet_tagged
    ]
    with open('lemmatised', 'wb') as f:
        pickle.dump(lemmatised, f)


In [3]:
# Carve the training, validation and test sentences out of the first 14% of the
# corpus (10% / 2% / 2%). Only this reduced portion is used because resources for
# the full dataset cannot be allocated.
n_docs = len(lemmatised)
train_end = round(n_docs*0.1)
val_end = round(n_docs*0.12)
test_end = round(n_docs*0.14)

sents_train = lemmatised[:train_end]
sents_val = lemmatised[train_end:val_end]
sents_test = lemmatised[val_end:test_end]

In [4]:
class Vectoriser():
    """Bag-of-words vectoriser built on a learned token -> position vocabulary.

    Attributes are documented inline; `word_set` is the only state.
    """

    def __init__(self, corpus=None):
        """Create an empty vectoriser and optionally fit it straight away.

        Args:
            corpus: optional iterable of documents, each an iterable of tokens.
        """
        # Maps each known token to its index in the feature vector
        self.word_set = {}
        # Compare against None instead of relying on truthiness: array-like
        # corpora can raise when evaluated as a boolean, and an empty corpus is
        # simply a no-op for fit().
        if corpus is not None:
            self.fit(corpus)

    def fit(self, corpus):
        """Add every unseen token in `corpus` to the vocabulary.

        Tokens are numbered in order of first appearance; tokens that are already
        known keep their existing index, so fit() may be called more than once.

        Args:
            corpus: iterable of documents, each an iterable of hashable tokens.
        """
        vocab = self.word_set
        for doc in corpus:
            for token in doc:
                # setdefault only inserts when the token is new
                vocab.setdefault(token, len(vocab))

    def transform(self, doc):
        """Return the token-count vector of `doc` over the learned vocabulary.

        Args:
            doc: iterable of tokens; tokens outside the vocabulary are ignored.

        Returns:
            1-D np.short array with one entry per vocabulary token.
        """
        vec = np.zeros([len(self.word_set)], dtype=np.short)
        for token in doc:
            index = self.word_set.get(token)
            if index is not None:
                vec[index] += 1
        return vec

In [5]:
# Learn the vocabulary from the training sentences
VVV = Vectoriser(corpus=sents_train)

In [6]:
# Fill a pre-allocated int8 matrix one row at a time. Collecting every int16
# vector in a list first and converting afterwards needs roughly three times the
# memory of the finished matrix at its peak.
# NOTE: int8 holds counts up to 127 - assumes no token occurs more often than that
# within a single document.
train_vecs = np.zeros((len(sents_train), len(VVV.word_set)), dtype=np.byte)
for row, sent in enumerate(sents_train):
    train_vecs[row] = VVV.transform(sent)
# Matrices for sents_val / sents_test can be built the same way when needed.

In [7]:
from sklearn.naive_bayes import MultinomialNB
# Labels for the first 10% of the data (presumably aligned with sents_train,
# which is sliced from `lemmatised` with the same fraction)
y_train = labels[:round(len(labels)*0.1)]

# Reuse the saved model when it exists; otherwise train it and save it so the
# notebook can be reproduced from scratch.
# NOTE: pickle.load can execute arbitrary code - only load model files that were
# produced locally by this notebook.
try:
    with open('trained_NB.pickle', 'rb') as f:
        classifier = pickle.load(f)
except FileNotFoundError:
    classifier = MultinomialNB()
    classifier.fit(train_vecs, y_train)
    with open('trained_NB.pickle', 'wb') as f:
        pickle.dump(classifier, f)

In [8]:
# Validation labels: the slice of `labels` between 10% and 12% of the data
y_val = labels[round(len(labels)*0.1):round(len(labels)*0.12)]

# Vectorise the validation sentences with the fitted vocabulary and classify them
val_vecs = np.array([VVV.transform(sent) for sent in sents_val], dtype=np.byte)
preds = classifier.predict(val_vecs)

# Accuracy = number of predictions equal to the true label / number of predictions
total = sum(1 for idx in range(len(preds)) if preds[idx] == y_val[idx])
accuracy = total/len(preds)
print(accuracy)

0.8055422264875239


Accuracy of the first Naive Bayes implementation: 80.55% (2 d.p.)

In [9]:
# Per-class tallies, one [TP, FP, FN] row per class; TN can be inferred from the
# TPs of the other classes.
TP, FP, FN = range(3)
val_pns = [[0, 0, 0] for _ in range(6)]

for idx, predicted in enumerate(preds):
    actual = y_val[idx]
    if predicted == actual:
        val_pns[predicted][TP] += 1  # correct: TP for the predicted/true class
    else:
        val_pns[actual][FN] += 1     # the true class was missed
        val_pns[predicted][FP] += 1  # the predicted class was wrongly assigned

# Micro average first: pool the tallies of all classes (column sums)
total_tps, total_fps, total_fns = np.sum(val_pns, axis=0)

precision = total_tps/(total_tps+total_fps)
recall = total_tps/(total_tps+total_fns)

fscore = 2*(precision*recall)/(precision+recall)


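When using micro averaging on a single-label multi-class problem, every misclassification counts as exactly one false positive (for the predicted class) and one false negative (for the true class), so total FP = total FN and precision, recall and F1-score are all equal to accuracy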

In [11]:
from sklearn.metrics import f1_score

# Class id -> emotion name, used to label the per-class report
emo_map = {
    0 : "Sadness",
    1 : "Joy",
    2 : "Love",
    3 : "Anger",
    4 : "Fear",
    5 : "Surprise"
}


def _ratio(numerator, denominator):
    """Return numerator/denominator, or 0.0 when the denominator is zero."""
    return numerator/denominator if denominator else 0.0


mac_f1 = f1_score(y_val, preds, average='macro')

# Per-class [precision, recall, F1] computed from the [TP, FP, FN] tallies.
# A class that is never predicted (TP+FP == 0), never present (TP+FN == 0) or has
# no true positives would otherwise raise ZeroDivisionError; it scores 0.0 instead.
pr_re = []
for tp, fp, fn in val_pns:
    pre = _ratio(tp, tp+fp)
    rec = _ratio(tp, tp+fn)
    f1 = _ratio(2*(pre*rec), pre+rec)
    pr_re.append([pre, rec, f1])

for i, (pre, rec, f1) in enumerate(pr_re):
    print("{}: Precision = {}, Recall = {}, F1-Score = {}".format(emo_map[i], pre, rec, f1))
print(mac_f1)


Sadness: Precision = 0.7859116022099447, Recall = 0.9343185550082101, F1-Score = 0.8537134283570892
Joy: Precision = 0.7786743515850144, Recall = 0.9414634146341463, F1-Score = 0.8523659305993689
Love: Precision = 0.8589743589743589, Recall = 0.3901018922852984, F1-Score = 0.5365365365365365
Anger: Precision = 0.9004474272930649, Recall = 0.7232704402515723, F1-Score = 0.802192326856004
Fear: Precision = 0.8663911845730028, Recall = 0.6912087912087912, F1-Score = 0.7689486552567236
Surprise: Precision = 0.9210526315789473, Recall = 0.109375, F1-Score = 0.19553072625698326
0.6682146006437842


Sadness: Precision = 0.7859116022099447, Recall = 0.9343185550082101, F1-Score = 0.8537134283570892 <br>
Joy: Precision = 0.7786743515850144, Recall = 0.9414634146341463, F1-Score = 0.8523659305993689 <br>
Love: Precision = 0.8589743589743589, Recall = 0.3901018922852984, F1-Score = 0.5365365365365365 <br>
Anger: Precision = 0.9004474272930649, Recall = 0.7232704402515723, F1-Score = 0.802192326856004 <br>
Fear: Precision = 0.8663911845730028, Recall = 0.6912087912087912, F1-Score = 0.7689486552567236 <br>
Surprise: Precision = 0.9210526315789473, Recall = 0.109375, F1-Score = 0.19553072625698326 <br>
Macro averaged F1-Score = 0.6682146006437842

NOTES:  Surprise and Love have very low recall (most of their true examples are missed); Fear and Anger also miss a fair share <br>
        Joy is being over-predicted (it has the lowest precision) 

In [9]:
# Class distribution of the training labels: counts[c] = number of labels equal to c
counts = [0] * 6
for label in y_train:
    counts[label] += 1

print(counts)
print(len(y_train))

[12265, 14059, 3409, 5688, 4720, 1540]
41681


In [13]:
# Class distribution of the validation labels: counts[c] = number of labels equal to c
counts = [0] * 6
for label in y_val:
    counts[label] += 1

print(counts)

[2436, 2870, 687, 1113, 910, 320]


In [13]:
# Build a more balanced training set: at most MAX_PER_CLASS sentences per label,
# taken in corpus order.
MAX_PER_CLASS = 7000

# Positions between 10% and 14% of the corpus are the validation and test splits.
# They are skipped here; otherwise held-out sentences would leak into the new
# training set and inflate any score measured on them.
n_docs = len(lemmatised)
held_out = range(round(n_docs*0.1), round(n_docs*0.14))

counts = [0, 0, 0, 0, 0, 0]  # sentences accepted so far, per class id
sents_train_2 = []
train_labels_2 = []
for i in range(n_docs):
    if i in held_out:
        continue
    if counts[labels[i]] < MAX_PER_CLASS:
        sents_train_2.append(lemmatised[i])
        train_labels_2.append(labels[i])
        counts[labels[i]] += 1

print(sents_train_2[3333])

['be', 'invite', 'dinner', 'my', 'bossman', 'friday', 'be', 'definitely', 'comfort', 'incredibly', 'invigorate', 'feel', 'be', 'some', 'pretty', 'tortured', 'experience', 'past', 'year']


In [14]:
# Vocabulary learned from the balanced training set
VVV_2 = Vectoriser(sents_train_2)

# Vectorise with VVV_2, the vectoriser fitted on this data. Using the earlier VVV
# here would silently drop every token that is missing from the first vocabulary.
# The int8 matrix is pre-allocated and filled row by row so the int16 vectors
# returned by transform() are never all held in memory at once.
train_vecs_2 = np.zeros((len(sents_train_2), len(VVV_2.word_set)), dtype=np.byte)
for row, sent in enumerate(sents_train_2):
    train_vecs_2[row] = VVV_2.transform(sent)

In [15]:
# Train a second Naive Bayes model on the balanced training set
# (fit() returns the fitted estimator itself)
classifier_2 = MultinomialNB().fit(train_vecs_2, train_labels_2)

# Save the fitted model so it can be reloaded later without retraining
with open('trained_NB_2.pickle', 'wb') as f:
    pickle.dump(classifier_2, f)