In [2]:
'''
Emotion Detection - Lexicon Based approach
'''
from __future__ import division
import nltk 
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import *
from textblob.classifiers import NaiveBayesClassifier
from sklearn.cross_validation import KFold
from nltk.classify.naivebayes import NaiveBayesClassifier
from gensim import corpora, models, similarities
from nltk.corpus import wordnet as wn
import urllib
import urllib2

In [3]:
'''
Reading the Dataset (ISEAR Dataset)
'''
# Load the ISEAR CSV with header=None so the first row is read as data, not as
# column names; later cells address the columns by integer (Data[0], Data[1]).
Data = pd.read_csv('ISEAR.csv',header=None)
'''
36 - Class Label
40 - Sentence
'''

'\n36 - Class Label\n40 - Sentence\n'

In [4]:
'''
Emotion Labels
'''
# The emotion classes used throughout the notebook. The order matters:
# classify_lexicon looks up positions in this list when tallying votes.
emotion_labels = [
    'joy',
    'fear',
    'anger',
    'sadness',
    'disgust',
    'shame',
    'guilt',
]

In [5]:
'''
Returns a list of all corresponding class labels
'''
def class_labels(emotions, excluded=('shame', 'guilt')):
    '''
    Splits a sequence of emotion labels into kept labels and excluded positions.

    emotions : sequence of label strings, addressed by position (emotions[i]).
    excluded : labels to leave out. Defaults to 'shame' and 'guilt', the two
               values that used to be hard-coded here.

    Returns a tuple (labels, labelset, exclude):
        labels   - the kept labels, in input order
        labelset - the same labels, each wrapped in its own one-element list
        exclude  - the positions in `emotions` whose label was left out
    '''
    labels = []
    labelset = []
    exclude = []
    for i in range(len(emotions)):
        emotion = emotions[i]
        if emotion not in excluded:
            # The previous code appended an undefined name `e` here instead of
            # the label at position i.
            labels.append(emotion)
            labelset.append([emotion])
        else:
            exclude.append(i)
    return labels, labelset, exclude

In [6]:
'''
Removes unnecessary characters from sentences
'''
def removal(sentences):
    '''
    Tokenizes one piece of text and throws away unwanted tokens.

    sentences : the text to clean; it is handed to nltk.word_tokenize as-is.

    Returns the list of tokens that are not in the filter list below, in
    their original order.
    '''
    # Tokens that are discarded when they appear as a whole token.
    unwanted = ["á", "\xc3", "\xa1", "\n", ",", ".", "[", "]", ""]
    return [token for token in nltk.word_tokenize(sentences) if token not in unwanted]

In [8]:
'''
POS-TAGGER, returns NAVA words
'''
def pos_tag(sentences):
    '''
    Tags a token list with nltk.pos_tag and keeps only the "NAVA" words.

    sentences : list of tokens for one sentence.

    A token is kept when its tag starts with NN, JJ, VB or RB (presumably the
    noun, adjective, verb and adverb tag families of the tagger).

    Returns (nava, nava_words):
        nava       - the kept (token, tag) pairs
        nava_words - just the tokens of those pairs, in the same order
    '''
    kept_prefixes = ('NN', 'JJ', 'VB', 'RB')
    nava = [pair for pair in nltk.pos_tag(sentences) if pair[1].startswith(kept_prefixes)]
    nava_words = [pair[0] for pair in nava]
    return nava, nava_words

In [9]:
'''
Performs stemming
'''
def stemming(sentences):
    '''
    Lower-cases and Porter-stems the words of one sentence.

    sentences : list of words for a single sentence.

    Words shorter than three characters (after lower-casing) are dropped.

    Returns (w_text, st, w_set):
        w_text - nltk.Text built from the stemmed tokens
        st     - the stems joined into one string, each followed by a space
        w_set  - the token list obtained by re-tokenizing `st`
    '''
    porter = PorterStemmer()
    lowered = (word.lower() for word in sentences)
    # Every stem is followed by a single space, including the last one.
    st = "".join(porter.stem(w) + " " for w in lowered if len(w) >= 3)
    w_set = nltk.word_tokenize(st)
    return nltk.Text(w_set), st, w_set

In [10]:
'''
Write to file
'''
def write_to_file(filename, text):
    '''
    Writes str(text) to `filename`, replacing any existing content.

    filename : path of the file to create or overwrite.
    text     : any object; its str() form is what gets written.

    The file is opened in a `with` block so the handle is closed even when
    the write raises.
    '''
    with open(filename, 'w') as out:
        out.write(str(text))

In [11]:
'''
Creating the dataframe
'''
def create_frame(Data):
    '''
    Runs every row of the dataset through the cleaning pipeline and collects
    the results in a new DataFrame.

    Data : the loaded dataset; column 0 is read as the emotion label and
           column 1 as the text, row by row via Data[col][i].
           NOTE(review): a note in the loading cell mentions columns 36 and 40,
           while this function reads columns 0 and 1 - confirm which is right.

    For each row the text goes through removal -> pos_tag -> stemming.

    Returns (frame, sen_t, labels, sen_s):
        frame  - DataFrame with columns 0: label, 1: first value returned by
                 stemming, 2: stemmed string, 3: stemmed tokens, 4: [label]
        sen_t  - list of the stemmed token lists
        labels - list of the labels
        sen_s  - list of the stemmed strings
    '''
    labels, text_objects, stem_strings, stem_tokens = [], [], [], []
    for row in range(len(Data)):
        label = Data[0][row]
        situation = Data[1][row]
        labels.append(label)
        cleaned = removal(situation)
        _, nava_words = pos_tag(cleaned)
        text_obj, joined, tokens = stemming(nava_words)
        text_objects.append(text_obj)
        stem_strings.append(joined)
        stem_tokens.append(tokens)
    frame = pd.DataFrame({
        0: labels,
        1: text_objects,
        2: stem_strings,
        3: stem_tokens,
        4: [[label] for label in labels],
    })
    return frame, stem_tokens, labels, stem_strings

In [12]:
# Run the preprocessing pipeline over the whole dataset. create_frame returns,
# in order: the result DataFrame (c), the stemmed token lists (st), the labels
# (labels) and the stemmed sentence strings (senten).
c, st, labels, senten = create_frame(Data)

In [14]:
'''
Reads the emotion representative words file
'''
def readfile(filename):
    '''
    Reads a word-list file and returns one cleaned string per line.

    filename : path of the file to read.

    Every newline, space, carriage-return and tab character is removed from
    each line (not only at the ends). A line made up only of those characters
    yields an empty string, which is kept in the result.

    Returns the list of cleaned lines, in file order.
    '''
    strip_chars = ("\n", " ", "\r", "\t")
    representative_words = []
    # `with` closes the handle; the previous version never closed the file.
    with open(filename, 'r') as f:
        for line in f:
            representative_words.append(''.join(ch for ch in line if ch not in strip_chars))
    return representative_words

In [15]:
'''
Makes a list of all words semantically related to an emotion and Stemming
'''
def affect_wordlist(words):
    '''
    Lower-cases and Porter-stems each word and returns the distinct stems.

    words : iterable of word strings.

    Returns a list of stems without duplicates, ordered by first appearance.
    '''
    porter = PorterStemmer()
    unique_stems = []
    already_seen = set()
    for raw in words:
        stem = porter.stem(raw.lower())
        if stem in already_seen:
            continue
        already_seen.add(stem)
        unique_stems.append(stem)
    return unique_stems

In [16]:
'''
Creating an emotion wordnet
'''
def emotion_word_set(emotions):
    '''
    Builds the emotion lexicon: one stemmed word list per emotion.

    emotions : iterable of emotion names. Each name is passed to readfile as
               the file name to load, and the words read are passed through
               affect_wordlist.

    Returns a dict mapping each emotion name to its list of stems.
    '''
    return {emotion: affect_wordlist(readfile(emotion)) for emotion in emotions}

In [25]:
'''
Lexicon based approach - Check for lexicons
The function checks if the input sentence contains any lexicons. If it does, it stores them as shown in the example.
eg: {u'love': ['joy']}, {u'death': ['fear']}
The sentence contains two words, 'love' (which indicates joy) and 'death' (which indicates fear).
'''
def lexicon_based(sentences, word_set):
    '''
    Finds, for every sentence, the words that occur in the emotion lexicon.

    sentences : iterable of sentences, each an iterable of words.
    word_set  : mapping from emotion name to the collection of words for it.

    Returns a list with one entry per sentence. Each entry is a list holding
    one single-key dict per matching word, mapping the word to the emotions
    whose word list contains it, e.g. [{u'love': ['joy']}, {u'death': ['fear']}].
    Sentences without any lexicon word get an empty list.
    '''
    text_vector = []
    for sentence in sentences:
        hits = []
        for word in sentence:
            # Emotions are collected in the iteration order of word_set.
            matched = [emo for emo in word_set if word in word_set[emo]]
            if matched:
                hits.append({word: matched})
        text_vector.append(hits)
    return text_vector

In [26]:
'''
Lexicon based approach - Classify based on lexicons
This function tries to detect the emotion in the sentence based on the lexicons extracted by the function named 'lexicon_based'
'''
def classify_lexicon(text_vector, labels, emotion_labels):
    '''
    Scores the lexicon-based classifier against the true labels.

    text_vector    : output of lexicon_based - one list per sentence, each
                     holding single-key dicts {word: [emotions]}.
    labels         : true label per sentence, read as labels[j].
    emotion_labels : list of all emotion names; every emotion that votes must
                     be in it, otherwise list.index raises ValueError.

    Each matching word votes once, for the FIRST emotion in its list. A
    sentence counts as correct when its true label is among the emotions with
    the highest vote count, so a tie that includes the true label is a hit.

    Returns (accuracy, tot_accuracy):
        accuracy     - correct / number of sentences
        tot_accuracy - correct / number of sentences with at least one
                       lexicon word
    Either value is 0.0 when its denominator is zero (this used to raise
    ZeroDivisionError).
    '''
    correct = 0
    with_lexicon = 0
    for j, sen in enumerate(text_vector):
        if not sen:
            continue
        with_lexicon += 1
        votes = [0] * len(emotion_labels)
        for word in sen:
            # next(iter(...)) reads the single value of the dict on Python 2
            # and Python 3; values()[0] only works on Python 2.
            emotion = next(iter(word.values()))[0]
            votes[emotion_labels.index(emotion)] += 1
        top = max(votes)
        winners = [emotion_labels[i] for i, v in enumerate(votes) if v == top]
        if labels[j] in winners:
            correct += 1
    n_sentences = len(text_vector)
    # float() keeps the result a true ratio even without the file-level
    # `from __future__ import division`.
    accuracy = float(correct) / n_sentences if n_sentences else 0.0
    tot_accuracy = float(correct) / with_lexicon if with_lexicon else 0.0
    return accuracy, tot_accuracy

In [23]:
# Build the lexicon: a dict mapping each emotion label to its stemmed word list
# (emotion_word_set passes each label to readfile as the file name to load).
e = emotion_word_set(emotion_labels)
# c[1] is the frame column holding the first value returned by stemming for
# each sentence; l gets one list of {word: [emotions]} hits per sentence.
l = lexicon_based(c[1],e) 
# c[0] is the column of true labels. a = correct / all sentences,
# b = correct / sentences that contain at least one lexicon word.
a, b = classify_lexicon(l, c[0], emotion_labels)
print "The total accuracy of the lexicon based approach = ", a*100 , "%"
print "The accuracy when only those sentences that contain the lexicon are considered for evaluation = ",b*100, "%"

The total accuracy of the lexicon based approach =  22.7514635444 %
The accuracy when only those sentences that contain the lexicon are considered for evaluation =  54.0113708149 %
