In [93]:
from __future__ import division
import nltk 
import re
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import *

In [120]:
'''
Reading the Dataset (ISEAR Dataset)
'''
Data = pd.read_csv('my_table.csv',header=None)
'''
36 - Class Label
40 - Sentence
'''

'\n36 - Class Label\n40 - Sentence\n'

In [121]:
'''
Emotion Labels
'''
emotion_labels = ['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt']

In [122]:
'''
Negation words
'''
negation_words = ['not', 'neither', 'nor', 'but', 'however', 'although', 'nonetheless', 'despite', 'except', 'even though', 'yet']

In [123]:
'''
Returns a list of all corresponding class labels
'''
def class_labels(emotions):
    labels = []
    for e in emotions:
        labels.append(e)
    return labels

In [124]:
'''
Removes unnecessary characters from sentences
'''
def removal(sentences):
    sentence_list = []
    count = 0
    for sen in sentences:
        count += 1
#         print count
#         print sen
#         print type(sen)
        s = nltk.word_tokenize(sen)
        characters = ["á", "\xc3", "\xa1", "\n", ",", "."]
        new = ' '.join([i for i in s if not [e for e in characters if e in i]])
        sentence_list.append(new)
    return sentence_list

In [125]:
'''
POS-TAGGER, returns NAVA words
'''
def pos_tag(sentences):
    tags = []
    nava_sen = []
    for s in sentences:
        s_token = nltk.word_tokenize(s)
        pt = nltk.pos_tag(s_token)
        nava = []
        nava_words = []
        for t in pt:
            if t[1].startswith('NN') or t[1].startswith('JJ') or t[1].startswith('VB') or t[1].startswith('RB'):
                nava.append(t)
                nava_words.append(t[0])
        tags.append(nava)
        nava_sen.append(nava_words)
    return tags, nava_sen

In [126]:
'''
Performs stemming
'''
def stemming(sentences):
    sentence_list = []
    stemmer = PorterStemmer()
    for sen in sentences:
        st = ""
        for word in sen:
            word_l = word.lower()
            if len(word_l) >= 3:
                st += stemmer.stem(word_l) + " "
        w_set = nltk.word_tokenize(st)
        w_text = nltk.Text(w_set)
        sentence_list.append(w_text)
    return sentence_list

In [127]:
'''
Creating the dataframe
'''
def create_frame(Data):
    emotions = Data[36]
    sit = Data[40]
    labels = class_labels(emotions[1:50])
    sent = removal(sit[1:50])
    nava, sent_pt = pos_tag(sent)
    sentences = stemming(sent_pt)
    frame = pd.DataFrame({0 : labels,
                          1 : sentences})
    return frame

In [128]:
c = create_frame(Data)

In [129]:
'''
Reads the emotion representative words file
'''
def readfile(filename):
    f = open(filename,'r')
    representative_words = []
    for line in f.readlines():
        characters = ["\n", " ", "\r", "\t"]
        new = ''.join([i for i in line if not [e for e in characters if e in i]])
        representative_words.append(new)
    return representative_words

In [130]:
'''
Makes a list of all words semantically related to an emotion and Stemming
'''
def affect_wordlist(words):
    affect_words = []
    stemmer = PorterStemmer()
    for w in words:
        w_l = w.lower()
        word_stem = stemmer.stem(w_l)
        if word_stem not in affect_words:
            affect_words.append(word_stem)
    return affect_words

In [131]:
'''
Creating an emotion wordnet
'''
def emotion_word_set(emotions):
    word_set = {}
    for e in emotions:
        representative_words = readfile(e)
        wordlist = affect_wordlist(representative_words)
        word_set[e] = wordlist
    return word_set

In [132]:
def create_textbody(sentences):
    for sen in sentences:
        print sen
    return t

In [133]:
'''
Lexicon based approach - Check for lexicons
'''
def lexicon_based(sentences, word_set):
    text_vector = []
    for sen in sentences:
        s_vector = []
        for word in sen:
            w_vector = {}
            for emo in word_set:
                if word in word_set[emo]:
                    print word
                    try:
                        if emo not in w_vector[word]:
                            w_vector[word].append(emo)
                    except KeyError:
                        w_vector[word] = [emo]
            if w_vector:
                s_vector.append(w_vector)
        if not s_vector:
            text_vector.append(s_vector)
        else:
            text_vector.append(s_vector)
    return text_vector

In [115]:
'''
Lexicon based approach - Classify based on lexicons
'''
def classify_lexicon(text_vector, labels, emotion_labels):
    count = 0
    total = 0
    for j in range(len(text_vector)):
        sen = text_vector[j]
        sen_emo = np.empty(len(emotion_labels))
        sen_emo.fill(0)
        if sen:
            total += 1
            w_emo = []
            for word in sen:
                emotions =  word.values()[0][0]
                print emotions, type(emotions), j
                w_emo.append(emotions)
                i = emotion_labels.index(emotions)
                sen_emo[i] += 1
            print sen_emo
            winner = np.argwhere(sen_emo == np.amax(sen_emo))
            indices = winner.flatten().tolist()
            for i in indices:
                if emotion_labels[i] == labels[j]:
                    count += 1
                    break
                else:
                    print j, text_vector[j]
    accuracy = count/len(text_vector)
    tot_accuracy = count/total
    return accuracy, tot_accuracy

In [134]:
e = emotion_word_set(emotion_labels)
l = lexicon_based(c[1],e) 
a, b = classify_lexicon(l, c[0], emotion_labels)

love
infuri
lost
nauseou
afraid
peac
love
live
low
low
mean
humili
joy
wrong
accus
stupid
good
lost
lost
bad
guilt
guilt
guilt
guilt
accus
fail
fail
sick
joy <type 'str'> 0
[ 1.  0.  0.  0.  0.  0.  0.]
anger <type 'str'> 2
[ 0.  0.  1.  0.  0.  0.  0.]
sadness <type 'str'> 3
[ 0.  0.  0.  1.  0.  0.  0.]
disgust <type 'str'> 4
[ 0.  0.  0.  0.  1.  0.  0.]
fear <type 'str'> 8
[ 0.  1.  0.  0.  0.  0.  0.]
joy <type 'str'> 14
[ 1.  0.  0.  0.  0.  0.  0.]
joy <type 'str'> 15
[ 1.  0.  0.  0.  0.  0.  0.]
15 [{u'love': ['joy']}]
joy <type 'str'> 17
[ 1.  0.  0.  0.  0.  0.  0.]
17 [{u'live': ['joy']}]
shame <type 'str'> 18
[ 0.  0.  0.  0.  0.  1.  0.]
18 [{u'low': ['shame', 'sadness']}]
shame <type 'str'> 22
[ 0.  0.  0.  0.  0.  1.  0.]
22 [{u'mean': ['shame']}]
shame <type 'str'> 26
[ 0.  0.  0.  0.  0.  1.  0.]
joy <type 'str'> 28
[ 1.  0.  0.  0.  0.  0.  0.]
guilt <type 'str'> 29
[ 0.  0.  0.  0.  0.  0.  1.]
29 [{u'wrong': ['guilt']}]
guilt <type 'str'> 30
[ 0.  0.  0.  0.  0.  0

In [117]:
'''
Calculate pmi
'''
def pmi(x, y, sentences):
    count_x = 1
    count_y = 1
    count_xy = 1
    for sen in sentences:
        if x and y in sentences:
            count_xy += 1
            count_x += 1
            count_y += 1
        if x in sentences:
            count_x += 1
        if y in sentences:
            count_y += 1
        result = count_xy/(count_x * count_y)
    return result

In [135]:
print a
print b

0.224489795918
0.478260869565


In [40]:
'''
Emotion Detector
'''
c = create_frame(Data)
emo_word_net = emotion_word_set(emotion_set)

IOError: [Errno 2] No such file or directory: 'joy'

In [20]:
'''
Getting synonyms from wordnet synsets
'''
from nltk.corpus import wordnet as wn
jw = wn.synsets('shame')
for s in jw:
    v = s.name()
    print wn.synset(v).lemma_names()

[u'shame']
[u'shame', u'disgrace', u'ignominy']
[u'pity', u'shame']
[u'dishonor', u'disgrace', u'dishonour', u'attaint', u'shame']
[u'shame']
[u'shame']
[u'shame']
