In [1]:
import warnings

warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import seaborn as sns
%matplotlib inline


pd.options.mode.use_inf_as_na = True
sns.set(rc={'figure.figsize':(11.7,8.27)})

BASEDIR = os.path.dirname(os.path.dirname(os.path.abspath('README.md')))
DATAPATH = os.path.join(BASEDIR, 'data')
CHECKPOINT_PATH = os.path.join(BASEDIR, 'checkpoints')


In [3]:
df = pd.read_csv(os.path.join(DATAPATH, 'ISEAR_dataset.csv'), names=['#', 'emotions', 'texts'])
df.head()

Unnamed: 0,#,emotions,texts
0,0,joy,On days when I feel close to my partner and ot...
1,1,fear,Every time I imagine that someone I love or I ...
2,2,anger,When I had been obviously unjustly treated and...
3,3,sadness,When I think about the short time that we live...
4,4,disgust,At a gathering I found myself involuntarily si...


In [4]:
df['texts'][0]

'On days when I feel close to my partner and other friends.   \nWhen I feel at peace with myself and also experience a close  \ncontact with people whom I regard greatly.'

In [5]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [6]:
_stopwords = stopwords.words('english')
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()

In [7]:
def _tokenize(word):
    tokens = tokenizer.tokenize(word)
    lems = []
    for word in tokens:
 
        if word not in _stopwords:
            lems.append(lemmatizer.lemmatize(word))
            
    return ' '.join(lems)

In [8]:
_tokenize('what is this all shit happening in my life?')

'shit happening life'

In [9]:
df['clean_texts'] = df['texts'].apply(_tokenize)

In [10]:
df.head()

Unnamed: 0,#,emotions,texts,clean_texts
0,0,joy,On days when I feel close to my partner and ot...,On day I feel close partner friend When I feel...
1,1,fear,Every time I imagine that someone I love or I ...,Every time I imagine someone I love I could co...
2,2,anger,When I had been obviously unjustly treated and...,When I obviously unjustly treated possibility ...
3,3,sadness,When I think about the short time that we live...,When I think short time live relate period lif...
4,4,disgust,At a gathering I found myself involuntarily si...,At gathering I found involuntarily sitting nex...


In [11]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
fenc = TfidfVectorizer()
fenc.fit_transform(df['clean_texts'])

<7446x8132 sparse matrix of type '<class 'numpy.float64'>'
	with 75178 stored elements in Compressed Sparse Row format>

In [13]:
vocab  = fenc.vocabulary_

In [14]:
vocab

{'on': 4975,
 'day': 1864,
 'feel': 2786,
 'close': 1363,
 'partner': 5181,
 'friend': 3040,
 'when': 7953,
 'peace': 5218,
 'also': 337,
 'experience': 2642,
 'contact': 1597,
 'people': 5245,
 'regard': 5898,
 'greatly': 3235,
 'every': 2552,
 'time': 7334,
 'imagine': 3610,
 'someone': 6696,
 'love': 4295,
 'could': 1679,
 'serious': 6401,
 'illness': 3605,
 'even': 2546,
 'death': 1877,
 'obviously': 4923,
 'unjustly': 7642,
 'treated': 7450,
 'possibility': 5449,
 'elucidating': 2417,
 'think': 7274,
 'short': 6484,
 'live': 4236,
 'relate': 5927,
 'period': 5261,
 'life': 4194,
 'use': 7729,
 'at': 562,
 'gathering': 3115,
 'found': 3003,
 'involuntarily': 3892,
 'sitting': 6549,
 'next': 4813,
 'two': 7520,
 'expressed': 2670,
 'opinion': 4995,
 'considered': 1580,
 'low': 4301,
 'discriminating': 2142,
 'realized': 5826,
 'directing': 2092,
 'feeling': 2789,
 'discontent': 2129,
 'way': 7904,
 'trying': 7490,
 'put': 5695,
 'blame': 849,
 'instead': 3806,
 'sorting': 6720,
 'fe

In [15]:
for w in sorted(vocab, key=vocab.get, reverse=True):
    print(w, vocab[w])

zoophiliac 8131
zone 8130
zombie 8129
zomba 8128
zipper 8127
zip 8126
zig 8125
zhu 8124
zesco 8123
zero 8122
zemba 8121
zeeland 8120
zealander 8119
zealand 8118
zambia 8117
zambezi 8116
zalu 8115
zaire 8114
zagging 8113
yukky 8112
yugoslavia 8111
yr 8110
youth 8109
yournals 8108
your 8107
youngstters 8106
youngster 8105
youngish 8104
youngest 8103
younger 8102
young 8101
you 8100
york 8099
yielding 8098
yield 8097
yet 8096
yesterday 8095
yes 8094
yellow 8093
yelling 8092
yelled 8091
yell 8090
yeaterday 8089
years 8088
yearrs 8087
year 8086
ye 8085
yavanna 8084
yatch 8083
yastrebetz 8082
yard 8081
yanu 8080
xmas 8079
xiith 8078
wurm 8077
wth 8076
wrote 8075
wrongly 8074
wrong 8073
written 8072
writing 8071
write 8070
writ 8069
wristwatch 8068
wrinkled 8067
wounded 8066
wound 8065
would 8064
worthy 8063
worthwhile 8062
worthless 8061
worth 8060
wort 8059
worst 8058
worse 8057
worry 8056
worried 8055
worm 8054
world 8053
workshift 8052
workplace 8051
worknig 8050
workmate 8049
working 804

team 7179
teaching 7178
teacher 7177
teach 7176
tea 7175
te 7174
tchaikovsdy 7173
taxi 7172
tax 7171
tavern 7170
taught 7169
tatooed 7168
taste 7167
tasman 7166
task 7165
tart 7164
tarmac 7163
target 7162
tapped 7161
tapism 7160
tape 7159
tap 7158
tanzania 7157
tank 7156
tampon 7155
tampering 7154
talking 7153
talkiing 7152
talked 7151
talkative 7150
talk 7149
tale 7148
taking 7147
taken 7146
take 7145
tailor 7144
tail 7143
tai 7142
tage 7141
tactlessly 7140
tactical 7139
tackle 7138
tackels 7137
tackeled 7136
tablet 7135
table 7134
ta 7133
system 7132
syphilitic 7131
syphilis 7130
syndrome 7129
synchronicaly 7128
symphony 7127
sympathy 7126
sympathetic 7125
sydney 7124
swollen 7123
switched 7122
switchboard 7121
switch 7120
swiss 7119
swing 7118
swindled 7117
swindle 7116
swimming 7115
swim 7114
swifty 7113
swerving 7112
swerved 7111
swerve 7110
swept 7109
swelling 7108
swell 7107
sweetheart 7106
sweet 7105
sweep 7104
swedish 7103
sweden 7102
sweatheart 7101
swearing 7100
swan 7099
sw

shovel 6495
shove 6494
shouting 6493
shouted 6492
shout 6491
shoulder 6490
shotgun 6489
shot 6488
shortly 6487
shorter 6486
shortcoming 6485
short 6484
shore 6483
shopping 6482
shoplifting 6481
shoplifted 6480
shopkeeper 6479
shop 6478
shooting 6477
shoot 6476
shook 6475
shoe 6474
shocking 6473
shocked 6472
shock 6471
shit 6470
shirt 6469
shirked 6468
ship 6467
shiny 6466
shining 6465
shima 6464
shillings 6463
shilling 6462
shifted 6461
shift 6460
shepherd 6459
shelter 6458
shell 6457
shelf 6456
sheet 6455
sheer 6454
sheen 6453
shed 6452
she 6451
sharply 6450
sharp 6449
shark 6448
sharing 6447
shared 6446
share 6445
shaped 6444
shape 6443
shameless 6442
shameful 6441
shamed 6440
shame 6439
shall 6438
shaking 6437
shaken 6436
shaked 6435
shake 6434
shaft 6433
shadow 6432
shabby 6431
sgts 6430
sexually 6429
sexual 6428
sexistly 6427
sex 6426
sewer 6425
sevice 6424
severl 6423
severely 6422
severe 6421
several 6420
seventeen 6419
seven 6418
sevely 6417
settler 6416
settlement 6415
settled

range 5782
rang 5781
randomly 5780
ran 5779
rampant 5778
rally 5777
raising 5776
raised 5775
raise 5774
raio 5773
rainy 5772
raining 5771
rained 5770
raincoat 5769
rain 5768
railway 5767
railing 5766
rail 5765
raided 5764
raid 5763
raging 5762
raggy 5761
rage 5760
rag 5759
rafting 5758
raflected 5757
radio 5756
radiator 5755
racket 5754
rack 5753
racist 5752
racism 5751
racing 5750
racially 5749
racial 5748
race 5747
rabid 5746
rabbit 5745
quota 5744
quiz 5743
quitted 5742
quite 5741
quit 5740
quietness 5739
quietly 5738
quietest 5737
quieter 5736
quiet 5735
quickly 5734
quicker 5733
quick 5732
queuing 5731
queue 5730
quetions 5729
questionned 5728
questionnaire 5727
questioning 5726
questioned 5725
questionaire 5724
questionable 5723
question 5722
query 5721
queii 5720
queer 5719
queensland 5718
queen 5717
quarter 5716
quarrelling 5715
quarrelled 5714
quarrell 5713
quarreling 5712
quarreled 5711
quarrel 5710
quarelling 5709
quarelled 5708
quality 5707
qualifying 5706
qualify 5705
qual

pair 5124
painting 5123
painted 5122
paint 5121
painfully 5120
painful 5119
pain 5118
paid 5117
page 5116
paedophiliac 5115
paddy 5114
paddock 5113
paddling 5112
pact 5111
packet 5110
packed 5109
package 5108
pack 5107
pacience 5106
p3 5105
p110 5104
oxygen 5103
owning 5102
owner 5101
owned 5100
owed 5099
overwhelms 5098
overwhelming 5097
overwhelmed 5096
overturned 5095
overturn 5094
overture 5093
overtook 5092
overtime 5091
overtaking 5090
overtake 5089
overt 5088
overstrained 5087
overslept 5086
overshadowing 5085
oversee 5084
overseas 5083
overreacted 5082
overreact 5081
overpriced 5080
overpowering 5079
overnight 5078
overload 5077
overjoyed 5076
overindulged 5075
overheard 5074
overhear 5073
overhastingly 5072
overflowed 5071
overflow 5070
overeating 5069
overeaten 5068
overdue 5067
overdrawed 5066
overdosed 5065
overcooked 5064
overcome 5063
overcame 5062
overboard 5061
overate 5060
overall 5059
over 5058
oven 5057
outwardly 5056
outward 5055
outskirt 5054
outside 5053
outset 50

lung 4320
luncheonette 4319
lunch 4318
lunatic 4317
lump 4316
lumber 4315
luggage 4314
lucky 4313
luckily 4312
luck 4311
lucas 4310
luapula 4309
lt2 4308
lp 4307
lowland 4306
lowest 4305
lowering 4304
lowered 4303
lower 4302
low 4301
loving 4300
lovesickness 4299
lover 4298
lovely 4297
loved 4296
love 4295
lout 4294
lousy 4293
louse 4292
lounge 4291
loudmouth 4290
loudly 4289
loudest 4288
loud 4287
lot 4286
lost 4285
loss 4284
losing 4283
lose 4282
los 4281
lorry 4280
lord 4279
loosing 4278
loose 4277
loop 4276
loooked 4275
looking 4274
looked 4273
look 4272
loo 4271
longlasting 4270
longest 4269
longer 4268
longed 4267
long 4266
lonely 4265
loneliness 4264
london 4263
lonavala 4262
lolly 4261
loking 4260
logic 4259
loft 4258
lodging 4257
lodger 4256
locking 4255
locker 4254
locked 4253
lock 4252
location 4251
located 4250
locally 4249
local 4248
loathed 4247
loaned 4246
loan 4245
loaf 4244
load 4243
lliving 4242
lizzy 4241
lizard 4240
living 4239
liver 4238
lived 4237
live 4236
little

hoping 3515
hopelessness 3514
hopelessly 3513
hopeless 3512
hoped 3511
hope 3510
hooted 3509
hooked 3508
hook 3507
hong 3506
honey 3505
honestly 3504
honest 3503
honduran 3502
honda 3501
homosexuality 3500
homosexual 3499
homework 3498
hometown 3497
homesick 3496
homecoming 3495
home 3494
holy 3493
holocaust 3492
hollow 3491
holidays 3490
holiday 3489
hole 3488
holding 3487
holdidays 3486
holder 3485
hold 3484
hockey 3483
hobby 3482
hitting 3481
hitler 3480
hitchhiking 3479
hitchcock 3478
hitch 3477
hit 3476
history 3475
historical 3474
hissingen 3473
his 3472
hiroshima 3471
hired 3470
hire 3469
hipertonia 3468
hinted 3467
hint 3466
hinged 3465
hindsight 3464
hindrance 3463
hindered 3462
himsellf 3461
hill 3460
hiking 3459
hiked 3458
highway 3457
highschool 3456
highly 3455
highest 3454
higher 3453
high 3452
hiding 3451
hide 3450
hidden 3449
hid 3448
hi 3447
hesitated 3446
hesitate 3445
hesitant 3444
hero 3443
hernan 3442
hereafter 3441
here 3440
herd 3439
herb 3438
her 3437
hepatitis 

exhausting 2619
exercise 2618
exemple 2617
executive 2616
execution 2615
excused 2614
excuse 2613
excursion 2612
excreted 2611
excrement 2610
exclusively 2609
exclusion 2608
excluded 2607
exclude 2606
exciting 2605
excitement 2604
excitedly 2603
excited 2602
exchange 2601
exces 2600
exceptional 2599
exception 2598
except 2597
excellently 2596
excellent 2595
exceedingly 2594
exceeded 2593
excaping 2592
exboyfriend 2591
exasperation 2590
exarcise 2589
exams 2588
example 2587
examplary 2586
examns 2585
examn 2584
examinor 2583
examining 2582
examiner 2581
examined 2580
examine 2579
examinator 2578
examinations 2577
examination 2576
examinateur 2575
exam 2574
exaltation 2573
exaggerating 2572
exaggerated 2571
exacution 2570
exactly 2569
exact 2568
exacly 2567
ex 2566
evrey 2565
evokes 2564
evoked 2563
evil 2562
eveyone 2561
everywhere 2560
everytime 2559
everything 2558
everythiing 2557
everyone 2556
everyday 2555
everybody 2554
everyboby 2553
every 2552
ever 2551
eventually 2550
event 254

declared 1898
declaration 1897
deck 1896
decisive 1895
decision 1894
decided 1893
decide 1892
deception 1891
decently 1890
decent 1889
decency 1888
december 1887
deceiving 1886
deceives 1885
deceived 1884
deceive 1883
deceitful 1882
decapitation 1881
decadent 1880
debut 1879
debt 1878
death 1877
dear 1876
dean 1875
dealing 1874
dealer 1873
deal 1872
deafening 1871
deaf 1870
deadline 1869
dead 1868
de 1867
dazzling 1866
daylight 1865
day 1864
dawn 1863
daughter 1862
dating 1861
dated 1860
date 1859
data 1858
dash 1857
darling 1856
darkness 1855
darker 1854
dark 1853
dared 1852
dare 1851
dardness 1850
danube 1849
dansing 1848
dansed 1847
danse 1846
dannielsson 1845
dangerously 1844
dangerous 1843
danger 1842
dandy 1841
dancing 1840
danced 1839
dance 1838
dampened 1837
damp 1836
dame 1835
damaging 1834
damaged 1833
damage 1832
dam 1831
dallas 1830
daily 1829
daddy 1828
dad 1827
dacing 1826
cycling 1825
cycled 1824
cycle 1823
cv 1822
cutting 1821
cutlery 1820
cute 1819
cut 1818
customs 181

cent 1193
cemetery 1192
cellulose 1191
cellar 1190
cell 1189
celibate 1188
celibacy 1187
celebrations 1186
celebration 1185
celebrating 1184
celebrated 1183
celebrate 1182
celebral 1181
cease 1180
caved 1179
cave 1178
cautious 1177
causing 1176
caused 1175
cause 1174
cauliflower 1173
caught 1172
catty 1171
cattle 1170
catholicism 1169
catholic 1168
caterpillar 1167
catering 1166
catered 1165
categorically 1164
catching 1163
catched 1162
catch 1161
catastrophe 1160
catarrh 1159
cat 1158
casual 1157
castration 1156
cast 1155
cassmate 1154
cassette 1153
cash 1152
case 1151
carton 1150
carterer 1149
carrying 1148
carry 1147
carrot 1146
carried 1145
carriage 1144
carressing 1143
carpet 1142
carpenter 1141
carpark 1140
carnival 1139
carneval 1138
carnaval 1137
carinthia 1136
caring 1135
caretaker 1134
caressed 1133
carelessness 1132
carelessly 1131
careless 1130
carefully 1129
careful 1128
carefree 1127
career 1126
cared 1125
care 1124
cardriver 1123
cardiac 1122
card 1121
carcass 1120
car 1

In [16]:
clean_vocab = {key: value for key, value in vocab.items() if key and value > 102}

In [17]:
feature_encoder = TfidfVectorizer(vocabulary=list(set(clean_vocab)))

In [18]:
X = feature_encoder.fit_transform(df['clean_texts'])

In [19]:
label_encoder = LabelBinarizer()
y = label_encoder.fit_transform(df['emotions'])

In [20]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['emotions'])

In [21]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y, shuffle=True)

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from nltk.classify.naivebayes import NaiveBayesClassifier

In [23]:
clf = SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, max_iter=5, random_state=42)

In [24]:
# clf = OneVsRestClassifier(SGDClassifier(loss='hinge', penalty='l2',
#                                            alpha=1e-3, max_iter=5, random_state=42))
clf.fit(X_train, y_train)

SGDClassifier(alpha=0.001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
              n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
              random_state=42, shuffle=True, tol=0.001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [25]:
preds = clf.predict(X_test)

In [26]:
preds[1]

5

In [27]:
y_test[1]

5

In [28]:
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score



In [29]:
accuracy_score(preds, y_test)

0.5735767991407089

In [30]:
accuracy_score(np.argmax(preds, axis=1), np.argmax(y_test, axis=1))

AxisError: axis 1 is out of bounds for array of dimension 1

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
pipe = Pipeline([('encoder', feature_encoder), ('clf', clf)])

In [None]:
a = pipe.predict_proba(['i am very sad'])

In [None]:
label_encoder.inverse_transform(a)

In [None]:
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [None]:
show_most_informative_features(feature_encoder, clf)

In [None]:
label_encoder.classes_

In [None]:
from lime.lime_text import LimeTextExplainer
explainer = LimeTextExplainer(class_names=label_encoder.classes_)

In [None]:
text = 'i love this product but i hate this company'

In [None]:
exp = explainer.explain_instance(text, pipe.predict_proba, num_features=7, top_labels=7)
print(exp.available_labels())

In [None]:
exp.show_in_notebook(text=text)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(rc={'figure.figsize':(11.7,8.27)})

%matplotlib inline

In [None]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

In [123]:
def _tokenize(word):
    tokens = tokenizer.tokenize(word)
    lems = []
    for word in tokens:
 
        if word not in _stopwords:
            lems.append(stemmer.stem(word))
            
    return lems

In [32]:
from textblob.classifiers import NaiveBayesClassifier
from sklearn.model_selection import KFold
from nltk.classify.naivebayes import NaiveBayesClassifier

In [33]:

def get_words_in_dataset(dataset):
    all_words = []
    for (words, sentiment) in dataset:
        all_words.extend(words)
    return all_words

In [34]:
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

In [35]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [51]:
def create_data(sentence, emotion):
    data = []
    try:
        for i in range(len(sentence)):
            sen = []
            for s in sentence[i]:
                sen.append(str(s))
            emo = emotion[i]
            data.append((sen, emo))
    except:
        pass
    return data

In [97]:
cleaned_data= pd.read_csv(os.path.join(DATAPATH, 'cleaned.csv'))

In [98]:
cleaned_data.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4
0,0,joy,<Text: day feel close partner other friend fee...,day feel close partner other friend feel peac ...,"['day', 'feel', 'close', 'partner', 'other', '...",['joy']
1,1,fear,<Text: time imagin someon love contact seriou ...,time imagin someon love contact seriou ill eve...,"['time', 'imagin', 'someon', 'love', 'contact'...",['fear']
2,2,anger,<Text: had been obvious unjustli treat had pos...,had been obvious unjustli treat had possibl el...,"['had', 'been', 'obvious', 'unjustli', 'treat'...",['anger']
3,3,sadness,<Text: think short time live relat period life...,think short time live relat period life think ...,"['think', 'short', 'time', 'live', 'relat', 'p...",['sadness']
4,4,disgust,<Text: gather found involuntarili sit next peo...,gather found involuntarili sit next peopl expr...,"['gather', 'found', 'involuntarili', 'sit', 'n...",['disgust']


In [110]:
from sklearn.feature_extraction.text import CountVectorizer

encoder = CountVectorizer(tokenizer=_tokenize)
X = encoder.fit_transform(cleaned_data['2'].values)

In [112]:
y = label_encoder.fit_transform(cleaned_data['0'])

In [114]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)

In [115]:
clf = MultinomialNB()
clf.fit(Xtrain, ytrain)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [120]:
preds = clf.predict(Xtest)

In [122]:
accuracy_score(preds, ytest)

0.545502927088877