In [1]:
from libraries import (re,Counter,train_test_split,accuracy_score,SVC,LinearSVC,RandomForestClassifier,DecisionTreeClassifier,DictVectorizer)
from read_data import(read_data)
from token_features import(ngram,create_feature)
from convert_labels import(convert_label)
from models import(svc,lsvc,rforest,dtree)

In [2]:
# Load the labelled corpus. `data` must support len() here and is iterated
# as (label, text) pairs by the cells that follow.
file = 'data.txt'
data = read_data(file)

instance_total = len(data)
print("Number of instances: {}".format(instance_total))

Number of instances: 7480


In [3]:
# Emotion names passed to convert_label together with each raw label.
emotions = ["joy", 'fear', "anger", "sadness", "disgust", "shame", "guilt"]

# One pass over the corpus: for every (label, text) pair, convert the label
# first and then build the n-gram features (nrange=(1, 4)) for the text.
encoded = [
    (convert_label(label, emotions), create_feature(text, nrange=(1, 4)))
    for label, text in data
]

# Split the pairs back into parallel target / feature lists.
y_all = [target for target, _ in encoded]
x_all = [feats for _, feats in encoded]

In [4]:
# Split features and targets into train / test parts; test_size=0.2 with a
# fixed random_state so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x_all,
    y_all,
    test_size=0.2,
    random_state=123,
)

In [5]:
# Turn the per-text feature dicts into sparse matrices. The vectorizer is
# fitted on the training split only and then reused, unchanged, for the test
# split and for the ad-hoc predictions at the end of the notebook.
# NOTE(review): x_train / x_test are overwritten in place, so this cell is
# not safe to re-run without re-running the split cell first.
vectorizer = DictVectorizer(sparse=True)

x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [6]:
# Fit the imported `svc` model on the vectorised training split, predict the
# test split, and report the accuracy against y_test.
svc.fit(x_train, y_train)

y_predict_svc = svc.predict(x_test)
accuracy_svc = accuracy_score(y_test, y_predict_svc)

print(accuracy_svc)

0.45120320855614976


In [7]:
# Same fit / predict / score sequence for the imported `lsvc` model.
lsvc.fit(x_train, y_train)

y_predict_lsvc = lsvc.predict(x_test)
accuracy_lsvc = accuracy_score(y_test, y_predict_lsvc)

print(accuracy_lsvc)



0.5768716577540107




In [8]:
# Same fit / predict / score sequence for the imported `rforest` model.
rforest.fit(x_train, y_train)

y_predict_rforest = rforest.predict(x_test)
accuracy_rforest = accuracy_score(y_test, y_predict_rforest)

print(accuracy_rforest)

0.554144385026738


In [9]:
# Same fit / predict / score sequence for the imported `dtree` model.
dtree.fit(x_train, y_train)

y_predict_dtree = dtree.predict(x_test)
accuracy_dtree = accuracy_score(y_test, y_predict_dtree)

print(accuracy_dtree)

0.46189839572192515


In [10]:
# Count how many instances carry each raw label. The previous version also
# built and sorted a throw-away list named `l` that was never read and was
# then shadowed by the loop variable; that dead code is gone. Using a
# generator here also avoids rebinding `_` at notebook level.
label_freq = Counter(raw_label for raw_label, _ in data)

# Print the labels and their counts, most frequent first. most_common()
# keeps first-seen order among equal counts, the same order a stable
# sorted(..., reverse=True) over the tally produces.
for raw_label, count in label_freq.most_common():
    print("{:10}({})  {}".format(convert_label(raw_label, emotions), raw_label, count))

joy       (1. 0. 0. 0. 0. 0. 0.)  1084
anger     (0. 0. 1. 0. 0. 0. 0.)  1080
sadness   (0. 0. 0. 1. 0. 0. 0.)  1079
fear      (0. 1. 0. 0. 0. 0. 0.)  1078
disgust   (0. 0. 0. 0. 1. 0. 0.)  1057
guilt     (0. 0. 0. 0. 0. 0. 1.)  1057
shame     (0. 0. 0. 0. 0. 1. 0.)  1045


In [12]:
# Emoji printed next to each predicted emotion. The literals had been
# corrupted into mojibake (UTF-8 bytes decoded as a single-byte code page),
# so they are spelled as \U escapes, which stay intact however the file is
# re-encoded.
emoji_dict = {
    "joy": "\U0001F602",      # face with tears of joy
    "fear": "\U0001F631",     # face screaming in fear
    "anger": "\U0001F620",    # angry face
    "sadness": "\U0001F622",  # crying face
    "disgust": "\U0001F612",  # unamused face
    "shame": "\U0001F633",    # flushed face
    "guilt": "\U0001F633",    # flushed face (same emoji as "shame")
}
t1 = "This looks so impressive"
t2 = "I have a fear of dogs"
t3 = "My dog died yesterday"
t4 = "I don't love you anymore..!"

texts = [t1, t2, t3, t4]
for text in texts:
    # Build features with the same n-gram range used for the training data.
    text_features = create_feature(text, nrange=(1, 4))
    # Reuse the vectorizer fitted on the training split; a single feature
    # mapping yields a one-row matrix, hence the [0] on the prediction.
    text_vector = vectorizer.transform(text_features)
    prediction = svc.predict(text_vector)[0]
    print(text, emoji_dict[prediction])

This looks so impressive 😒
I have a fear of dogs 😱
My dog died yesterday 😢
I don't love you anymore..! 😳
