# Text emotion detection

In [1]:
# Importing libraries
import re
from collections import Counter

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Loading dataset
def read_data(file, encoding="utf-8"):
    """Load labelled sentences from a text file, one instance per line.

    Every non-blank line is expected to look like ``[<label>] <text>``: the
    label sits between the leading ``[`` and the first ``]``, and everything
    after that bracket is the text. Runs of whitespace inside the label are
    collapsed to single spaces; the text is stripped of surrounding
    whitespace.

    Parameters
    ----------
    file : str
        Path of the file to read.
    encoding : str, optional
        Encoding used to decode the file. Passed explicitly so the result
        does not depend on the platform's default encoding.

    Returns
    -------
    list of [str, str]
        ``[label, text]`` pairs in file order. Blank lines are skipped.

    Raises
    ------
    ValueError
        If a non-blank line does not start with ``[`` or contains no ``]``.
    """
    data = []
    with open(file, "r", encoding=encoding) as f:
        for line_number, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # A blank (or whitespace-only) line holds no instance.
                continue
            close = line.find("]")
            if not line.startswith("[") or close == -1:
                # Without this check find() == -1 would silently slice the
                # wrong parts of the line into label and text.
                raise ValueError(
                    "{}:{}: expected '[label] text', got {!r}".format(
                        file, line_number, line
                    )
                )
            label = " ".join(line[1:close].split())
            text = line[close + 1:].strip()
            data.append([label, text])
    return data
# Path is relative to the notebook's working directory.
file = "text.txt"
data = read_data(file)  # list of [label, text] pairs
print(f"Number of instances: {len(data)}")

Number of instances: 7480


In [3]:
# Functions for tokenization and generating features
def ngram(token, n):
    """Return every contiguous run of ``n`` tokens, each joined by a space.

    Parameters
    ----------
    token : list of str
        Tokens in their original order.
    n : int
        Number of tokens per n-gram.

    Returns
    -------
    list of str
        The n-grams in left-to-right order; empty when ``token`` holds
        fewer than ``n`` items.
    """
    # One window per valid start position: 0 .. len(token) - n.
    window_starts = range(len(token) - n + 1)
    return [" ".join(token[start:start + n]) for start in window_starts]

def create_feature(text, nrange=(1, 1)):
    """Turn raw text into counts of word n-grams and punctuation tokens.

    The text is lower-cased first. Word tokens are the whitespace-separated
    runs left after every character other than ``a-z``, ``0-9`` and ``#`` is
    replaced by a space. Symbol tokens are the runs left after ``a-z`` and
    ``0-9`` are replaced by a space.

    Parameters
    ----------
    text : str
        Input sentence.
    nrange : tuple of (int, int), optional
        Smallest and largest word n-gram size, both inclusive.

    Returns
    -------
    collections.Counter
        Occurrence count of every word n-gram in ``nrange`` plus every
        symbol token (symbols are only counted as unigrams).
    """
    lowered = text.lower()
    words = re.sub("[^a-z0-9#]", " ", lowered).split()
    symbols = re.sub("[a-z0-9]", " ", lowered).split()

    counts = Counter()
    smallest, largest = nrange[0], nrange[1]
    for size in range(smallest, largest + 1):
        counts.update(ngram(words, size))
    counts.update(ngram(symbols, 1))
    return counts

In [4]:
# Convert one-hot label strings to emotion names and build the label/feature lists
def convert_label(item, name):
    """Translate a whitespace-separated indicator string into names.

    Parameters
    ----------
    item : str
        Numbers separated by whitespace, e.g. ``"0. 1. 0."``.
    name : sequence of str
        Name for each position of ``item``.

    Returns
    -------
    str
        Names whose position holds the value 1, joined by single spaces
        (an empty string when no position equals 1).
    """
    # Parse every field up front so a malformed number fails before lookup.
    flags = [float(field) for field in item.split()]
    active = [name[pos] for pos, flag in enumerate(flags) if flag == 1]
    return " ".join(active).strip()

# Emotion name for each position of the one-hot label vector.
emotions = ["joy", "fear", "anger", "sadness", "disgust", "shame", "guilt"]

# y: emotion name per instance; X: n-gram counts (unigrams up to 4-grams).
y = [convert_label(label, emotions) for label, _ in data]
X = [create_feature(text, nrange=(1, 4)) for _, text in data]

In [5]:
# Splitting training and testing data sets
# 80 % of the instances go to training; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

def train_test(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and score it on both splits.

    Parameters
    ----------
    clf : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, X_test
        Feature matrices for the training and test splits.
    y_train, y_test
        True labels for the training and test splits.

    Returns
    -------
    tuple
        ``(train_acc, test_acc)`` as returned by ``accuracy_score``.
    """
    clf.fit(X_train, y_train)
    # Score the training split first, then the held-out split.
    splits = ((y_train, X_train), (y_test, X_test))
    train_acc, test_acc = [
        accuracy_score(truth, clf.predict(features)) for truth, features in splits
    ]
    return train_acc, test_acc

# Turn the per-instance n-gram count dictionaries into a sparse matrix.
# The vocabulary is learned from the training split only; the test split is
# then mapped onto that same vocabulary.
# (DictVectorizer is imported with the other libraries in the first cell.)
vectorizer = DictVectorizer(sparse=True)
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [6]:
# Training and testing ML models
svc = SVC()
lsvc = LinearSVC(random_state=42)
rforest = RandomForestClassifier(random_state=42)
# Seeded like the other stochastic models so a fresh run reproduces the table.
dtree = DecisionTreeClassifier(random_state=42)

clifs = [svc, lsvc, rforest, dtree]

print("| {:25} | {} | {} |".format("Classifier", "Training Accuracy", "Test Accuracy"))
print("| {} | {} | {} |".format("_"*25, "_"*17, "_"*13))

# Fit each model in turn and report its accuracy on both splits.
# After the loop, `clf` remains bound to the last entry of `clifs`.
for clf in clifs:
    clf_name = clf.__class__.__name__
    train_acc, test_acc = train_test(clf, X_train, X_test, y_train, y_test)
    print("| {:25} | {:17.10f} | {:13.10f} |".format(clf_name, train_acc, test_acc))

| Classifier                | Training Accuracy | Test Accuracy |
| _________________________ | _________________ | _____________ |
| SVC                       |      0.9072526738 |  0.4545454545 |




| LinearSVC                 |      0.9989973262 |  0.5768716578 |
| RandomForestClassifier    |      0.9989973262 |  0.5427807487 |
| DecisionTreeClassifier    |      0.9989973262 |  0.4772727273 |


In [7]:
# Label distribution
# Count how many instances carry each raw label string.
label_freq = Counter(label for label, _ in data)

# Most frequent first; labels with equal counts keep first-seen order.
for raw_label, count in label_freq.most_common():
    print("{:10}({})  {}".format(convert_label(raw_label, emotions), raw_label, count))

joy       (1. 0. 0. 0. 0. 0. 0.)  1084
anger     (0. 0. 1. 0. 0. 0. 0.)  1080
sadness   (0. 0. 0. 1. 0. 0. 0.)  1079
fear      (0. 1. 0. 0. 0. 0. 0.)  1078
disgust   (0. 0. 0. 0. 1. 0. 0.)  1057
guilt     (0. 0. 0. 0. 0. 0. 1.)  1057
shame     (0. 0. 0. 0. 0. 1. 0.)  1045


In [8]:
# Detecting emotion
emotion_emoji_dict = {"joy":"😊", "fear":"😨", "anger":"😠", "sadness":"😢", "disgust":"😒", "shame":"😳", "guilt":"😔"}
txt = "I am in a very good mood"
texts = [txt]

# Name the model explicitly instead of relying on whatever the comparison
# loop last bound to `clf`. LinearSVC had the highest test accuracy in the
# recorded comparison table.
model = lsvc

for text in texts:
    # Same n-gram range as used to build the training features.
    feature_counts = create_feature(text, nrange=(1, 4))
    feature_matrix = vectorizer.transform([feature_counts])
    prediction = model.predict(feature_matrix)[0]
    print(text, emotion_emoji_dict[prediction])

I am in a very good mood 😊
