In [1]:
import re 
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer

def read_data(file, encoding=None):
    """Load labelled samples from a text file.

    Each non-blank line is parsed as ``<char><label>]<text>``: the first
    character (presumably an opening ``[``) is dropped, everything up to the
    first ``]`` is the label with runs of whitespace collapsed to single
    spaces, and the remainder of the line is the text.

    Args:
        file: Path of the file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list of ``[label, text]`` pairs in file order.

    Raises:
        ValueError: If a non-blank line contains no ``]``.
    """
    data = []
    with open(file, 'r', encoding=encoding) as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                # A blank line would otherwise be stored as a bogus ['', ''] sample.
                continue
            head, bracket, text = line.partition("]")
            if not bracket:
                # Without a closing bracket, slicing on find() == -1 would
                # silently yield a garbage label and the whole line as text.
                raise ValueError(
                    "{}:{}: expected '[label] text', got {!r}".format(file, line_no, line)
                )
            # head[0] is the opening bracket; split/join normalises inner spacing.
            label = ' '.join(head[1:].split())
            data.append([label, text.strip()])
    return data

# Path of the labelled dataset; read_data expects one "[label] text" sample per line.
file = 'Dataset.txt'
data = read_data(file)


***Function***

In [2]:
# Build word n-grams (bigrams when n == 2)
def ngram(token, n):
    """Return every run of ``n`` consecutive tokens, joined by single spaces.

    token: sequence of string tokens.
    n: number of tokens per n-gram.

    The result is a list in left-to-right order. For n >= 1 it is empty when
    ``token`` holds fewer than ``n`` items.
    """
    window_count = len(token) - n + 1
    return [' '.join(token[start:start + n]) for start in range(window_count)]

# Extract count features from a text
def create_feature(text, nrange=(1, 1)):
    """Count word n-grams and leftover symbol tokens in ``text``.

    text: input string; it is lower-cased before tokenising.
    nrange: (smallest, largest) n-gram sizes to include, both inclusive.

    Returns a Counter mapping each feature string to its number of occurrences.
    Word n-grams come first, followed by the symbol tokens.
    """
    lowered = text.lower()
    # Word tokens: anything other than a-z, 0-9 and '#' acts as a separator.
    words = re.sub('[^a-z0-9#]', ' ', lowered).split()
    # Symbol tokens: whatever is left once letters and digits are blanked out.
    symbols = re.sub('[a-z0-9]', ' ', lowered).split()

    collected = []
    for size in range(nrange[0], nrange[1] + 1):
        collected.extend(ngram(words, size))
    collected.extend(ngram(symbols, 1))
    return Counter(collected)

def convert_label(item, name):
    """Translate a whitespace-separated indicator string into label names.

    item: string of numbers separated by whitespace, e.g. "0. 1. 0.".
    name: sequence of label names, indexed in parallel with the numbers.

    Returns the names whose position holds the value 1, separated by single
    spaces, or an empty string when no position equals 1.
    """
    flags = [float(part) for part in item.split()]  # str -> float
    chosen = [name[pos] for pos, flag in enumerate(flags) if flag == 1]
    return " ".join(chosen).strip()

In [13]:
# Smoke test of create_feature: unigrams through trigrams of one sample sentence
text = "I love programming and love dota2."
features = create_feature(text, nrange=(1, 3))

# Show the resulting feature -> count mapping
print(features)

Counter({'love': 2, 'i': 1, 'programming': 1, 'and': 1, 'dota2': 1, 'i love': 1, 'love programming': 1, 'programming and': 1, 'and love': 1, 'love dota2': 1, 'i love programming': 1, 'love programming and': 1, 'programming and love': 1, 'and love dota2': 1, '.': 1})


***Encode text***
The encoded texts and labels built here are used as the training data.

In [14]:
# Emotion names, passed to convert_label to decode each label vector by position
emotions = ["joy", 'fear', "anger", "sadness", "disgust", "shame", "guilt"]

# Targets: the emotion name(s) decoded from each sample's label string
y_all = [convert_label(label, emotions) for label, _ in data]
# Inputs: 1- to 4-gram count features of each sample's text
X_all = [create_feature(text, nrange=(1, 4)) for _, text in data]

***Split Data***

In [15]:

# Hold out 20% of the samples for testing; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size = 0.2, random_state = 123)

# Compute accuracy
def train_test(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf`` on the training split and score it on both splits.

    clf: estimator exposing ``fit`` and ``predict``; it is fitted in place.
    X_train, y_train: training features and labels.
    X_test, y_test: held-out features and labels.

    Returns a ``(train_accuracy, test_accuracy)`` tuple from accuracy_score.
    """
    clf.fit(X_train, y_train)  # train
    scores = []
    # Score the training split first, then the test split.
    for inputs, expected in ((X_train, y_train), (X_test, y_test)):
        scores.append(accuracy_score(expected, clf.predict(inputs)))
    return tuple(scores)

# Turns each feature dict (here a Counter) into a row of a sparse matrix
vectorizer = DictVectorizer(sparse = True)
# Fit the vectorizer on the training split only, then encode both splits with it
X_train = vectorizer.fit_transform(X_train) 
X_test = vectorizer.transform(X_test)

In [16]:
# Candidate classifiers, both seeded with random_state=123
lsvc = LinearSVC(random_state=123)
rforest = RandomForestClassifier(random_state=123)

clifs = [lsvc, rforest]

# Train and evaluate each classifier, printing one table row per model
print("| {:25} | {} | {} |".format("Classifier", "Training Accuracy", "Test Accuracy"))
print("| {} | {} | {} |".format("-"*25, "-"*17, "-"*13))
# NOTE: the loop variable `clf` stays bound to the last classifier in `clifs` after the loop ends
for clf in clifs: 
    clf_name = clf.__class__.__name__
    train_acc, test_acc = train_test(clf, X_train, X_test, y_train, y_test)
    print("| {:25} | {:17.7f} | {:13.7f} |".format(clf_name, train_acc, test_acc))

| Classifier                | Training Accuracy | Test Accuracy |
| ------------------------- | ----------------- | ------------- |




| LinearSVC                 |         0.9988302 |     0.5768717 |
| RandomForestClassifier    |         0.9988302 |     0.5541444 |


In [19]:
# Count how many samples carry each raw label string
label_freq = Counter(label for label, _ in data)

# Print the labels and their counts, most frequent first.
# most_common() keeps first-seen order for ties, as a stable descending sort does.
for raw_label, count in label_freq.most_common():
    print("{:10}({})  {}".format(convert_label(raw_label, emotions), raw_label, count))

joy       (1. 0. 0. 0. 0. 0. 0.)  1084
anger     (0. 0. 1. 0. 0. 0. 0.)  1080
sadness   (0. 0. 0. 1. 0. 0. 0.)  1079
fear      (0. 1. 0. 0. 0. 0. 0.)  1078
disgust   (0. 0. 0. 0. 1. 0. 0.)  1057
guilt     (0. 0. 0. 0. 0. 0. 1.)  1057
shame     (0. 0. 0. 0. 0. 1. 0.)  1045


In [23]:
# Emoji shown for each predicted emotion label
emoji_dict = {"joy":"😂", "fear":"😱", "anger":"😠", "sadness":"😢", "disgust":"😒", "shame":"😳", "guilt":"😞"}
t1 = "This looks so impressive"
t2 = "I have a fear of dogs"
t3 = "My dog died yesterday"
t4 = "I don't love you anymore..!"
t5 = "I am so tried"
t6 = "I should not do it to him"

texts = [t1, t2, t3, t4 ,t5 ,t6]

# Name the model explicitly instead of relying on the `clf` loop variable left
# over from the training cell. `rforest` is the last entry of `clifs`, i.e. the
# model that leftover variable pointed to, so predictions are unchanged.
emoji_clf = rforest

for text in texts:
    # Same n-gram range as the training features
    counts = create_feature(text, nrange=(1, 4))
    # DictVectorizer.transform accepts a single mapping and returns a one-row matrix
    vector = vectorizer.transform(counts)
    prediction = emoji_clf.predict(vector)[0]
    print(text, emoji_dict[prediction])

This looks so impressive 😢
I have a fear of dogs 😱
My dog died yesterday 😢
I don't love you anymore..! 😂
I am so tried 😢
I should not do it to him 😞
