In [6]:
import pandas as pd
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import class_likelihood_ratios
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import BernoulliNB

In [10]:
# read emoticon dataset
train_emoticon_df = pd.read_csv("datasets/train/train_emoticon.csv")
train_emoticon_X = train_emoticon_df['input_emoticon'].tolist()
train_emoticon_Y = train_emoticon_df['label'].tolist()

# read emoticon validation-set
test_emoticon_df = pd.read_csv("datasets/valid/valid_emoticon.csv")
test_emoticon_X = test_emoticon_df['input_emoticon'].tolist()
test_emoticon_Y = test_emoticon_df['label'].tolist()

training_data_count = len(train_emoticon_X)
validation_data_count = len(test_emoticon_X)

In [12]:
# create dataset in list format (list of [emozi_string, label])
DATASET = []
for i in range(training_data_count):
    DATASET.append([train_emoticon_X[i], train_emoticon_Y[i]])

VALIDATION_SET = []
for i in range(validation_data_count):
    VALIDATION_SET.append([test_emoticon_X[i], test_emoticon_Y[i]])

In [14]:
# return numpy array converting each emozi string to an integer array
def to_unicode_decimal_array(emozi_string):
    arr = [int(emozi.encode("unicode_escape").hex(),16) for emozi in emozi_string]
    return arr

In [16]:
 # size in fraction of complete dataset, dataset randomized if 'randomized == True'
def pick_dataset(size, randomized = True, to_int = False):
    data_copy = deepcopy(DATASET)
    if(randomized == True):
        np.random.shuffle(data_copy)
    req_split = data_copy[:int(size*training_data_count)]
    if(to_int == True):
        mod_split = [[to_unicode_decimal_array(data_point[0]),data_point[1]] for data_point in req_split]
    else:
        mod_split = [[data_point[0],data_point[1]] for data_point in req_split]
    return mod_split

<h3>Approach 1: convert catagorical data to floating point numbers and train a model</h3>

In [18]:
working_set = pick_dataset(size=1,randomized=True, to_int=True)
test_set = [[to_unicode_decimal_array(data_point[0]),data_point[1]] for data_point in VALIDATION_SET]

In [20]:
# creating training set features and label array in numpy
X = []
for data in working_set:
    X.append(data[0])
X = np.array(X)

Y = []
for data in working_set:
    Y.append(data[1])
Y = np.array(Y)

# creating validation set features and label array in numpy
X_test = []
for data in test_set:
    X_test.append(data[0])
X_test = np.array(X_test)

Y_test = []
for data in test_set:
    Y_test.append(data[1])
Y_test = np.array(Y_test)

In [22]:
# normalize each feature to a value b/w 0 and !
X_normed = (X-X.min(axis=0))/(X.max(axis=0)-X.min(axis=0));
X_test_normed = (X_test-X_test.min(axis=0))/(X_test.max(axis=0)-X_test.min(axis=0))

In [10]:
# RBF_classifier = svm.SVC(kernel='rbf', gamma='auto', C=1) # SVM with RBF kernel
# RBF_classifier.fit(X_normed, Y)

In [11]:
# TP = 0
# FP = 0
# TN = 0
# FN = 0
# for i in range(len(X_normed)):
#     prediction = RBF_classifier.predict([X_normed[i]])
#     if((prediction == 0) and (Y[i] == 0)):
#         TN+=1
#     if((prediction == 0) and (Y[i] == 1)):
#         FN+=1
#     if((prediction == 1) and (Y[i] == 0)):
#         FP+=1
#     if((prediction == 1) and (Y[i] == 1)):
#         TP+=1
# train_accuracy = (TP+TN)/(TP+TN+FP+FN)
# print('training accuracy: ',train_accuracy)

# TP = 0
# FP = 0
# TN = 0
# FN = 0
# for i in range(len(X_test_normed)):
#     prediction = RBF_classifier.predict([X_test_normed[i]])
#     if((prediction == 0) and (Y[i] == 0)):
#         TN+=1
#     if((prediction == 0) and (Y[i] == 1)):
#         FN+=1
#     if((prediction == 1) and (Y[i] == 0)):
#         FP+=1
#     if((prediction == 1) and (Y[i] == 1)):
#         TP+=1
# validation_accuracy = (TP+TN)/(TP+TN+FP+FN)
# print('validation accuracy: ',validation_accuracy)

In [12]:
## models that work with floating point features not working well with catagorical data

<h3>Approach 2: use Bag of words approach</h3>

In [24]:
## imagine each emozi is a word having some meaning

emozi_set = set()
working_set = pick_dataset(1, randomized=True, to_int=False)
for data in working_set:
    features = data[0]
    for emozi in features:
        emozi_set.add(emozi)

In [26]:
## convert each datapoint from training set into a sparse vector denoting presence of each emozi

converted_dataset = []
for datapoint in working_set:
    converted_featuereset = np.zeros(len(emozi_set))
    feature_list = deepcopy(datapoint[0])
    pos = 0
    for emozi in emozi_set:
        if emozi in feature_list:
            converted_featuereset[pos] = 1
        pos+=1
    converted_dataset.append([converted_featuereset,datapoint[1]])

## convert each datapoint from validation set into a sparse vector

converted_validation_set = []
for datapoint in VALIDATION_SET:
    converted_featuereset = np.zeros(len(emozi_set))
    feature_list = deepcopy(datapoint[0])
    pos = 0
    for emozi in emozi_set:
        if emozi in feature_list:
            converted_featuereset[pos] = 1
        pos+=1
    converted_validation_set.append([converted_featuereset,datapoint[1]])

In [28]:
## preparing training data
X = []
y = []
for data in converted_dataset:
    X.append(data[0])
    y.append(data[1])
X = np.array(X)
y = np.array(y)

## preparing validation data
X_v = []
y_v = []
for data in converted_validation_set:
    X_v.append(data[0])
    y_v.append(data[1])
X_v = np.array(X_v)
y_v = np.array(y_v)

In [16]:
# to use:
# 1.normal SVM with RBF
# 2.Decision Tree
# 3.Random Forest
# 4.Do all of these with PCA/SVD

In [30]:
NB_classifier = BernoulliNB()
NB_classifier.fit(X,y)
NB_classifier.score(X_v,y_v)

0.45194274028629855

In [32]:
# dimentionality reduction using SVD
# feature lenght of 130 out of total 214 captures 95.3% of the energy of original data
x_compressed = csr_matrix(X)
x_test_compressed = csr_matrix(X_v)
svd = TruncatedSVD(n_components=130)
svd.fit(x_compressed)
X_reduced = svd.transform(x_compressed)
X_test_reduced = svd.transform(x_test_compressed)

In [34]:
print(X_reduced.shape)
print(X_test_reduced.shape)

(7080, 130)
(489, 130)


In [36]:
# # decision tree on reduced data
# k_val = []
# t_acc = []
# v_acc = []
# for depth in np.arange(1,11,1):
#     k_val.append(depth)
#     DT_classifier = DecisionTreeClassifier(max_depth=depth)
#     DT_classifier.fit(X_reduced,y)
#     t_acc.append(DT_classifier.score(X_reduced,y))
#     v_acc.append(DT_classifier.score(X_test_reduced,y_v))

# plt.plot(k_val, t_acc)
# plt.plot(k_val, v_acc)
# plt.show

<h3>Approach 3: Creating encoding using deep learning</h3>

In [49]:
print(X)
print(y)
print(X_v)
print(y_v)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]]
[0 1 1 ... 1 1 0]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[1 1 0 0 1 1 1 0 0 1 0 1 0 1 0 0 1 1 1 0 0 1 0 0 0 0 1 1 0 1 0 1 0 0 0 1 0
 1 1 1 0 1 0 1 1 1 0 1 1 1 0 1 1 1 1 0 0 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 1 1
 1 0 0 0 0 1 1 1 0 1 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 0 1 0 0 0 1 0 1 1 1 1 1
 1 1 1 1 1 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 1 0 0 0
 0 1 1 1 0 1 0 1 0 1 0 1 1 1 0 1 1 1 0 1 1 0 1 0 0 0 1 1 1 0 1 1 1 1 1 0 0
 0 1 0 1 0 1 1 1 1 1 1 0 1 0 0 1 1 0 0 1 1 1 0 0 1 0 0 1 1 1 1 0 0 1 0 1 0
 0 1 1 1 1 1 0 1 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0 1 1 0
 0 0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 1
 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 1 1

In [51]:
for i in range(25):
    print([train_emoticon_X[i],train_emoticon_Y[i]])

['😛🛐😻😑😣🙠🙯🚼😒🙼😑🙯😣', 0]
['🛐😑😪😛🚼🙯😣🚅😑🙯😹😣🙼', 0]
['😛🙯😑🚡😣🚼🛐🙲😣🙯🛑😑🙼', 0]
['😛🚼🛐🙐😣🙯😑🙪😑🙼🛆😣🙯', 1]
['🛐🚟🚼😛🙋😑😣🙯😹🙯😑😣🙼', 1]
['😑😣🚧😛🚜🚼🙯🛐🙼😣😑🙕🙯', 1]
['😣😑🙯🚼🛐🚥😬😛😣🚄😑🙼🙯', 0]
['🚡🚼😑🛐🚔🙯😛😣😑🙯🛓🙼😣', 0]
['🛐😛🛜😑🚼😚😣🙯😣😑🙯🚠🙼', 0]
['🙯😑🙷🛐🚼😣😛😍😿🙯🙼😑😣', 1]
['😣🙯🛐😑😛🚼🙚😍🙯🙼😣😑😸', 0]
['😛😯🚼🙯😑🛐😻😣🙼🙯😹😑😣', 1]
['😑🙯😛🛐🚼🙒🚙😣🙯😣😑🙼😬', 1]
['😿😣🚼🚴😛😑🙯🛐😣🙯😑😴🙼', 1]
['😛🛐🚼🚂🙯😦😣😑🙯😑😣🙼🙨', 1]
['🛐🚼🙯🛝😑😉😛😣🙒😣🙼😑🙯', 0]
['😣🙯😵😑😛🚃🚼🛐😣🙯🚜🙼😑', 0]
['😛😑🙯🚼🛆🛐😘😣😣😊🙼😑🙯', 0]
['🚟😣😑🛐😌🚼😛🙯🙯😣😑😿🙼', 1]
['😛🚼😑😣🙧🛐🙯😉🚍😣😑🙯🙼', 0]
['🛓🛐😣🚼🙯🚥😑😛😢😣🙯😑🙼', 1]
['😣😑🚵😛🚼🛐🙯😊🙼😣🙯😑😩', 0]
['🛐🙯😛🚼😣😬😠😑🙯🙼😣😑🛡', 0]
['🛐🙯🚜😣🚼😑😰😛😑🙯🚏🙼😣', 1]
['🚼😛🙯😣🚗🛐🙸😑😣🙼🙯🚌😑', 1]


In [53]:
print(type(train_emoticon_X[1]))

<class 'str'>


In [55]:
int(train_emoticon_X[0][9].encode("unicode_escape").hex(),16)

436029161998574100559715

In [57]:
arr = to_unicode_decimal_array(train_emoticon_X[0])
print(arr)

[436029161998574100558178, 436029161998574100571184, 436029161998574100558690, 436029161998574100558129, 436029161998574100558387, 436029161998574100559408, 436029161998574100559462, 436029161998574100570723, 436029161998574100558130, 436029161998574100559715, 436029161998574100558129, 436029161998574100559462, 436029161998574100558387]


In [26]:
print(len(emozi_set))
print(emozi_set)

214
{'🚣', '🚄', '🙱', '😸', '🙄', '\U0001f6d8', '🙔', '🚆', '🚓', '🚲', '🙯', '🚭', '🚼', '🙊', '😭', '🛆', '🚯', '🙠', '🚃', '🙏', '🙟', '🙅', '🚀', '😒', '😎', '😊', '🙮', '🚾', '🚶', '🙚', '\U0001f6dc', '🚎', '😲', '🛁', '🙃', '\U0001f6d9', '🚁', '😣', '😦', '🛏', '🙶', '🚥', '🛅', '🚌', '🚜', '🙩', '🙈', '🛓', '🛎', '😧', '🛍', '🙉', '🚵', '😨', '🛈', '🚕', '🛀', '🙢', '🚡', '😄', '🛕', '🚏', '😴', '😱', '🙗', '😵', '🙸', '\U0001f6de', '🙾', '🛋', '🙣', '🚟', '🚽', '😽', '🙐', '🛒', '😇', '😅', '😓', '🙫', '😘', '🚹', '😗', '🛇', '😶', '🚉', '🚞', '🚝', '🙳', '🚮', '🛑', '😬', '😛', '🚇', '🙌', '🙑', '🙧', '🙜', '🙬', '🚘', '🚖', '😢', '😥', '😏', '🙍', '🚱', '🚅', '🙇', '🙛', '🙦', '🚷', '😫', '🛂', '🚍', '🚠', '😤', '😿', '🚨', '🙹', '🙼', '🚚', '😀', '🚩', '🙨', '🙕', '🙋', '🙰', '😐', '😺', '😻', '🙻', '🚿', '🙀', '🛡', '🚛', '🚸', '😉', '🛉', '😆', '🚔', '😜', '😯', '😠', '🚈', '🛌', '😩', '😪', '🚂', '😟', '🚑', '😾', '🚙', '🛔', '🙲', '😑', '😮', '🚋', '😃', '🙆', '😂', '🛗', '🙎', '🛊', '🙘', '\U0001f6dd', '🙙', '🙴', '\U0001f6db', '😞', '😔', '😰', '😙', '🙁', '🛄', '🚐', '🙞', '😡', '🙷', '😌', '\U0001f6df', '🛐', '🚗', '🙥', '🚧', '🚢', '🚳', '🛖

In [27]:
np.zeros(len(emozi_set))

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [28]:
print(converted_dataset[1][0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]


In [29]:
print(X_v)
print(len(X))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
7080


In [30]:
len(converted_validation_set)

489