# Self-made Dataset (Emoji)

In [None]:
import numpy as np
from tensorflow import keras
# import keras
from keras.layers.wrappers import Bidirectional
from keras.layers import Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn import preprocessing
from keras.utils.np_utils import to_categorical
import keras
from keras.models import Sequential
from keras.layers import LSTM, SimpleRNN, Dense, Dropout, Activation, BatchNormalization, LayerNormalization
from keras.callbacks import LearningRateScheduler
import pandas as pd
from sklearn.model_selection import KFold
import torch
from sklearn.metrics import *
import seaborn as sb
import matplotlib.pyplot as plt

### Variables

In [None]:
# GloVe vector widths tried in the dimension sweep further down; each value
# selects the matching 'glove.6B.<width>d.txt' file.
DIM = [50,100,200,300]
# Dataset table; the preprocessing code reads its 'Tweet' and 'Emoji' columns.
data = pd.read_csv('drive/MyDrive/emoji/new_dataset.csv',sep=',')

### Data Preprocessing

In [None]:
def intialize_emb_matrix(file):
  """Parse GloVe-style text lines into a word -> vector lookup table.

  Each non-blank line is expected to hold a token followed by its
  whitespace-separated vector components.

  Args:
    file: Iterable of text lines (an open text file or a list of strings).

  Returns:
    dict mapping each token to a 1-D float64 numpy array. If a token occurs
    more than once, the last occurrence wins.

  Raises:
    ValueError: If a vector component cannot be parsed as a float.
  """
  embedding_matrix = {}
  for line in file:
    values = line.split()
    # Blank or whitespace-only lines (e.g. a stray newline at the end of the
    # file) carry no token; skip them instead of failing on values[0].
    if not values:
      continue
    word, *components = values
    embedding_matrix[word] = np.array(components, dtype='float64')

  return embedding_matrix

def get_emb_data(data, max_len, file, DIM):
  """Embed every sentence as a zero-padded (max_len, DIM) matrix of word vectors.

  Args:
    data: Sequence of sentence strings; each one is split on whitespace.
    max_len: Number of word slots per sentence. Words beyond this position
      are dropped.
    file: Iterable of GloVe-style text lines, handed to intialize_emb_matrix.
    DIM: Length of each word vector; must match the vectors found in `file`.

  Returns:
    numpy array of shape (len(data), max_len, DIM). Slots for words that are
    missing from the vocabulary, and slots past the end of a sentence, stay
    all-zero.
  """
  embedding_matrix = intialize_emb_matrix(file)
  embedding_data = np.zeros((len(data), max_len, DIM))

  for idx, sentence in enumerate(data):
    # Slicing to max_len keeps an over-long sentence from indexing past the
    # padded axis (which would raise IndexError).
    for pos, token in enumerate(sentence.split()[:max_len]):
      # Vocabulary lookup is case-insensitive: tokens are lower-cased first.
      vector = embedding_matrix.get(token.lower())
      if vector is not None:
        embedding_data[idx, pos] = vector

  return embedding_data
  
def w2vector(data, DIM):
  """Turn the tweet/emoji table into padded GloVe features and one-hot labels.

  Args:
    data: Table with a 'Tweet' column of text and an 'Emoji' column of
      labels (accessed by column name, then `.values`).
    DIM: GloVe vector width; selects the file 'glove.6B.<DIM>d.txt'.

  Returns:
    Tuple (MAX, X_temb, y_train):
      MAX: largest whitespace-token count of any tweet (0 if there are none).
      X_temb: embedded tweets, shape (number of tweets, MAX, DIM).
      y_train: labels after LabelEncoder followed by to_categorical.
  """
  X_train, y_train = data['Tweet'].values, data['Emoji'].values

  le = preprocessing.LabelEncoder()
  y_train = to_categorical(le.fit_transform(y_train))

  # Pad every tweet to the length of the longest one.
  MAX = max((len(tweet.split()) for tweet in X_train), default=0)

  # The file has to stay open while get_emb_data parses it; the context
  # manager then closes it, so repeated calls do not leak file handles.
  with open(f'drive/MyDrive/emoji/glove.6B.{DIM}d.txt', encoding='utf8') as file:
    X_temb = get_emb_data(X_train, MAX, file, DIM)

  return MAX, X_temb, y_train

### Model Construction

In [None]:
def lstm(MAX, DIM, num_classes=6):
  """Build and compile the two-layer LSTM classifier.

  Layer stack: LSTM(64, returning the full sequence) -> Dropout(0.3) ->
  LSTM(32) -> Dropout(0.2) -> LayerNormalization -> Dense(10, relu) ->
  Dense(num_classes, softmax).

  Args:
    MAX: Number of time steps per input sample (first input_shape entry).
    DIM: Number of features per time step (second input_shape entry).
    num_classes: Width of the softmax output layer. Defaults to 6, the
      value that used to be hard-coded.

  Returns:
    A Sequential model compiled with the 'adam' optimizer, categorical
    cross-entropy loss, and the 'acc' metric.
  """
  model = Sequential()
  # return_sequences=True so the second LSTM receives one vector per step.
  model.add(LSTM(units=64, input_shape=(MAX, DIM), return_sequences=True))
  model.add(Dropout(0.3))
  model.add(LSTM(units=32))
  model.add(Dropout(0.2))
  model.add(LayerNormalization())
  model.add(Dense(units=10, activation='relu'))
  model.add(Dense(units=num_classes, activation='softmax'))

  model.compile(optimizer='adam', loss=keras.losses.categorical_crossentropy, metrics=['acc'])

  return model

### Different GloVe files

In [None]:
# Train a fresh model for each GloVe width in DIM, holding out 20% of the
# samples for validation, so the widths can be compared from the fit logs.
for dim in DIM:
  print('dim :',dim)
  MAX, X_temb, y_train = w2vector(data, dim)
  model = lstm(MAX, dim)
  model.fit(
    X_temb,
    y_train,
    validation_split=0.2,
    batch_size=128,
    epochs=5,
    verbose=1,
  )


### Train

In [None]:
# Shuffled k-fold cross-validation with the 300-d embeddings. A fresh model is
# trained per fold; accuracy, per-class recall/precision/F1 and the confusion
# matrix are averaged over the folds.
N_SPLITS = 3
NUM_CLASSES = 6  # matches the 6-unit softmax layer built by lstm()
LABELS = list(range(NUM_CLASSES))

MAX, X, y = w2vector(data, 300)
kf = KFold(n_splits=N_SPLITS,shuffle=True)

# Accumulate in numpy arrays so that += and /= are elementwise by design.
acc = 0.0
rec = np.zeros(NUM_CLASSES)
prec = np.zeros(NUM_CLASSES)
f1 = np.zeros(NUM_CLASSES)
matrix = np.zeros((NUM_CLASSES, NUM_CLASSES))
for train, test in kf.split(X):
  x_train, y_train = X[train], y[train]
  x_test = X[test]
  # One-hot rows -> class indices, in a single vectorized call.
  y_test = np.argmax(y[test], axis=1)

  model = lstm(MAX, 300)
  model.fit(x_train, y_train, validation_split=0, batch_size=256, epochs=10, verbose=1)
  result = model.predict(x_test)
  y_pred = np.argmax(result, axis=1)

  acc += accuracy_score(y_test,y_pred)
  # labels=LABELS pins every result to NUM_CLASSES entries even when a class
  # is absent from a fold, so the accumulated shapes always line up.
  rec += recall_score(y_test,y_pred,labels=LABELS,average=None)
  prec += precision_score(y_test,y_pred,labels=LABELS,average=None)
  f1 += f1_score(y_test,y_pred,labels=LABELS,average=None)
  matrix += confusion_matrix(y_test,y_pred,labels=LABELS)

acc /= N_SPLITS
rec /= N_SPLITS
prec /= N_SPLITS
f1 /= N_SPLITS

# Average the confusion matrix and round to whole counts; the result is a
# nested list of ints (the heatmap below formats cells with fmt='d').
matrix = np.rint(matrix / N_SPLITS).astype(int).tolist()

### Result

In [None]:
# Presumably the emoji behind class indices 0-5, in LabelEncoder order -- verify:
# [😜,😍,😉,🔥,💜,💯]
print('accrucy score :',acc)

class_ids = list(range(6))

# Per-class scores: one row per metric, one column per class index.
form = pd.DataFrame(
  [rec,prec,f1],
  index=['recall score','precision score','f1 score'],
  columns=class_ids,
)
display(form)

# Fold-averaged confusion matrix; rows are true classes, columns predictions.
form = pd.DataFrame(matrix, index=class_ids, columns=class_ids)
sb.heatmap(form,annot=True,fmt='d',linewidth=5,cmap='YlGnBu')
plt.xlabel('predicted')
plt.ylabel('real')
plt.title('confusion matrix')
plt.show()