In [0]:
import numpy as np
import csv
import keras
import sklearn
import gensim
import random
import scipy
import pandas as pd
import emoji
import re
from keras.preprocessing import text
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers.core import Dense , Dropout , Activation
from keras.layers import Embedding , LSTM, BatchNormalization, SpatialDropout1D
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report 
from gensim.models.word2vec import Word2Vec
from gensim.models.doc2vec import Doc2Vec , TaggedDocument
from keras.utils.np_utils import to_categorical
from sklearn.utils import class_weight

# size of the word embeddings
embeddings_dim = 100

# maximum number of words to consider in the representations
max_features = 15000

# maximum length of a sentence
max_sent_len = 20

# percentage of the data used for model training
percent = 0.80

# number of classes
num_classes = 4

path_to_glove_embed = 'glove.6B.100d.txt'

embeddings = dict( )

data = [ ( row["single"] , row["label"]  ) for row in csv.DictReader(open("train.txt",encoding='utf8'), delimiter='\t', quoting=csv.QUOTE_NONE) ]
random.shuffle( data )
train_size = int(len(data) * percent)

train_texts = []
test_texts = []
emo_train_texts = [ txt.lower() for ( txt, label ) in data[0:train_size] ]
train_labels = [ label for ( txt , label ) in data[0:train_size] ]

emo_test_texts = [ txt.lower() for ( txt, label ) in data[train_size:-1] ]
test_labels = [ label for ( txt , label ) in data[train_size:-1] ]

for i in emo_train_texts:
    train_texts.append(re.sub(r'[-()_,.:@#?!&$]', ' ', emoji.demojize(i)))
for i in emo_test_texts:
    st = ''.join((x for x in emoji.demojize(i) if x not in string.punctuation))
    test_texts.append(re.sub(r'[-()_,.:@#?!&$]', ' ', emoji.demojize(i)))
    
# print(train_texts)
for i in range(0,len(train_labels)):
    if train_labels[i] == "happy":
        train_labels[i]=0
    elif train_labels[i] == "sad":
        train_labels[i]=1
    elif train_labels[i] == "angry":
        train_labels[i]=2
    elif train_labels[i] == "others":
        train_labels[i]=3

for i in range(0,len(test_labels)):
    if test_labels[i] == "happy":
        test_labels[i]=0
    elif test_labels[i] == "sad":
        test_labels[i]=1
    elif test_labels[i] == "angry":
        test_labels[i]=2
    elif test_labels[i] == "others":
        test_labels[i]=3


In [0]:
embeddings =  gensim.models.Word2Vec(train_texts, min_count=1, size=300)

tokenizer = Tokenizer(num_words=max_features,lower=True)
tokenizer.fit_on_texts(train_texts)
vocab_size = len(tokenizer.word_index) + 1
train_sequences = sequence.pad_sequences( tokenizer.texts_to_sequences( train_texts ) , maxlen=max_sent_len )
test_sequences = sequence.pad_sequences( tokenizer.texts_to_sequences( test_texts ) , maxlen=max_sent_len )
print(train_sequences)
train_matrix = tokenizer.texts_to_matrix( train_texts )
test_matrix = tokenizer.texts_to_matrix( test_texts )
embedding_weights = np.zeros( ( max_features , embeddings_dim ) )
for word,index in tokenizer.word_index.items():
  if index < max_features:
    try: embedding_weights[index,:] = embeddings[word]
    except: embedding_weights[index,:] = np.random.rand( 1 , embeddings_dim )

le = preprocessing.LabelEncoder( )
le.fit( train_labels + test_labels )

class_weights = class_weight.compute_class_weight('balanced',
                                                  np.unique(train_labels),
                                                  train_labels)
class_weights_dict = dict(zip(le.transform(list(le.classes_)),
                              class_weights))

train_labels = le.transform( train_labels )
test_labels = le.transform( test_labels )
print("Classi considerate: " + repr( le.classes_ ))

train_labels = to_categorical(train_labels)

np.random.seed(0)

"""pre-trained Glove"""
embeddings_index = dict()


f = open("glove.6B.300d.txt", encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()


embedding_matrix = np.zeros((vocab_size, 300))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

[[   0    0    0 ...  373    6    2]
 [   0   35   13 ...   20   10   21]
 [   1   44  103 ...   30   80  243]
 ...
 [  15  171  205 ...   12  139  314]
 [   0    0    0 ...   31   11  193]
 [  38    2 2396 ...   38  321    3]]


  


Classi considerate: array([0, 1, 2, 3])


In [0]:
model = Sequential()
model.add(Embedding(vocab_size, 300, weights=[embedding_matrix], input_length=20,trainable=False))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(64, activation='tanh', dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(BatchNormalization())
model.add(LSTM(64,activation='tanh', dropout=0.2, recurrent_dropout=0.2))
model.add(BatchNormalization())
model.add(Dense(num_classes))
model.add(Activation('softmax'))

adam=keras.optimizers.Adam(lr=0.01)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

model.summary()


print("---CLASS WEIGHTS---")
class_weights_dict[3] = 1.75
print(class_weights_dict)

model.fit(train_sequences, train_labels , epochs=5, batch_size=40, class_weight=class_weights_dict)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_20 (Embedding)     (None, 20, 300)           3960600   
_________________________________________________________________
spatial_dropout1d_20 (Spatia (None, 20, 300)           0         
_________________________________________________________________
lstm_45 (LSTM)               (None, 20, 64)            93440     
_________________________________________________________________
batch_normalization_43 (Batc (None, 20, 64)            256       
_________________________________________________________________
lstm_46 (LSTM)               (None, 64)                33024     
_________________________________________________________________
batch_normalization_44 (Batc (None, 64)                256       
_________________________________________________________________
dense_18 (Dense)             (None, 4)                 260       
__________

<keras.callbacks.History at 0x20fcee82a20>

In [0]:
results = model.predict_classes( test_sequences )
test_labels = list(test_labels)
results = list(results)
# print(test_labels,results)
print ("confusion_matrix : ")
print (confusion_matrix(test_labels, results))
print ("classification_report: ")
print (classification_report(test_labels, results))
count = 0
f = open("results.txt", "w+",encoding="utf8", newline="\n")
for i in range(0,len(results)):
    if test_labels[i] == results[i]:
        count = count+1;
    if test_labels[i] == 0:
        test_labels[i]="happy"
    elif test_labels[i] == 1:
        test_labels[i]= "sad"
    elif test_labels[i] == 2:
        test_labels[i]= "angry"
    elif test_labels[i] == 3:
        test_labels[i]= "others" 
    if results[i] == 0:
        results[i]="happy"
    elif results[i] == 1:
        results[i]= "sad"
    elif results[i] == 2:
        results[i]= "angry"
    elif results[i] == 3:
        results[i]= "others" 
    
    f.write(str(test_texts[i]) + "\t" + str(test_labels[i]) + "\t" + str(results[i]) + "\n")

f.write("number of correct: " + str(count) + " out of " + str(len(test_labels)) + "\n")

f.close()
print("accuracy =",count/len(test_labels),"%")

confusion_matrix : 
[[ 687   10    7  124]
 [   7  963   30  125]
 [   7   57  858  165]
 [ 135   94   88 2674]]
classification_report: 
             precision    recall  f1-score   support

          0       0.82      0.83      0.83       828
          1       0.86      0.86      0.86      1125
          2       0.87      0.79      0.83      1087
          3       0.87      0.89      0.88      2991

avg / total       0.86      0.86      0.86      6031

accuracy = 0.8592273254849941 %


In [0]:
final_data = pd.read_csv("devwithoutlabels.txt",sep='\t')
final_data['single'] = final_data['turn1']+" "+final_data['turn2']+" "+final_data['turn3']
# drop_features(['id','turn1','turn2','turn3'],test_data)
final_text = []
for i in list(final_data['single']):
    final_text.append(re.sub(r'[-()_,.:@#?!&$]', ' ', emoji.demojize(i)))
final_data_sequences = sequence.pad_sequences( tokenizer.texts_to_sequences( final_text ) , maxlen=max_sent_len )
test_results = model.predict_classes( final_data_sequences )
test_results = list(test_results)
for i in range(len(test_results)):
    if test_results[i] == 0:
        test_results[i]= "happy"
    elif test_results[i] == 1:
        test_results[i]= "sad"
    elif test_results[i] == 2:
        test_results[i]= "angry"
    elif test_results[i] == 3:
        test_results[i]= "others" 
final_result = pd.DataFrame({'turn1':final_data['turn1'],'turn2':final_data['turn2'],'turn3':final_data['turn3']})
final_result['label'] = test_results
final_result.index.names=['id']
final_result.to_csv('test.txt',sep='\t')