In [2]:
# Importing Libraries
import pandas as pd
import numpy as np
from keras.layers import Dense, GRU, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.models import Model
import string
import re
import keras
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences


In [3]:
#Mount drive to access files in gdrive
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [4]:
# Read Train, Test and Gold dataset
df_train = pd.read_csv("/content/gdrive/MyDrive/NLP_Project/train.csv")
df_test = pd.read_csv("/content/gdrive/MyDrive/NLP_Project/dev.csv")
df_gold = pd.read_csv("/content/gdrive/MyDrive/NLP_Project/gold-test.csv")

In [5]:
# Preprocessing data lowercase and removing hyperlinks
def preProcessData(data_frame):
    data_frame['text'] = (data_frame['text']).str.lower()
    data_frame['text'] = data_frame['text'].apply(lambda x: re.sub(r"^https?:\/\/.*[\r\n]*", "", x, flags=re.MULTILINE))
    return data_frame

In [6]:
# Creating train, test and gold labels
X_train = preProcessData(df_train)['text']
X_test = preProcessData(df_test)['text']
X_gold = preProcessData(df_gold)['text']
Y_train = df_train['is_humor']
Y_test = df_test['is_humor']
Y_gold = df_gold['is_humor']

In [7]:
# Tokenizing the sentences and adding padding
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(X_train)

sequences_train = tokenizer.texts_to_sequences(X_train)
seq_matrix_train = pad_sequences(sequences_train, maxlen=200)

sequences_test = tokenizer.texts_to_sequences(X_test)
seq_matrix_test = pad_sequences(sequences_test, maxlen= 200)

sequences_gold = tokenizer.texts_to_sequences(X_gold)
seq_matrix_gold = pad_sequences(sequences_gold, maxlen= 200)

In [8]:
# Reading GloVe Embeddings
embeddings_index = dict()
f = open('/content/gdrive/MyDrive/NLP_Project/glove.6B.300d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))



Loaded 400000 word vectors.


In [9]:
# Creating weight matrix 
embedding_matrix = np.zeros((vocabulary_size, 300))
for word, index in tokenizer.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[index] = embedding_vector

In [10]:
# Metrics to evaluate model

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))



In [17]:
%%time
# Initializing Model 
model = Sequential()
model.add(Embedding(vocabulary_size, 300, input_length=50, weights=[embedding_matrix], trainable=False))
model.add(GRU(100))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_accuracy',f1_m, precision_m, recall_m])

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 300)           6000000   
_________________________________________________________________
gru_2 (GRU)                  (None, 100)               120600    
_________________________________________________________________
dense_3 (Dense)              (None, 128)               12928     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 129       
Total params: 6,133,657
Trainable params: 133,657
Non-trainable params: 6,000,000
_________________________________________________________________
CPU times: user 299 ms, sys: 32.3 ms, total: 331 ms
Wall time: 281 ms


In [15]:
%%time
# Training the model
model.fit(seq_matrix_train, Y_train, epochs = 10, batch_size= 128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 15.5 s, sys: 536 ms, total: 16.1 s
Wall time: 16.4 s


<keras.callbacks.History at 0x7f4af7f3efd0>

In [16]:
# Evaluating model for gold and test data
gold_accuracy = model.evaluate(seq_matrix_gold, Y_gold)
test_accuracy = model.evaluate(seq_matrix_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n F1 Score:{:0.3f}'.format(test_accuracy[0],test_accuracy[1], test_accuracy[2]))
print('Gold set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}\n F1 Score:{:0.3f}'.format(gold_accuracy[0],gold_accuracy[1], gold_accuracy[2]))

Test set
  Loss: 0.543
  Accuracy: 0.820
 F1 Score:0.862
Gold set
  Loss: 0.484
  Accuracy: 0.846
 F1 Score:0.876
