In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from gensim.parsing.porter import PorterStemmer
from sklearn.model_selection import train_test_split
import time
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, SpatialDropout1D, Conv1D, MaxPooling1D, GRU, CuDNNLSTM
from keras.layers import Bidirectional, LeakyReLU, Activation
from keras import regularizers
import keras
import tensorflow as tf
from sklearn.metrics import f1_score

# Use matplotlib's "seaborn" style sheet for every figure in this notebook.
plt.style.use(style="seaborn")
# Render figures inline in the cell output.
%matplotlib inline
# Download NLTK's 'popular' data bundle (presumably provides the stopword list
# and tokenizer data used further down -- confirm).
nltk.download('popular')

In [None]:
# Load the training data and name its two columns.
# NOTE(review): read_csv is called without header=None, so the first line of the
# file is consumed as a header before the column names are overwritten. Confirm
# the CSV really starts with a header row; otherwise the first record is lost.
train_set = pd.read_csv('raw_data/fulltrain.csv')
train_set.columns = ['label', 'text']

In [None]:
# Shorten every article to at most the first 500 words of its first paragraph
# plus the first 500 words of its last paragraph, joined as "<first>. <last>".
MAX_PARAGRAPH_WORDS = 500
data = []
for index, row in train_set.iterrows():
    text = row['text']
    # paragraphs are separated by a blank line
    paragraphs = text.split('\n\n')
    # Deliberately not named `np`: that would rebind the numpy alias imported
    # at the top of the notebook to an int for every later cell.
    n_paragraphs = len(paragraphs)
    first_paragraph = paragraphs[0]
    last_paragraph = paragraphs[n_paragraphs - 1]

    if len(first_paragraph.split()) < MAX_PARAGRAPH_WORDS:
        first_half = first_paragraph
    else:
        first_half = ' '.join(first_paragraph.split()[:MAX_PARAGRAPH_WORDS])

    # NOTE(review): a single-paragraph article always takes the else branch, so
    # its only paragraph ends up in the text twice. Kept as-is; confirm intent.
    if n_paragraphs > 1 and len(last_paragraph.split()) < MAX_PARAGRAPH_WORDS:
        second_half = last_paragraph
    else:
        second_half = ' '.join(last_paragraph.split()[:MAX_PARAGRAPH_WORDS])

    new_sentence = first_half + '. ' + second_half
    data.append([row.label, new_sentence])
df = pd.DataFrame(data, columns=['label', 'text'])
train_set = df

# Define utility functions

In [None]:
import string

def remove_punctuations(text):
    """Strip ASCII punctuation from ``text``, keeping only '!' and '?'."""
    # string.punctuation[0] is '!' and string.punctuation[20] is '?'
    kept = {string.punctuation[0], string.punctuation[20]}
    dropped = ''.join(ch for ch in string.punctuation if ch not in kept)
    return text.translate(str.maketrans('', '', dropped))

def remove_stopwords(text):
    """Lower-case the whitespace-separated words of ``text`` and drop every
    word found in NLTK's English stopword list; the rest is re-joined with
    single spaces."""
    stop_words = set(stopwords.words('english'))
    kept = []
    for word in text.split():
        lowered = word.lower()
        if lowered not in stop_words:
            kept.append(lowered)
    return " ".join(kept)

def transform_lower(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

def perform_stemming(text):
    """Stem every whitespace-separated word of ``text`` with a Porter stemmer
    and re-join the stems with single spaces."""
    stemmer = PorterStemmer()
    return " ".join(map(stemmer.stem, text.split()))


In [None]:
def pad_sequences_and_truncate(sequences, max_len):
    """Bring every list in ``sequences`` to ``max_len`` items.

    * at least twice ``max_len`` long: keep the head and the tail
      (about ``max_len / 2`` items from each end);
    * longer than ``max_len`` but shorter than twice that: keep the first
      ``max_len`` items;
    * otherwise: right-pad with zeros.

    Returns a new list of lists; the input lists are not modified.
    """
    def fit(seq):
        length = len(seq)
        if length >= 2 * max_len:
            # head + tail of the sequence
            return seq[:max_len // 2] + seq[-max_len // 2:]
        if max_len < length < 2 * max_len:
            return seq[:max_len]
        return seq + [0] * (max_len - length)

    return [fit(seq) for seq in sequences]


# Perform preprocessing

In [None]:
# Lower-case every article, then drop English stop words.
train_set['text'] = train_set['text'].map(transform_lower)
train_set['text'] = train_set['text'].map(remove_stopwords)

# Create corpus


In [None]:
from nltk.tokenize import word_tokenize

def create_corpus_tk(df):
  """Tokenise the ``text`` column of ``df``.

  Args:
    df: DataFrame with a ``text`` column of strings.

  Returns:
    A list with one entry per row of ``df``; each entry is the list of
    lower-cased tokens produced by ``word_tokenize`` for that row's text.
  """
  corpus = []
  # Iterate over the frame that was passed in; the function used to ignore
  # `df` and always read the global `train_set`.
  for text in df['text']:
    corpus.append([word.lower() for word in word_tokenize(text)])
  return corpus

In [None]:
# Tokenise the preprocessed training text into one token list per article.
corpus = create_corpus_tk(train_set)
# NOTE(review): len(corpus) is the number of articles, not a vocabulary size,
# yet this value is later passed as Tokenizer(num_words=...). Confirm intent.
num_words = len(corpus)
print(num_words)

48853


In [None]:
# Number of tokens every sequence is cut or zero-padded to before it is fed to a model.
padding_len = 1000

# Implementing LSTM baseline model

In [None]:
def get_lstm_model(num_words, embedding_matrix, padding_len):
  """Build and compile the Conv1D + bidirectional-LSTM two-class classifier.

  Architecture: frozen pre-trained embedding, then three identical stages of
  SpatialDropout1D -> Conv1D -> LeakyReLU -> MaxPooling1D -> Bidirectional
  CuDNNLSTM (only the last stage collapses the sequence), then a
  Dense(50) -> Dense(16) -> 2-way softmax head.

  Args:
    num_words: number of rows of the embedding layer (vocabulary size).
    embedding_matrix: 2-D numpy array of pre-trained vectors with
      ``num_words`` rows; loaded as non-trainable embedding weights.
    padding_len: length of every input sequence.

  Returns:
    A compiled ``Sequential`` model (adam, categorical cross-entropy,
    accuracy metric).
  """
  filters = 100
  kernel_size = 5
  lstm_units = 32
  n_stages = 3
  # Take the embedding width from the matrix itself so the layer can never
  # disagree with the weights it is given (it used to be hard-coded to 200).
  embed_dim = embedding_matrix.shape[1]

  model = Sequential()
  model.add(Embedding(input_dim=num_words, output_dim=embed_dim, weights=[embedding_matrix], input_length=padding_len, trainable=False))

  for stage in range(n_stages):
    is_last_stage = stage == n_stages - 1
    model.add(SpatialDropout1D(0.5))
    model.add(Conv1D(filters, kernel_size=kernel_size, kernel_regularizer=regularizers.l2(0.00001), padding='same'))
    model.add(LeakyReLU(alpha=0.2))
    model.add(MaxPooling1D(pool_size=2))
    # A plain-LSTM variant of this layer was previously kept as commented-out
    # code: Bidirectional(LSTM(lstm_units, dropout=0.5, recurrent_dropout=0.5, ...)).
    model.add(Bidirectional(CuDNNLSTM(lstm_units, return_sequences=not is_last_stage)))

  # No input_shape on these layers: Sequential infers it from the previous
  # layer, and the old hints (lstm_units,) did not match the bidirectional
  # output width anyway.
  model.add(Dense(50))
  model.add(Activation('relu'))
  model.add(Dense(16))
  model.add(Activation('relu'))
  model.add(Dense(2, activation='softmax'))
  model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
  return model

In [None]:
# Load the balanced test data and name its two columns.
# NOTE(review): as with the training file, read_csv is called without
# header=None, so the first line is treated as a header -- confirm the file has one.
test = pd.read_csv('raw_data/balancedtest.csv')
test.columns = ['label', 'text']

# Model 1

In [None]:
# One-vs-rest target for model 1: original label 1 -> 1, labels 2, 3, 4 -> 0.
# The masks are computed on the untouched train_set, so the rewrites cannot cascade.
train_set_1 = train_set.copy(deep=True)
for source_label, binary_label in ((1, 1), (2, 0), (3, 0), (4, 0)):
    train_set_1.loc[train_set['label'] == source_label, ['label']] = binary_label

In [None]:
import numpy as np
#create word_embedding using glove
embedding_dict = {}
# the `with` block closes the file on exit, so no explicit close() is needed
with open("glove.twitter.27B.200d.txt", "r", encoding="UTF-8") as f:
  for line in f:
    values = line.split()
    # first field is the token, the remaining fields are its vector components
    embedding_dict[values[0]] = np.asarray(values[1:], "float32")

# No num_words cap: the value computed earlier is a document count
# (len(corpus)), not a vocabulary size, and the matrix below has a row for
# every word of the fitted vocabulary anyway.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_set_1['text'])
word_index = tokenizer.word_index
# one extra row so that every index in word_index is a valid row; index 0 is
# the value used for padding
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 200))
for word, i in word_index.items():
  vector = embedding_dict.get(word)
  # words without a GloVe vector keep their all-zero row
  if vector is not None:
    embedding_matrix[i] = vector

In [None]:
# Texts and one-hot encoded binary labels for model 1.
X = train_set_1['text'].values
Y = pd.get_dummies(train_set_1['label']).values
# NOTE: train_test_split returns (X_train, X_test, y_train, y_test). Despite
# their names, the variables below therefore hold:
#   X_train        -> training texts      Y_train        -> held-out texts
#   X_train_labels -> training labels     Y_train_labels -> held-out labels
X_train, Y_train, X_train_labels, Y_train_labels = train_test_split(X, Y, test_size = 0.1, random_state = 42)

In [None]:
# Turn the training texts (X_train) and the held-out texts (stored in Y_train
# by the split cell) into sequences of word indices.
X_train = tokenizer.texts_to_sequences(X_train)
Y_train = tokenizer.texts_to_sequences(Y_train)
# Cut / zero-pad every sequence to padding_len with the notebook's own helper.
X_train_padded_raw = pad_sequences_and_truncate(X_train, padding_len)
Y_train_padded_raw = pad_sequences_and_truncate(Y_train, padding_len)

# transform to numpy ndarray otherwise memory error
X_train_padded = pad_sequences(X_train_padded_raw, padding='post', maxlen=padding_len, truncating='post')
Y_train_padded = pad_sequences(Y_train_padded_raw, padding='post', maxlen=padding_len, truncating='post')

In [None]:
# Build model 1 on top of the (non-trainable) embedding matrix and print its layer summary.
model_1 = get_lstm_model(num_words, embedding_matrix, padding_len)
model_1.summary()

In [None]:
# Train model 1 on the training texts and their one-hot labels
# (X_train_labels holds the training labels, see the split cell).
# validation_split=0.1 sets aside a further 10% of these samples for validation.
history = model_1.fit(
    X_train_padded,
    X_train_labels,
    epochs=20,
    validation_split=0.1,
    verbose=1,
    batch_size=128,
    shuffle=True
)

In [None]:
# Score model 1 on the 10% hold-out produced by train_test_split. Despite the
# Y_ prefix these are held-out texts and labels, not the balanced test file,
# so the report is labelled accordingly instead of "Test set".
accr = model_1.evaluate(Y_train_padded, Y_train_labels)
print('Hold-out set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

In [None]:
# Same one-vs-rest relabelling as the model 1 training data, applied to the test set.
test_1 = test.copy(deep=True)
for source_label, binary_label in ((1, 1), (2, 0), (3, 0), (4, 0)):
    test_1.loc[test['label'] == source_label, ['label']] = binary_label

In [None]:
# Encode the test texts with the tokenizer fitted for model 1, pad them like
# the training data and evaluate.
# NOTE(review): the test text is used raw here, whereas the training text went
# through the first/last-paragraph cut, lower-casing and stop-word removal.
# Confirm this train/test mismatch is intended.
X_test = tokenizer.texts_to_sequences(test_1['text'].values)
X_test = pad_sequences_and_truncate(X_test, padding_len)
X_test = pad_sequences(X_test, maxlen=padding_len, truncating='post', padding='post')
Y_test = pd.get_dummies(test_1['label']).values
accr = model_1.evaluate(X_test, Y_test)
print('Test set \n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

In [None]:
# For every test sample keep (predicted class, probability of that class);
# the combining cell reads these pairs as prediction_array_1[0][i].
y_probs_1 = model_1.predict(X_test)
y_classes_1 = np.argmax(y_probs_1, axis=1)
y_probs_1_max = y_probs_1.max(axis=1)
prediction_array_1 = np.dstack((y_classes_1, y_probs_1_max))

In [None]:
# Per-class F1 of model 1 on the test set (one score per binary class).
predict_x = model_1.predict(X_test)
actual_x = Y_test.argmax(axis=1)
classes_x = predict_x.argmax(axis=1)

f1_score(actual_x, classes_x, average=None)

# Model 2

In [None]:
# One-vs-rest target for model 2: original label 2 -> 1, labels 1, 3, 4 -> 0.
# The masks are computed on the untouched train_set, so the rewrites cannot cascade.
train_set_2 = train_set.copy(deep=True)
for source_label, binary_label in ((1, 0), (2, 1), (3, 0), (4, 0)):
    train_set_2.loc[train_set['label'] == source_label, ['label']] = binary_label

In [None]:
#create word_embedding using glove
embedding_dict = {}
# the `with` block closes the file on exit, so no explicit close() is needed
with open("glove.twitter.27B.200d.txt", "r", encoding="UTF-8") as f:
  for line in f:
    values = line.split()
    # first field is the token, the remaining fields are its vector components
    embedding_dict[values[0]] = np.asarray(values[1:], "float32")

# No num_words cap: whatever `num_words` holds from earlier cells (a document
# count or a previous vocabulary size) is not a meaningful limit for this
# tokenizer, and the matrix below has a row for every fitted word anyway.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_set_2['text'])
word_index = tokenizer.word_index
# one extra row so that every index in word_index is a valid row; index 0 is
# the value used for padding
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 200))
for word, i in word_index.items():
  vector = embedding_dict.get(word)
  # words without a GloVe vector keep their all-zero row
  if vector is not None:
    embedding_matrix[i] = vector

In [None]:
# Texts and one-hot encoded binary labels for model 2.
X = train_set_2['text'].values
Y = pd.get_dummies(train_set_2['label']).values
# NOTE: train_test_split returns (X_train, X_test, y_train, y_test). Despite
# their names, the variables below therefore hold:
#   X_train        -> training texts      Y_train        -> held-out texts
#   X_train_labels -> training labels     Y_train_labels -> held-out labels
X_train, Y_train, X_train_labels, Y_train_labels = train_test_split(X, Y, test_size = 0.1, random_state = 42)

In [None]:
# Turn the training texts (X_train) and the held-out texts (stored in Y_train
# by the split cell) into sequences of word indices.
X_train = tokenizer.texts_to_sequences(X_train)
Y_train = tokenizer.texts_to_sequences(Y_train)
# Cut / zero-pad every sequence to padding_len with the notebook's own helper.
X_train_padded_raw = pad_sequences_and_truncate(X_train, padding_len)
Y_train_padded_raw = pad_sequences_and_truncate(Y_train, padding_len)

# transform to numpy ndarray otherwise memory error
X_train_padded = pad_sequences(X_train_padded_raw, padding='post', maxlen=padding_len, truncating='post')
Y_train_padded = pad_sequences(Y_train_padded_raw, padding='post', maxlen=padding_len, truncating='post')

In [None]:
# Build model 2 on top of the (non-trainable) embedding matrix and print its layer summary.
model_2 = get_lstm_model(num_words, embedding_matrix, padding_len)
model_2.summary()

In [None]:
# Train model 2 on the training texts and their one-hot labels
# (X_train_labels holds the training labels, see the split cell).
# validation_split=0.1 sets aside a further 10% of these samples for validation.
# Note that `history` is reused for every model, so earlier histories are overwritten.
history = model_2.fit(
    X_train_padded,
    X_train_labels,
    epochs=20,
    validation_split=0.1,
    verbose=1,
    batch_size=128,
    shuffle=True
)

In [None]:
# Score model 2 on the 10% hold-out produced by train_test_split. Despite the
# Y_ prefix these are held-out texts and labels, not the balanced test file,
# so the report is labelled accordingly instead of "Test set".
accr = model_2.evaluate(Y_train_padded, Y_train_labels)
print('Hold-out set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

In [None]:
# Same one-vs-rest relabelling as the model 2 training data, applied to the test set.
test_2 = test.copy(deep=True)
for source_label, binary_label in ((1, 0), (2, 1), (3, 0), (4, 0)):
    test_2.loc[test['label'] == source_label, ['label']] = binary_label

In [None]:
# Encode the test texts with the tokenizer fitted for model 2, pad them like
# the training data and evaluate.
# NOTE(review): the test text is used raw here, whereas the training text went
# through the first/last-paragraph cut, lower-casing and stop-word removal.
# Confirm this train/test mismatch is intended.
X_test = tokenizer.texts_to_sequences(test_2['text'].values)
X_test = pad_sequences_and_truncate(X_test, padding_len)
X_test = pad_sequences(X_test, maxlen=padding_len, truncating='post', padding='post')
Y_test = pd.get_dummies(test_2['label']).values
accr = model_2.evaluate(X_test, Y_test)
print('Test set \n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

In [None]:
# For every test sample keep (predicted class, probability of that class);
# the combining cell reads these pairs as prediction_array_2[0][i].
y_probs_2 = model_2.predict(X_test)
y_classes_2 = np.argmax(y_probs_2, axis=1)
y_probs_2_max = y_probs_2.max(axis=1)
prediction_array_2 = np.dstack((y_classes_2, y_probs_2_max))

In [None]:
# Per-class F1 of model 2 on the test set (one score per binary class).
predict_x = model_2.predict(X_test)
actual_x = Y_test.argmax(axis=1)
classes_x = predict_x.argmax(axis=1)

f1_score(actual_x, classes_x, average=None)

# Model 3

In [None]:
# One-vs-rest target for model 3: original label 3 -> 1, labels 1, 2, 4 -> 0.
# The masks are computed on the untouched train_set, so the rewrites cannot cascade.
train_set_3 = train_set.copy(deep=True)
for source_label, binary_label in ((1, 0), (2, 0), (3, 1), (4, 0)):
    train_set_3.loc[train_set['label'] == source_label, ['label']] = binary_label
# show how many rows ended up in each binary class
train_set_3.groupby('label').count()

In [None]:
#create word_embedding using glove
embedding_dict = {}
# the `with` block closes the file on exit, so no explicit close() is needed
with open("glove.twitter.27B.200d.txt", "r", encoding="UTF-8") as f:
  for line in f:
    values = line.split()
    # first field is the token, the remaining fields are its vector components
    embedding_dict[values[0]] = np.asarray(values[1:], "float32")

# No num_words cap: whatever `num_words` holds from earlier cells (a document
# count or a previous vocabulary size) is not a meaningful limit for this
# tokenizer, and the matrix below has a row for every fitted word anyway.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_set_3['text'])
word_index = tokenizer.word_index
# one extra row so that every index in word_index is a valid row; index 0 is
# the value used for padding
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 200))
for word, i in word_index.items():
  vector = embedding_dict.get(word)
  # words without a GloVe vector keep their all-zero row
  if vector is not None:
    embedding_matrix[i] = vector

In [None]:
# Texts and one-hot encoded binary labels for model 3.
X = train_set_3['text'].values
Y = pd.get_dummies(train_set_3['label']).values
# NOTE: train_test_split returns (X_train, X_test, y_train, y_test). Despite
# their names, the variables below therefore hold:
#   X_train        -> training texts      Y_train        -> held-out texts
#   X_train_labels -> training labels     Y_train_labels -> held-out labels
X_train, Y_train, X_train_labels, Y_train_labels = train_test_split(X, Y, test_size = 0.1, random_state = 42)

In [None]:
# Turn the training texts (X_train) and the held-out texts (stored in Y_train
# by the split cell) into sequences of word indices.
X_train = tokenizer.texts_to_sequences(X_train)
Y_train = tokenizer.texts_to_sequences(Y_train)
# Cut / zero-pad every sequence to padding_len with the notebook's own helper.
X_train_padded_raw = pad_sequences_and_truncate(X_train, padding_len)
Y_train_padded_raw = pad_sequences_and_truncate(Y_train, padding_len)

# transform to numpy ndarray otherwise memory error
X_train_padded = pad_sequences(X_train_padded_raw, padding='post', maxlen=padding_len, truncating='post')
Y_train_padded = pad_sequences(Y_train_padded_raw, padding='post', maxlen=padding_len, truncating='post')

In [None]:
# Build model 3 on top of the (non-trainable) embedding matrix and print its layer summary.
model_3 = get_lstm_model(num_words, embedding_matrix, padding_len)
model_3.summary()

In [None]:
# Train model 3 on the training texts and their one-hot labels
# (X_train_labels holds the training labels, see the split cell).
# validation_split=0.1 sets aside a further 10% of these samples for validation.
# Note that `history` is reused for every model, so earlier histories are overwritten.
history = model_3.fit(
    X_train_padded,
    X_train_labels,
    epochs=20,
    validation_split=0.1,
    verbose=1,
    batch_size=128,
    shuffle=True
)

In [None]:
# Score model 3 on the 10% hold-out produced by train_test_split. Despite the
# Y_ prefix these are held-out texts and labels, not the balanced test file,
# so the report is labelled accordingly instead of "Test set".
accr = model_3.evaluate(Y_train_padded, Y_train_labels)
print('Hold-out set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

In [None]:
# Same one-vs-rest relabelling as the model 3 training data, applied to the test set.
test_3 = test.copy(deep=True)
for source_label, binary_label in ((1, 0), (2, 0), (3, 1), (4, 0)):
    test_3.loc[test['label'] == source_label, ['label']] = binary_label

In [None]:
# Encode the test texts with the tokenizer fitted for model 3, pad them like
# the training data and evaluate.
# NOTE(review): the test text is used raw here, whereas the training text went
# through the first/last-paragraph cut, lower-casing and stop-word removal.
# Confirm this train/test mismatch is intended.
X_test = tokenizer.texts_to_sequences(test_3['text'].values)
X_test = pad_sequences_and_truncate(X_test, padding_len)
X_test = pad_sequences(X_test, maxlen=padding_len, truncating='post', padding='post')
Y_test = pd.get_dummies(test_3['label']).values
accr = model_3.evaluate(X_test, Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

In [None]:
# For every test sample keep (predicted class, probability of that class);
# the combining cell reads these pairs as prediction_array_3[0][i].
y_probs_3 = model_3.predict(X_test)
y_classes_3 = np.argmax(y_probs_3, axis=1)
y_probs_3_max = y_probs_3.max(axis=1)
prediction_array_3 = np.dstack((y_classes_3, y_probs_3_max))

In [None]:
# Per-class F1 of model 3 on the test set (one score per binary class).
predict_x = model_3.predict(X_test)
actual_x = Y_test.argmax(axis=1)
classes_x = predict_x.argmax(axis=1)

f1_score(actual_x, classes_x, average=None)

# Model 4

In [None]:
# One-vs-rest target for model 4: original label 4 -> 1, labels 1, 2, 3 -> 0.
# The masks are computed on the untouched train_set, so the rewrites cannot cascade.
train_set_4 = train_set.copy(deep=True)
for source_label, binary_label in ((1, 0), (2, 0), (3, 0), (4, 1)):
    train_set_4.loc[train_set['label'] == source_label, ['label']] = binary_label
# show how many rows ended up in each binary class
train_set_4.groupby('label').count()

In [None]:
#create word_embedding using glove
embedding_dict = {}
# the `with` block closes the file on exit, so no explicit close() is needed
with open("glove.twitter.27B.200d.txt", "r", encoding="UTF-8") as f:
  for line in f:
    values = line.split()
    # first field is the token, the remaining fields are its vector components
    embedding_dict[values[0]] = np.asarray(values[1:], "float32")

# No num_words cap: whatever `num_words` holds from earlier cells (a document
# count or a previous vocabulary size) is not a meaningful limit for this
# tokenizer, and the matrix below has a row for every fitted word anyway.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_set_4['text'])
word_index = tokenizer.word_index
# one extra row so that every index in word_index is a valid row; index 0 is
# the value used for padding
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, 200))
for word, i in word_index.items():
  vector = embedding_dict.get(word)
  # words without a GloVe vector keep their all-zero row
  if vector is not None:
    embedding_matrix[i] = vector

In [None]:
# Texts and one-hot encoded binary labels for model 4.
X = train_set_4['text'].values
Y = pd.get_dummies(train_set_4['label']).values
# NOTE: train_test_split returns (X_train, X_test, y_train, y_test). Despite
# their names, the variables below therefore hold:
#   X_train        -> training texts      Y_train        -> held-out texts
#   X_train_labels -> training labels     Y_train_labels -> held-out labels
X_train, Y_train, X_train_labels, Y_train_labels = train_test_split(X, Y, test_size = 0.1, random_state = 42)

In [None]:
# Turn the training texts (X_train) and the held-out texts (stored in Y_train
# by the split cell) into sequences of word indices.
X_train = tokenizer.texts_to_sequences(X_train)
Y_train = tokenizer.texts_to_sequences(Y_train)
# Cut / zero-pad every sequence to padding_len with the notebook's own helper.
X_train_padded_raw = pad_sequences_and_truncate(X_train, padding_len)
Y_train_padded_raw = pad_sequences_and_truncate(Y_train, padding_len)

# transform to numpy ndarray otherwise memory error
X_train_padded = pad_sequences(X_train_padded_raw, padding='post', maxlen=padding_len, truncating='post')
Y_train_padded = pad_sequences(Y_train_padded_raw, padding='post', maxlen=padding_len, truncating='post')

In [None]:
# Build model 4 on top of the (non-trainable) embedding matrix built in the
# previous cell and print its layer summary.
model_4 = get_lstm_model(num_words, embedding_matrix, padding_len)
model_4.summary()

In [None]:
# Train model 4 on the training texts and their one-hot labels
# (X_train_labels holds the training labels, see the split cell).
# validation_split=0.1 sets aside a further 10% of these samples for validation.
# Note that `history` is reused for every model, so earlier histories are overwritten.
history = model_4.fit(
    X_train_padded,
    X_train_labels,
    epochs=20,
    validation_split=0.1,
    verbose=1,
    batch_size=128,
    shuffle=True
)

In [None]:
# Score model 4 on the 10% hold-out produced by train_test_split. Despite the
# Y_ prefix these are held-out texts and labels, not the balanced test file,
# so the report is labelled accordingly instead of "Test set".
accr = model_4.evaluate(Y_train_padded, Y_train_labels)
print('Hold-out set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

In [None]:
# Same one-vs-rest relabelling as the model 4 training data, applied to the test set.
test_4 = test.copy(deep=True)
for source_label, binary_label in ((1, 0), (2, 0), (3, 0), (4, 1)):
    test_4.loc[test['label'] == source_label, ['label']] = binary_label
# show how many rows ended up in each binary class
test_4.groupby('label').count()

In [None]:
# Encode the test texts with the tokenizer fitted for model 4, pad them like
# the training data and evaluate.
# NOTE(review): the test text is used raw here, whereas the training text went
# through the first/last-paragraph cut, lower-casing and stop-word removal.
# Confirm this train/test mismatch is intended.
X_test = tokenizer.texts_to_sequences(test_4['text'].values)
X_test = pad_sequences_and_truncate(X_test, padding_len)
X_test = pad_sequences(X_test, maxlen=padding_len, truncating='post', padding='post')
Y_test = pd.get_dummies(test_4['label']).values
accr = model_4.evaluate(X_test, Y_test)
print('Test set \n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0], accr[1]))

In [None]:
# For every test sample keep (predicted class, probability of that class);
# the combining cell reads these pairs as prediction_array_4[0][i].
y_probs_4 = model_4.predict(X_test)
y_classes_4 = np.argmax(y_probs_4, axis=1)
y_probs_4_max = y_probs_4.max(axis=1)
prediction_array_4 = np.dstack((y_classes_4, y_probs_4_max))

In [None]:
# Per-class F1 of model 4 on the test set (one score per binary class).
predict_x = model_4.predict(X_test)
actual_x = Y_test.argmax(axis=1)
classes_x = predict_x.argmax(axis=1)

f1_score(actual_x, classes_x, average=None)

# Combine Models

In [None]:
# Number of test samples: np.dstack produced an array whose second axis runs over the samples.
pred_len = prediction_array_1.shape[1]

In [None]:
# Merge the four one-vs-rest models into one 4-class prediction (classes 0-3,
# in model order). Each prediction_array_k[0][i] is a
# (predicted_label, confidence) pair, where confidence is the probability the
# model assigned to the label it predicted.
predict_list = []
for i in range(pred_len):
  votes = [
      prediction_array_1[0][i],
      prediction_array_2[0][i],
      prediction_array_3[0][i],
      prediction_array_4[0][i],
  ]
  # indices of the models that predict their own (positive) class
  claimed = [k for k, vote in enumerate(votes) if vote[0] == 1.0]

  if claimed:
    # Several models may claim the sample: the most confident one wins.
    # The previous if/elif chain stopped at the first claiming model and
    # never compared the scores.
    max_class = max(claimed, key=lambda k: votes[k][1])
  else:
    # Every model says "rest": pick the one least sure of that. Selecting by
    # index (instead of a dict keyed on the score) keeps models with equal
    # scores from overwriting each other; ties go to the first model.
    max_class = min(range(len(votes)), key=lambda k: votes[k][1])

  predict_list.append(max_class)

In [None]:
# Re-encode the full test set and one-hot encode its original four labels.
X_test = pad_sequences(
    pad_sequences_and_truncate(tokenizer.texts_to_sequences(test['text'].values), padding_len),
    maxlen=padding_len,
    truncating='post',
    padding='post',
)
Y_test = pd.get_dummies(test['label']).values

In [None]:
# Per-class F1 of the combined prediction against the original four labels.
actual_x = Y_test.argmax(axis=1)

f1_score(actual_x, predict_list, average=None)