In [None]:
# Colab-only step: needs the `google.colab` package and mounts Google Drive
# at /content/drive so later cells can read/write files there.
from google.colab import drive
drive.mount('/content/drive')

In [1]:
!mkdir data
!mkdir data/glove
!mkdir data/glove_twitter
!wget http://nlp.stanford.edu/data/glove.twitter.27B.zip
!wget http://nlp.stanford.edu/data/glove.6B.zip

!unzip glove.twitter.27B.zip -d data/glove_twitter/
!unzip glove.6B.zip -d data/glove

--2022-08-06 18:12:33--  http://nlp.stanford.edu/data/glove.twitter.27B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.twitter.27B.zip [following]
--2022-08-06 18:12:33--  https://nlp.stanford.edu/data/glove.twitter.27B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip [following]
--2022-08-06 18:12:33--  https://downloads.cs.stanford.edu/nlp/data/glove.twitter.27B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1520408563 (1.4G) [ap

In [2]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
!unzip wiki-news-300d-1M.vec.zip -d data/wiki-news-vec/

--2022-08-06 18:20:59--  https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 681808098 (650M) [application/zip]
Saving to: ‘wiki-news-300d-1M.vec.zip’


2022-08-06 18:21:13 (50.3 MB/s) - ‘wiki-news-300d-1M.vec.zip’ saved [681808098/681808098]

Archive:  wiki-news-300d-1M.vec.zip
  inflating: data/wiki-news-vec/wiki-news-300d-1M.vec  


In [34]:
import numpy as np, pandas as pd, csv, tensorflow as tf, gensim as gensim
from collections import Counter
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from keras.models import Sequential, Model
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, BatchNormalization, Activation, Bidirectional, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils.vis_utils import plot_model
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical
from nltk.corpus import stopwords

In [35]:
def read_data(path1, path2):
  """Load the train and test CSV files and report their shapes.

  Args:
    path1: path of the training CSV.
    path2: path of the test CSV.

  Returns:
    A ``(train, test)`` tuple of pandas DataFrames.
  """
  frames = [pd.read_csv(csv_path) for csv_path in (path1, path2)]
  for label, frame in zip(("Train Shape", "Test Shape"), frames):
    print(f"{label}{frame.shape}")
  return frames[0], frames[1]

In [36]:
# Split every headline and article body into a list of word tokens.
def sequentialize_data(train, test):
  """Tokenize the 'Headline' and 'articleBody' columns of both data sets.

  Each text is passed through ``text_to_word_sequence``; the number of
  resulting sequences per column is printed.

  Args:
    train: training table providing 'Headline' and 'articleBody'.
    test: test table providing the same two columns.

  Returns:
    ``(train_head_seq, train_body_seq, test_head_seq, test_body_seq)``,
    each a list with one token list per row.
  """
  def _to_sequences(texts):
    return list(map(text_to_word_sequence, texts))

  train_head_seq = _to_sequences(train['Headline'])
  train_body_seq = _to_sequences(train['articleBody'])
  test_head_seq = _to_sequences(test['Headline'])
  test_body_seq = _to_sequences(test['articleBody'])

  report = (
    ("Train head sequence ", train_head_seq),
    ("Train body sequence ", train_body_seq),
    ("Test head sequence ", test_head_seq),
    ("Test body sequence ", test_body_seq),
  )
  for label, sequences in report:
    print(label + str(len(sequences)))
  return (train_head_seq, train_body_seq, test_head_seq, test_body_seq)

In [37]:
def concatenate_train_data(train_head_seq, train_body_seq):
  """Build the tokenizer corpus: all headline sequences followed by all body sequences.

  Prints the corpus size after each stage together with a preview slice
  (items 0-249 after the headlines, items 250-499 after the bodies).

  Args:
    train_head_seq: list of token lists, one per training headline.
    train_body_seq: list of token lists, one per training article body.

  Returns:
    A new list containing the headline sequences followed by the body
    sequences. The inner token lists are shared with the inputs, not copied.
  """
  words = list(train_head_seq)
  print("Length of words after adding Headlines" + str(len(words)))
  print(words[:250])
  words.extend(train_body_seq)
  # This message used to say "Headlines" a second time, which mislabelled the
  # size reported after the bodies had been appended.
  print("Length of words after adding Bodies" + str(len(words)))
  print(words[250:500])
  return words

In [38]:
# Fit the tokenizer that later turns token sequences into integer ids for the models.
def vectorize_data(list, num_words=30000):
  """Fit a Keras ``Tokenizer`` on the supplied corpus.

  The previous body ignored its argument and read the notebook-level
  variable ``words`` instead, so it only worked when that global happened
  to exist. The argument is now what gets fitted.

  Args:
    list: corpus to fit on, an iterable of texts / token lists. The name
      shadows the builtin; it is kept so existing callers keep working.
    num_words: value passed to ``Tokenizer(num_words=...)``; defaults to the
      previously hard-coded 30000.

  Returns:
    The fitted ``Tokenizer``.
  """
  symbols = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
  tokenizer = Tokenizer(num_words=num_words, filters=symbols)
  tokenizer.fit_on_texts([sequence for sequence in list])
  return tokenizer

In [39]:
def extend_data(train_head_seq, test_head_seq, train_body_seq, train, test, tokenizer,
                test_body_seq=None, max_len=128):
  """Turn headline/body token sequences into padded id matrices and one-hot labels.

  For every row the headline tokens are followed by the body tokens, the first
  ``max_len`` tokens are kept, converted to ids with ``tokenizer`` and
  post-padded / post-truncated to ``max_len``. Labels come from the 'Stance'
  column, are integer-encoded with a ``LabelEncoder`` fitted on the training
  labels and then one-hot encoded. 10% of the training rows are split off as a
  validation set (``random_state=42``).

  Args:
    train_head_seq: token lists of the training headlines.
    test_head_seq: token lists of the test headlines.
    train_body_seq: token lists of the training bodies (same order as the headlines).
    train: training table providing the 'Stance' column.
    test: test table providing 'Stance' (and 'articleBody' when
      ``test_body_seq`` is not given).
    tokenizer: fitted tokenizer exposing ``texts_to_sequences``.
    test_body_seq: token lists of the test bodies. Optional for backward
      compatibility; when omitted they are derived from ``test['articleBody']``
      with ``text_to_word_sequence``. The function used to read a notebook
      global of this name, which failed whenever that global was missing.
    max_len: number of tokens kept per sample and padded sequence length.

  Returns:
    ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

  Raises:
    ValueError: from ``LabelEncoder.transform`` if the test labels contain a
      value that never occurs in the training labels.
  """
  if test_body_seq is None:
    test_body_seq = [text_to_word_sequence(body) for body in test['articleBody']]

  # Headline tokens followed by body tokens, row by row (copies, inputs untouched).
  train_seq = [list(head) + list(train_body_seq[row]) for row, head in enumerate(train_head_seq)]
  test_seq = [list(head) + list(test_body_seq[row]) for row, head in enumerate(test_head_seq)]

  X_train = tokenizer.texts_to_sequences([' '.join(seq[:max_len]) for seq in train_seq])
  X_train = pad_sequences(X_train, maxlen = max_len, padding = 'post', truncating = 'post')
  y_train = train['Stance']

  X_test = tokenizer.texts_to_sequences([' '.join(seq[:max_len]) for seq in test_seq])
  X_test = pad_sequences(X_test, maxlen = max_len, padding = 'post', truncating = 'post')
  y_test = test['Stance']

  encoder = LabelEncoder()
  encoder.fit(y_train)
  num_classes = len(encoder.classes_)
  y_train = to_categorical(encoder.transform(y_train), num_classes = num_classes)
  print('Train Data After Encoding, \n')
  print("X_train Shape " + str(X_train.shape))
  print("X_test Shape " + str(X_test.shape))
  print("y_train Shape " + str(y_train.shape))
  print("y_test Shape " + str(y_test.shape)+'\n')

  X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state = 42, test_size = 0.1)

  # Reuse the encoder fitted on the training labels: a second encoder fitted on
  # the test labels could assign different indices to the same class whenever
  # the two label sets differ. num_classes keeps the one-hot width equal to
  # the training width.
  y_test = to_categorical(encoder.transform(y_test), num_classes = num_classes)
  print('Train Data After Encoding and splitting, \n')
  print("X_train Shape " + str(X_train.shape))
  print("X_test Shape " + str(X_test.shape))
  print("y_train Shape " + str(y_train.shape))
  print("y_test Shape " + str(y_test.shape))
  return X_train, y_train, X_val, y_val, X_test, y_test

In [40]:
# Dataset selector: 0 -> original training set, -1 -> undersampled,
# 1 -> oversampled; any other value falls back to the original set.
i = 0
path1 = {
  0: './train.csv',
  -1: './train_undersample.csv',
  1: './train_oversample.csv',
}.get(i, './train.csv')
train, test = read_data(path1, './test.csv')

Train Shape(49972, 4)
Test Shape(25413, 4)


In [41]:
# Tokenize headlines and bodies of both data sets into per-row word lists.
train_head_seq, train_body_seq, test_head_seq, test_body_seq = sequentialize_data(train, test)

Train head sequence 49972
Train body sequence 49972
Test head sequence 25413
Test body sequence 25413


In [None]:
# Corpus for fitting the tokenizer: training headlines followed by training bodies.
# The empty-list assignment is immediately overwritten by the call below.
words = []
words = concatenate_train_data(train_head_seq, train_body_seq)

In [43]:
# Fit the tokenizer on the training corpus, then build the padded id matrices
# and one-hot labels for the train / validation / test splits.
tokenizer = vectorize_data(words)
X_train, y_train, X_val, y_val, X_test, y_test = extend_data(train_head_seq, test_head_seq, train_body_seq, train, test, tokenizer)

Train Data After Encoding, 

X_train Shape (49972, 128)
X_test Shape (25413, 128)
y_train Shape (49972, 4)
y_test Shape (25413,)

Train Data After Encoding and splitting, 

X_train Shape (44974, 128)
X_test Shape (25413, 128)
y_train Shape (44974, 4)
y_test Shape (25413, 4)


In [44]:
# Sanity check: features and labels must have the same number of rows.
print(len(X_train), len(y_train), sep='\n')

44974
44974


In [45]:
# Convert the 50-dimensional GloVe Twitter vectors to word2vec text format
# and load them as KeyedVectors.
glove_input_file = './data/glove_twitter/glove.twitter.27B.50d.txt'
word2vec_output_file = 'glove.50d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)
embeddings = KeyedVectors.load_word2vec_format(
    word2vec_output_file,
    binary=False,
)

In [46]:
# Embedding matrix with one row per tokenizer index (row 0 is never assigned
# by the loop below). Rows start as small uniform noise and are overwritten
# with the pretrained GloVe vector whenever the token exists in `embeddings`.
weight_matrix = np.random.uniform(-0.05, 0.05, size=(len(tokenizer.word_index)+1, 50))
# The loop variable used to be `i`, which overwrote the notebook-level dataset
# selector `i`; the lookup used try/except KeyError as control flow and is now
# a plain membership test.
for token, token_index in tokenizer.word_index.items():
    if token in embeddings:
        weight_matrix[token_index] = embeddings[token]
# Release the loaded vectors; only weight_matrix is needed from here on.
del embeddings

In [47]:
# Baseline: pretrained (trainable) embeddings -> single LSTM -> dropout -> 4-way softmax.
model = Sequential([
    Embedding(
        input_dim=len(tokenizer.word_index)+1,
        output_dim=50,
        weights=[weight_matrix],
        trainable=True,
        mask_zero=True,
    ),
    LSTM(120, return_sequences=False),
    Dropout(rate=0.3),
    Dense(4, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [48]:
# Train the baseline LSTM, validating on the held-out split after every epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=25,
    validation_data=(X_val, y_val),
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [49]:
# Class index -> stance name, exactly as the previous if-chains mapped them.
# NOTE(review): this must mirror how the `Stance` column is encoded for
# training (LabelEncoder numbers classes in sorted order of the raw values) —
# confirm against train.csv.
STANCE_BY_INDEX = {0: "unrelated", 1: "agree", 2: "disagree", 3: "discuss"}

pred = model.predict(X_test)
# argmax over the class axis for the whole batch at once; indices outside the
# table are left unchanged, as before.
predicted_labels = [STANCE_BY_INDEX.get(k, k) for k in np.argmax(pred, axis=-1).tolist()]
True_labels = [STANCE_BY_INDEX.get(k, k) for k in np.argmax(y_test, axis=-1).tolist()]

# NOTE(review): assumes the rows of competition_test_stances.csv are in the
# same order as the rows X_test was built from — TODO confirm.
test_df = pd.read_csv('competition_test_stances.csv')
pred_stance = predicted_labels
# Column-wise extraction replaces the row-by-row loop, which also left the
# global loop index `i` modified.
bodyid = test_df['Body ID'].tolist()
headline = test_df['Headline'].tolist()
df_submit = pd.DataFrame(data={'Headline': headline, 'Body ID': bodyid, "Stance": pred_stance})
df_submit.to_csv('answer_BaseLSTM.csv', index=False, encoding='utf-8')

In [50]:
# from keras.models import save_model
# filepath = './lstm_base_model'
# save_model(model, filepath)

In [51]:
# Score the submission against the gold stances using the helper from the
# `score` module. The call is left as the cell's last expression so its
# return value is shown as the cell output.
from score import report_score, score_submission
predicted = df_submit['Stance'].values
actual = test_df['Stance'].values
report_score(actual, predicted)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    256    |    22     |    272    |   1353    |
-------------------------------------------------------------
| disagree  |    52     |    12     |    85     |    548    |
-------------------------------------------------------------
|  discuss  |    281    |    28     |   1417    |   2738    |
-------------------------------------------------------------
| unrelated |   1236    |    64     |   3231    |   13818   |
-------------------------------------------------------------
Score: 5324.5 out of 11651.25	(45.6989593391267%)


45.6989593391267

In [52]:
# Same as the baseline, with the LSTM wrapped in a Bidirectional layer.
model = Sequential([
    Embedding(
        input_dim=len(tokenizer.word_index)+1,
        output_dim=50,
        weights=[weight_matrix],
        trainable=True,
        mask_zero=True,
    ),
    Bidirectional(LSTM(120, return_sequences=False)),
    Dropout(rate=0.3),
    Dense(4, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [53]:
# Train the bidirectional LSTM, validating on the held-out split after every epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=20,
    validation_data=(X_val, y_val),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [54]:
# Class index -> stance name, exactly as the previous if-chains mapped them.
# NOTE(review): this must mirror how the `Stance` column is encoded for
# training (LabelEncoder numbers classes in sorted order of the raw values) —
# confirm against train.csv.
STANCE_BY_INDEX = {0: "unrelated", 1: "agree", 2: "disagree", 3: "discuss"}

pred = model.predict(X_test)
# argmax over the class axis for the whole batch at once; indices outside the
# table are left unchanged, as before.
predicted_labels = [STANCE_BY_INDEX.get(k, k) for k in np.argmax(pred, axis=-1).tolist()]
True_labels = [STANCE_BY_INDEX.get(k, k) for k in np.argmax(y_test, axis=-1).tolist()]

# NOTE(review): assumes the rows of competition_test_stances.csv are in the
# same order as the rows X_test was built from — TODO confirm.
test_df = pd.read_csv('competition_test_stances.csv')
pred_stance = predicted_labels
# Column-wise extraction replaces the row-by-row loop, which also left the
# global loop index `i` modified.
bodyid = test_df['Body ID'].tolist()
headline = test_df['Headline'].tolist()
df_submit = pd.DataFrame(data={'Headline': headline, 'Body ID': bodyid, "Stance": pred_stance})
df_submit.to_csv('answer_BiLSTM.csv', index=False, encoding='utf-8')

In [55]:
# Score the BiLSTM submission against the gold stances; the call is the cell's
# last expression, so its return value is shown as the cell output.
predicted = df_submit['Stance'].values
actual = test_df['Stance'].values
report_score(actual, predicted)
# Optional model persistence, currently disabled:
# filepath = './lstm_bidirectional_model'
# save_model(model, filepath)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    517    |    16     |    389    |    981    |
-------------------------------------------------------------
| disagree  |    139    |    20     |    135    |    403    |
-------------------------------------------------------------
|  discuss  |    574    |    43     |   1883    |   1964    |
-------------------------------------------------------------
| unrelated |   1086    |    56     |   1788    |   15419   |
-------------------------------------------------------------
Score: 6598.75 out of 11651.25	(56.635554125093876%)


56.635554125093876

In [56]:
# 1-D CNN over the 50-d GloVe embeddings: three conv/max-pool stages,
# then a dense head with dropout and a 4-way softmax.
model = Sequential([
    Embedding(
        input_dim=len(tokenizer.word_index)+1,
        output_dim=50,
        weights=[weight_matrix],
        trainable=True,
        input_length=128,
    ),
    Conv1D(512, 3, padding='same', activation='relu'),
    MaxPooling1D(),
    Conv1D(128, 3, padding='same', activation='relu'),
    MaxPooling1D(),
    Conv1D(64, 3, padding='same', activation='relu'),
    MaxPooling1D(),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(rate=0.25),
    Dense(4, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [57]:
# Train the GloVe CNN, validating on the held-out split after every epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=20,
    validation_data=(X_val, y_val),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [58]:
# Class index -> stance name, exactly as the previous if-chains mapped them.
# NOTE(review): this must mirror how the `Stance` column is encoded for
# training (LabelEncoder numbers classes in sorted order of the raw values) —
# confirm against train.csv.
STANCE_BY_INDEX = {0: "unrelated", 1: "agree", 2: "disagree", 3: "discuss"}

pred = model.predict(X_test)
# argmax over the class axis for the whole batch at once; indices outside the
# table are left unchanged, as before.
predicted_labels = [STANCE_BY_INDEX.get(k, k) for k in np.argmax(pred, axis=-1).tolist()]
True_labels = [STANCE_BY_INDEX.get(k, k) for k in np.argmax(y_test, axis=-1).tolist()]

# NOTE(review): assumes the rows of competition_test_stances.csv are in the
# same order as the rows X_test was built from — TODO confirm.
test_df = pd.read_csv('competition_test_stances.csv')
pred_stance = predicted_labels
# Column-wise extraction replaces the row-by-row loop, which also left the
# global loop index `i` modified.
bodyid = test_df['Body ID'].tolist()
headline = test_df['Headline'].tolist()
df_submit = pd.DataFrame(data={'Headline': headline, 'Body ID': bodyid, "Stance": pred_stance})
df_submit.to_csv('answer_cnn.csv', index=False, encoding='utf-8')

In [59]:
# Score the CNN submission against the gold stances; the call is the cell's
# last expression, so its return value is shown as the cell output.
predicted = df_submit['Stance'].values
actual = test_df['Stance'].values
report_score(actual, predicted)
# Optional model persistence, currently disabled:
# filepath = './cnn_model'
# save_model(model, filepath)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    416    |    44     |    496    |    947    |
-------------------------------------------------------------
| disagree  |    149    |    10     |    130    |    408    |
-------------------------------------------------------------
|  discuss  |    588    |    38     |   2099    |   1739    |
-------------------------------------------------------------
| unrelated |   1144    |    127    |   2181    |   14897   |
-------------------------------------------------------------
Score: 6610.5 out of 11651.25	(56.73640167364017%)


56.73640167364017

In [60]:
# Load the 300-d fastText wiki-news vectors and rebuild the embedding matrix:
# rows start as small uniform noise and are overwritten with the pretrained
# vector whenever the token exists in `word_embeddings`.
word_embeddings = gensim.models.KeyedVectors.load_word2vec_format('data/wiki-news-vec/wiki-news-300d-1M.vec', binary=False)
weight_matrix = np.random.uniform(-0.05, 0.05, size=(len(tokenizer.word_index)+1, 300))
# The loop variable used to be `i`, which overwrote the notebook-level dataset
# selector `i`; the lookup used try/except KeyError as control flow and is now
# a plain membership test.
for token, token_index in tokenizer.word_index.items():
    if token in word_embeddings:
        weight_matrix[token_index] = word_embeddings[token]
# Release the loaded vectors; only weight_matrix is needed from here on.
del word_embeddings

In [61]:
# 1-D CNN over the 300-d fastText embeddings: three conv/max-pool stages,
# then a dense head with dropout and a 4-way softmax.
model = Sequential([
    Embedding(
        input_dim=len(tokenizer.word_index)+1,
        output_dim=300,
        weights=[weight_matrix],
        trainable=True,
        input_length=128,
    ),
    Conv1D(256, 3, padding='same', activation='relu'),
    MaxPooling1D(),
    Conv1D(128, 3, padding='same', activation='relu'),
    MaxPooling1D(),
    Conv1D(64, 3, padding='same', activation='relu'),
    MaxPooling1D(),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(rate=0.25),
    Dense(4, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [62]:
# Train the fastText CNN, validating on the held-out split after every epoch.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=20,
    validation_data=(X_val, y_val),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [63]:
# Class index -> stance name, exactly as the previous if-chains mapped them.
# NOTE(review): this must mirror how the `Stance` column is encoded for
# training (LabelEncoder numbers classes in sorted order of the raw values) —
# confirm against train.csv.
STANCE_BY_INDEX = {0: "unrelated", 1: "agree", 2: "disagree", 3: "discuss"}

pred = model.predict(X_test)
# argmax over the class axis for the whole batch at once; indices outside the
# table are left unchanged, as before.
predicted_labels = [STANCE_BY_INDEX.get(k, k) for k in np.argmax(pred, axis=-1).tolist()]
True_labels = [STANCE_BY_INDEX.get(k, k) for k in np.argmax(y_test, axis=-1).tolist()]

# NOTE(review): assumes the rows of competition_test_stances.csv are in the
# same order as the rows X_test was built from — TODO confirm.
test_df = pd.read_csv('competition_test_stances.csv')
pred_stance = predicted_labels
# Column-wise extraction replaces the row-by-row loop, which also left the
# global loop index `i` modified.
bodyid = test_df['Body ID'].tolist()
headline = test_df['Headline'].tolist()
df_submit = pd.DataFrame(data={'Headline': headline, 'Body ID': bodyid, "Stance": pred_stance})
df_submit.to_csv('answer_cnn_fastrack.csv', index=False, encoding='utf-8')

In [64]:
# Score the fastText-CNN submission against the gold stances; the call is the
# cell's last expression, so its return value is shown as the cell output.
predicted = df_submit['Stance'].values
actual = test_df['Stance'].values
report_score(actual, predicted)
# Optional model persistence, currently disabled:
# filepath = './cnn_fastext_model'
# save_model(model, filepath)

-------------------------------------------------------------
|           |   agree   | disagree  |  discuss  | unrelated |
-------------------------------------------------------------
|   agree   |    549    |    20     |    457    |    877    |
-------------------------------------------------------------
| disagree  |    213    |    11     |    117    |    356    |
-------------------------------------------------------------
|  discuss  |    789    |    32     |   1722    |   1921    |
-------------------------------------------------------------
| unrelated |   1739    |    115    |   2358    |   14137   |
-------------------------------------------------------------
Score: 6223.25 out of 11651.25	(53.41272395665701%)


53.41272395665701