In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from collections import Counter
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import keras
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras import Input
from keras.layers import Embedding,LSTM, concatenate, Dense
from keras.models import Model
from keras.utils.vis_utils import plot_model
#from keras.utils import plot_model
import matplotlib.pyplot as plt
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from tensorflow.keras.optimizers import Adam
from textblob import TextBlob
import cufflinks as cf
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [2]:
nltk.download("stopwords") #words that we don't help us understand the semantic of the sentence

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
train = pd.read_csv("/content/train.csv")

In [4]:
# Shape of dataset
print("\n Shape of the train DataSet")
print(train.head())
print(train.shape)
print("\n Label", Counter(train['label']))


 Shape of the train DataSet
       id  tid1  tid2                                          title1_en  \
0  195611     0     1  There are two new old-age insurance benefits f...   
1  191474     2     3  "If you do not come to Shenzhen, sooner or lat...   
2   25300     2     4  "If you do not come to Shenzhen, sooner or lat...   
3  123757     2     8  "If you do not come to Shenzhen, sooner or lat...   
4  141761     2    11  "If you do not come to Shenzhen, sooner or lat...   

                                           title2_en      label  
0  Police disprove "bird's nest congress each per...  unrelated  
1  Shenzhen's GDP outstrips Hong Kong? Shenzhen S...  unrelated  
2  The GDP overtopped Hong Kong? Shenzhen clarifi...  unrelated  
3  Shenzhen's GDP overtakes Hong Kong? Bureau of ...  unrelated  
4  Shenzhen's GDP outpaces Hong Kong? Defending R...  unrelated  
(256442, 6)

 Label Counter({'unrelated': 175598, 'agreed': 74238, 'disagreed': 6606})


In [5]:
print("\nNull values", train.isnull().sum())
train=train.dropna()
print("\nAfter dropping null values")
print(train.isnull().sum())
print("\nFinal train data head after null value:")


Null values id           0
tid1         0
tid2         0
title1_en    0
title2_en    0
label        0
dtype: int64

After dropping null values
id           0
tid1         0
tid2         0
title1_en    0
title2_en    0
label        0
dtype: int64

Final train data head after null value:


In [6]:
# incase of any null data it resets its index
train = train.copy()
train.reset_index(inplace=True)
train

Unnamed: 0,index,id,tid1,tid2,title1_en,title2_en,label
0,0,195611,0,1,There are two new old-age insurance benefits f...,"Police disprove ""bird's nest congress each per...",unrelated
1,1,191474,2,3,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,unrelated
2,2,25300,2,4,"""If you do not come to Shenzhen, sooner or lat...",The GDP overtopped Hong Kong? Shenzhen clarifi...,unrelated
3,3,123757,2,8,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP overtakes Hong Kong? Bureau of ...,unrelated
4,4,141761,2,11,"""If you do not come to Shenzhen, sooner or lat...",Shenzhen's GDP outpaces Hong Kong? Defending R...,unrelated
...,...,...,...,...,...,...,...
256437,256437,113364,167562,48447,egypt 's presidential election failed to win m...,Salah is retiring? Football Association offici...,unrelated
256438,256438,49407,167562,49795,egypt 's presidential election failed to win m...,Liverpool's bid for Little Germany? The Echo's...,unrelated
256439,256439,130134,167562,114783,egypt 's presidential election failed to win m...,West Media Exposing Tallahlach has been recomm...,unrelated
256440,256440,101494,167562,137705,egypt 's presidential election failed to win m...,Rumor has it that Egypt is very united and the...,unrelated


In [7]:
ps = PorterStemmer() #Stemming words with NLTK
corpus1 = []
for i in range(0, len(train)):
    review = re.sub('[^a-zA-Z]', ' ', train['title1_en'][i]) #function used to replace occurrences of a specific sub-string with a different sub-string.
    review = review.lower() # convert to lower case
    review = review.split() # split the sentence

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')] # checking if each word in the sentence belongs to the list of stopwords or not
    review = ' '.join(review) # join the words without the stopwords
    corpus1.append(review) # add back the sentence in the list without the stopwords

corpus2 = []
for i in range(0, len(train)):
    review = re.sub('[^a-zA-Z]', ' ', train['title2_en'][i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus2.append(review)

In [9]:
print(corpus1[83])
print(corpus2[83])

special treatment cervic spondylosi use tradit chines medicin sparkl wine
special treatment cervic spine protrud tradit chines herbal medicin


In [10]:
MAX_NUM_WORDS = 10000
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS) #vectorizing a text corpus

In [11]:
train['title1_en_tokenized'] = corpus1
train[['title1_en', 'title1_en_tokenized']].head()

Unnamed: 0,title1_en,title1_en_tokenized
0,There are two new old-age insurance benefits f...,two new old age insur benefit old peopl rural ...
1,"""If you do not come to Shenzhen, sooner or lat...",come shenzhen sooner later son also come less ...
2,"""If you do not come to Shenzhen, sooner or lat...",come shenzhen sooner later son also come less ...
3,"""If you do not come to Shenzhen, sooner or lat...",come shenzhen sooner later son also come less ...
4,"""If you do not come to Shenzhen, sooner or lat...",come shenzhen sooner later son also come less ...


In [12]:
train['title2_en_tokenized'] = corpus2
train[['title2_en', 'title2_en_tokenized']].head()

Unnamed: 0,title2_en,title2_en_tokenized
0,"Police disprove ""bird's nest congress each per...",polic disprov bird nest congress person get yu...
1,Shenzhen's GDP outstrips Hong Kong? Shenzhen S...,shenzhen gdp outstrip hong kong shenzhen stati...
2,The GDP overtopped Hong Kong? Shenzhen clarifi...,gdp overtop hong kong shenzhen clarifi littl bit
3,Shenzhen's GDP overtakes Hong Kong? Bureau of ...,shenzhen gdp overtak hong kong bureau statist ...
4,Shenzhen's GDP outpaces Hong Kong? Defending R...,shenzhen gdp outpac hong kong defend rumor gap...


In [13]:
corpus_x1 = train.title1_en_tokenized
corpus_x2 = train.title2_en_tokenized
corpus = pd.concat([corpus_x1, corpus_x2]) #Concatenating the two corpus in series
corpus.shape

(512884,)

In [14]:
pd.DataFrame(corpus.iloc[:5],columns=['title'])

Unnamed: 0,title
0,two new old age insur benefit old peopl rural ...
1,come shenzhen sooner later son also come less ...
2,come shenzhen sooner later son also come less ...
3,come shenzhen sooner later son also come less ...
4,come shenzhen sooner later son also come less ...


In [15]:
corpus.isna().any() # detecting any missing values

False

In [16]:
tokenizer.fit_on_texts(corpus) #update the internal vocabulary 
x1_train = tokenizer.texts_to_sequences(corpus_x1) #converting tokens of text corpus into a sequence of integers.
x2_train = tokenizer.texts_to_sequences(corpus_x2) #converting tokens of text corpus into a sequence of integers.

In [17]:
len(x1_train)

256442

In [18]:
for seq in x1_train[:1]:
    print([tokenizer.index_word[idx] for idx in seq])

['two', 'new', 'old', 'age', 'insur', 'benefit', 'old', 'peopl', 'rural', 'area', 'got']


In [19]:
# Zero Padding
Max_Seq_Length = 20 #ensure that all sequences in a list have the same length
x1_train = keras.preprocessing.sequence.pad_sequences(x1_train, maxlen=Max_Seq_Length) 
x2_train = keras.preprocessing.sequence.pad_sequences(x2_train, maxlen=Max_Seq_Length) 

In [20]:
label_to_index = {'unrelated': 0,'agreed': 1,'disagreed': 2} #Label Mapping
y_train = train.label.apply(lambda x: label_to_index[x]) # Small function w/ single arg.
y_train = np.asarray(y_train).astype('float32') # Creating an array then converting to float type.
y_train[:5] # Sample from above created array

array([0., 0., 0., 0., 0.], dtype=float32)

In [21]:
y_train = keras.utils.np_utils.to_categorical(y_train)
y_train[:5]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]], dtype=float32)

In [22]:
x1_train, x1_test, x2_train, x2_test, y_train, y_test = train_test_split(x1_train, x2_train, y_train, test_size=0.2, random_state=42) 

In [35]:
Max_Words = 10000
Max_Seq_Length = 256
Embed_Dim = 128
Lstm_Units = 128

In [36]:
InputA = keras.layers.Input(shape=(Max_Seq_Length, ), dtype='int32')
InputB = keras.layers.Input(shape=(Max_Seq_Length, ), dtype='int32')
embedding_layer = keras.layers.Embedding(Max_Words, Embed_Dim)
EmbedA = embedding_layer(InputA)
EmbedB = embedding_layer(InputB)
LstmLayer = keras.layers.LSTM(Lstm_Units)
OutputA = LstmLayer(EmbedA)
OutputB = LstmLayer(EmbedB)
Concat_Layer = keras.layers.concatenate([OutputA, OutputB], axis=-1) # Combine the output of the two branches
DenseLayer = keras.layers.Dense(units=3, activation='softmax')
Pred_Data = DenseLayer(Concat_Layer)
model = Model(inputs=[InputA, InputB], outputs=Pred_Data)
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 256)]        0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 256, 128)     1280000     ['input_3[0][0]',                
                                                                  'input_4[0][0]']                
                                                                                                  
 lstm_1 (LSTM)                  (None, 128)          131584      ['embedding_1[0][0]',      

In [25]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [26]:
Batch_size = 128
Epoch = 10
history = model.fit(x=[x1_train, x2_train], y=y_train, batch_size=Batch_size, epochs=Epoch, validation_data=([x1_test, x2_test], y_test), shuffle=True)



In [37]:
test = pd.read_csv("/content/test.csv")
test.head()

Unnamed: 0,id,tid1,tid2,title1_en,title2_en
0,256442,100672,100673,"The great coat brother Zhu Zhu Wen, in the man...","Lin xinsheng after the birth of ""hard milking,..."
1,256443,162269,162270,NASA reveals facts about UFO wreckage found on...,"The UFO found in Yuancun, Jiaocheng County, Sh..."
2,256444,157826,157854,The hollow tomatoes are loaded with hormones.,"Li chenfan bingbing home photos, netizen: this..."
3,256445,109579,74076,Ange Pavilion Geoshui: How accurate is Matrimo...,Master one: the eight-character presumption of...
4,256446,15068,15085,A 50-year-old bus-bus blows up an 8-year-old c...,< i > Joe Johnson's disgruntled timing and ord...


In [29]:
test_corpus1 = []
for i in range(0, len(test)):
    review = re.sub('[^a-zA-Z]', ' ', test['title2_en'][i]) #function used to replace occurrences of a specific sub-string with a different sub-string.
    review = review.lower() # convert to lower case
    review = review.split() # split the sentence
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')] # checking if each word in the sentence belongs to the list of stopwords or not
    review = ' '.join(review) # join the words without the stopwords
    test_corpus1.append(review) # add back the sentence in the list without the stopwords

test_corpus2 = []
for i in range(0, len(test)):
    review = re.sub('[^a-zA-Z]', ' ', test['title2_en'][i]) #function used to replace occurrences of a specific sub-string with a different sub-string.
    review = review.lower() # convert to lower case
    review = review.split() # split the sentence
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')] # checking if each word in the sentence belongs to the list of stopwords or not
    review = ' '.join(review) # join the words without the stopwords
    test_corpus2.append(review) # add back the sentence in the list without the stopwords

In [31]:
test['title1_en_tokenized'] = test_corpus1 
test['title2_en_tokenized'] = test_corpus2

In [32]:
x1_test = tokenizer.texts_to_sequences(test.title1_en_tokenized) #converting tokens of text corpus into a sequence of integers.
x2_test = tokenizer.texts_to_sequences(test.title2_en_tokenized) #converting tokens of text corpus into a sequence of integers.
x1_test = keras.preprocessing.sequence.pad_sequences(x1_test, maxlen=Max_Seq_Length)
x2_test = keras.preprocessing.sequence.pad_sequences(x2_test, maxlen=Max_Seq_Length)    
predictions = model.predict([x1_test, x2_test])

In [33]:
predictions[:5]

array([[5.6075466e-01, 4.3841669e-01, 8.2859857e-04],
       [4.8939593e-02, 9.5013058e-01, 9.2984171e-04],
       [5.8102459e-01, 4.1301647e-01, 5.9589385e-03],
       [3.1719151e-01, 6.8215716e-01, 6.5139792e-04],
       [9.6379369e-01, 2.4153480e-02, 1.2052707e-02]], dtype=float32)

In [34]:
index_to_label = {v: k for k, v in label_to_index.items()}
test['label'] = [index_to_label[idx] for idx in np.argmax(predictions, axis=1)]
submission = test.loc[:, ['id', 'label']]
submission.columns = ['id', 'label']
submission.to_csv('submission.csv', index=False) #Storing the results in submission.csv
