# CBOW

In [None]:
# Mount Google Drive so files under /content/drive/MyDrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports

In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras, nltk
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Model

In [None]:
import pickle
from tqdm import tqdm


## Data Read

In [None]:
# Load the questions dataset; later cells read its question1, question2 and is_duplicate columns.
df = pd.read_csv('/content/drive/MyDrive/SMAI_QUORA/questions.csv')

In [None]:
# Change the working directory to the project folder so the relative paths used
# by later cells (GloVe archive, cached .npy files) resolve inside it.
%cd '/content/drive/MyDrive/SMAI_QUORA'


/content/drive/MyDrive/SMAI_QUORA


## Getting Glove

In [None]:
# -nc (no-clobber) skips the ~2 GB download when the archive is already present,
# so re-running the notebook does not fetch it again.
!wget -nc http://nlp.stanford.edu/data/glove.840B.300d.zip

--2021-04-28 15:14:27--  http://nlp.stanford.edu/data/glove.840B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.840B.300d.zip [following]
--2021-04-28 15:14:27--  https://nlp.stanford.edu/data/glove.840B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip [following]
--2021-04-28 15:14:28--  http://downloads.cs.stanford.edu/nlp/data/glove.840B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2176768927 (2.0G) [application/zip

In [None]:
# -n never overwrites an existing file, so unzip does not stop at an interactive
# "replace?" prompt when glove.840B.300d.txt has already been extracted.
!unzip -n glove.840B.300d.zip

Archive:  glove.840B.300d.zip
replace glove.840B.300d.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: glove.840B.300d.txt     

y



## Embedding Vector Using Glove
- Build `embedding_vector` as a dictionary that maps each word to its GloVe vector, and save it to disk for future use.

In [None]:

from tqdm import tqdm
def get_embedding_vector(file_name):
  """Parse a GloVe-style text file into a ``{word: vector}`` dictionary.

  Every usable line holds a token followed by its space-separated vector
  components. Lines without any vector component (for example a trailing
  blank line) are skipped instead of producing an entry with an empty vector.

  Args:
    file_name: Path of the embedding text file.

  Returns:
    dict mapping each token to a 1-D ``float32`` numpy array.
  """
  embedding_vector = {}

  # Decode explicitly as UTF-8: the platform default encoding is not guaranteed
  # to be UTF-8, and embedding vocabularies may contain non-ASCII tokens.
  with open(file_name, encoding='utf-8') as f:
    for line in tqdm(f):
      # Split on single spaces only (not arbitrary whitespace) after removing
      # the line terminator.
      values = line.rstrip('\r\n').split(' ')
      if len(values) < 2:
        continue
      word = values[0]
      embedding_vector[word] = np.array(values[1:], dtype='float32')
  return embedding_vector

In [None]:
# Parse the extracted GloVe text file into the in-memory word -> vector dictionary.
embedding_vector = get_embedding_vector('glove.840B.300d.txt')

2196017it [05:41, 6430.88it/s] 


In [None]:
# Cache the dictionary on disk so later sessions can skip re-parsing the GloVe text file.
# NOTE(review): np.save on a dict presumably stores it as a pickled object array, which
# would explain why the matching np.load uses allow_pickle and .item() — confirm.
np.save('my_embedding_vector.npy', embedding_vector) 

In [None]:
# The cache holds a pickled dict wrapped in an object array, so pickle loading has
# to be enabled and .item() unwraps the dict. allow_pickle is a boolean flag: pass
# True, not a string that merely happens to be truthy.
# SECURITY: unpickling can execute arbitrary code; only load a cache file you created.
embedding_vector = np.load('my_embedding_vector.npy', allow_pickle=True).item()

## Cleaning Questions 
- Clean the `question1` and `question2` columns of the dataframe.
- Lower-case the text, remove English stopwords, normalise punctuation with regular expressions, and stem each word.

In [None]:
# Fetch the NLTK stopword corpus read by review_to_wordlist via stopwords.words("english").
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:

# Resources shared by every review_to_wordlist call; filled lazily on first use.
_REVIEW_CLEANING_CACHE = {}

# (pattern, replacement) pairs applied in this order to the stopword-filtered text.
_REVIEW_SUBSTITUTIONS = (
  (r"[^A-Za-z0-9(),!.?\'\`]", " "),  # blank out every character outside this set
  (r"\'s", " 's "),
  (r"\'ve", " 've "),
  # NOTE(review): this drops the leading "n" ("don't" -> "do 't"); " n't " may have
  # been intended. Left unchanged so previously computed features stay reproducible.
  (r"n\'t", " 't "),
  (r"\'re", " 're "),
  (r"\'d", " 'd "),
  (r"\'ll", " 'll "),
  (r",", " "),
  (r"\.", " "),
  (r"!", " "),
  (r"\(", " ( "),
  (r"\)", " ) "),
  (r"\?", " "),
  (r"\s{2,}", " "),  # collapse runs of whitespace
)


def _cached_english_stopwords():
  """Return the English stopword set, reading it from NLTK only once."""
  if 'stops' not in _REVIEW_CLEANING_CACHE:
    _REVIEW_CLEANING_CACHE['stops'] = set(stopwords.words("english"))
  return _REVIEW_CLEANING_CACHE['stops']


def _cached_english_stemmer():
  """Return a shared English Snowball stemmer, constructing it only once."""
  if 'stemmer' not in _REVIEW_CLEANING_CACHE:
    _REVIEW_CLEANING_CACHE['stemmer'] = SnowballStemmer('english')
  return _REVIEW_CLEANING_CACHE['stemmer']


def review_to_wordlist(review, remove_stopwords=True):
  """Normalise one question into a single string of stemmed tokens.

  The text is lower-cased and split on whitespace, English stopwords are
  optionally removed, punctuation is normalised with the regular expressions
  in ``_REVIEW_SUBSTITUTIONS``, and every remaining token is stemmed.

  Args:
    review: Question text. Non-string values are converted with ``str()``
      first, so a missing value is cleaned as its string representation.
    remove_stopwords: When true, drop tokens found in NLTK's English
      stopword list. Filtering happens before punctuation is stripped, so a
      stopword with punctuation attached (e.g. "what?") is not removed.

  Returns:
    The stemmed tokens joined by single spaces (a string, not a list).
  """
  words = str(review).lower().split()
  if remove_stopwords:
    stops = _cached_english_stopwords()
    words = [w for w in words if w not in stops]

  review_text = " ".join(words)
  for pattern, replacement in _REVIEW_SUBSTITUTIONS:
    review_text = re.sub(pattern, replacement, review_text)

  # Shorten words to their stems.
  stemmer = _cached_english_stemmer()
  return " ".join(stemmer.stem(word) for word in review_text.split())

def process_questions(question_list, questions, question_list_name):
  """Clean every question, append it to ``question_list`` and print progress.

  Progress is measured against ``len(questions)`` and a local counter, so the
  function does not depend on the global dataframe or on ``question_list``
  being empty when it is called.

  Args:
    question_list: List extended in place with one cleaned string per question.
    questions: Sized collection of raw questions (e.g. a dataframe column).
    question_list_name: Label used in the progress messages.
  """
  total = len(questions)
  for done, question in enumerate(questions, start=1):
    question_list.append(review_to_wordlist(question))
    # Report after every 100000 processed questions.
    if done % 100000 == 0:
      progress = done / total * 100
      print("{} is {}% complete.".format(question_list_name, round(progress, 1)))

  print("{} is {}% complete.".format(question_list_name, 100))

In [None]:
# Clean every question1 entry; the cleaned strings are collected in questions1.
questions1 = []     
process_questions(questions1, df.question1, "questions1")


questions1 is 24.7% complete.
questions1 is 49.5% complete.
questions1 is 74.2% complete.
questions1 is 98.9% complete.
questions1 is 100% complete.


In [None]:
# Clean every question2 entry; the cleaned strings are collected in questions2.
questions2 = []     
process_questions(questions2, df.question2, "questions2")

questions2 is 24.7% complete.
questions2 is 49.5% complete.
questions2 is 74.2% complete.
questions2 is 98.9% complete.
questions2 is 100% complete.


## Getting feature matrix of pre-processed `question1` and `question2`   

In [None]:
def get_qfeature_matrix(questions):
  """Build one 300-dimensional row per question by summing its word embeddings.

  Each question is tokenised with ``nltk.word_tokenize`` and the vectors of
  its tokens are looked up in the global ``embedding_vector`` dictionary.
  Tokens without an entry contribute nothing, so a question with no known
  token yields an all-zero row.

  Args:
    questions: Iterable of pre-processed question strings.

  Returns:
    ``float64`` array of shape ``(number of questions, 300)``; each row is
    accumulated in ``float32`` before being stored.
  """
  questions = list(questions)
  que_feats = np.zeros((len(questions), 300))
  # Tokenise one question at a time instead of materialising every token list first.
  for i, question in enumerate(questions):
    temp_matrix = np.zeros((1, 300), dtype='float32')
    for word in nltk.word_tokenize(question):
      embed_value = embedding_vector.get(word)
      if embed_value is not None:
        temp_matrix = np.add(temp_matrix, embed_value.reshape(1, 300), dtype='float32')
    if i % 20000 == 0:
      # Report the number of questions handled so far (the index itself, not 20000 * index).
      print("{} done".format(i))

    que_feats[i] = temp_matrix

  return que_feats


In [None]:
# Fetch the Punkt tokenizer data — presumably required by nltk.word_tokenize,
# which get_qfeature_matrix calls (confirm against the NLTK docs).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Summed-embedding feature matrix for the cleaned question1 texts.
que1_feats = get_qfeature_matrix(questions1)

0 done
400000000 done
800000000 done
1200000000 done
1600000000 done
2000000000 done
2400000000 done
2800000000 done
3200000000 done
3600000000 done
4000000000 done
4400000000 done
4800000000 done
5200000000 done
5600000000 done
6000000000 done
6400000000 done
6800000000 done
7200000000 done
7600000000 done
8000000000 done


In [None]:
# Summed-embedding feature matrix for the cleaned question2 texts.
que2_feats = get_qfeature_matrix(questions2)

0 done
400000000 done
800000000 done
1200000000 done
1600000000 done
2000000000 done
2400000000 done
2800000000 done
3200000000 done
3600000000 done
4000000000 done
4400000000 done
4800000000 done
5200000000 done
5600000000 done
6000000000 done
6400000000 done
6800000000 done
7200000000 done
7600000000 done
8000000000 done


In [None]:
# The cleaned text lists are no longer needed once the feature matrices exist.
del questions1, questions2

In [None]:
# Cache both per-question feature matrices on disk so they can be reloaded without recomputing.
np.save('my_que1_feats.npy', que1_feats) 
np.save('my_que2_feats.npy', que2_feats) 

In [None]:
# These caches hold plain numeric arrays, so they load with numpy's default
# (pickle disabled); enabling pickle here would only add an unnecessary risk.
que1_feats = np.load('my_que1_feats.npy')
que2_feats = np.load('my_que2_feats.npy')


## Concatenated Matrices
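- Create the difference (`que1_feats - que2_feats`) and Hadamard (element-wise product) feature matrices as per the paper, then concatenate them with the two question matrices.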

In [None]:
# Element-wise difference and element-wise (Hadamard) product of the two question matrices.
diff_feats = que1_feats - que2_feats
hadamard_feats = que1_feats * que2_feats

In [None]:
# Concatenate the four 300-column blocks side by side: q1 | q2 | q1 - q2 | q1 * q2.
features = np.hstack((que1_feats, que2_feats, diff_feats, hadamard_feats))

In [None]:
# Drop the intermediate matrices; everything needed is now inside `features`.
del que1_feats, que2_feats, diff_feats, hadamard_feats

In [None]:
# Cache the concatenated feature matrix on disk for later sessions.
np.save('my_features.npy', features) 

In [None]:
# The cached feature matrix is a plain numeric array, so numpy's default
# (pickle disabled) is sufficient and safer.
features = np.load('my_features.npy')

## MODEL

#### Splitting into Training and Testing

In [None]:
# Target labels come from the is_duplicate column of the dataset.
labels = df["is_duplicate"]


In [None]:
# Labels as a column array of shape (n, 1), aligned row-for-row with `features`.
Y = np.array(labels).reshape(-1, 1)

In [None]:
# Imported in this cell because it runs before the notebook's later import cell;
# without it a fresh kernel fails here with NameError on train_test_split.
from sklearn.model_selection import train_test_split

# Shuffled 70/30 train/test split with a fixed seed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(features, Y, train_size = 0.7,random_state = 42, shuffle = True)

In [None]:
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras.layers import Dense
from keras.layers import Flatten
from keras.optimizers import SGD
import time
from keras.layers import BatchNormalization
import time

### Model1 using SGD optimizer
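- This model has 3 hidden dense layers (200, 100 and 50 units), each followed by ReLU and 10% dropout, plus a 2-unit softmax output layer. It is trained with SGD (learning rate 0.001, momentum 0.9).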

In [None]:
def get_model():
  """Build and compile the feed-forward classifier trained with SGD.

  Architecture: 1200 input features -> Dense(200) -> Dense(100) -> Dense(50),
  each followed by ReLU and 10% dropout, then a 2-unit softmax output.

  Returns:
    A compiled Sequential model using SGD (learning rate 0.001, momentum 0.9),
    sparse categorical cross-entropy loss and the accuracy metric.
  """
  model = Sequential()
  model.add(Dense(200,input_dim=1200))
  model.add(Activation('relu'))
  model.add(Dropout(0.1))
  model.add(Dense(100))
  model.add(Activation('relu'))
  model.add(Dropout(0.1))
  model.add(Dense(50))
  model.add(Activation('relu'))
  model.add(Dropout(0.1))
  model.add(Dense(2,activation='softmax'))
  # `learning_rate` is the supported argument name; `lr` is only a deprecated alias.
  opt = SGD(learning_rate=0.001, momentum=0.9)

  model.compile(optimizer=opt,loss='sparse_categorical_crossentropy',metrics=['accuracy'])
  return model

In [None]:

def task1_run():
  """Train a fresh Model1 on the training split and print the elapsed wall-clock time.

  Returns:
    (model, history): the fitted model and the object returned by ``fit``.
  """
  model = get_model()
  started_at = time.time()
  fit_history = model.fit(x_train, y_train, epochs=50, batch_size=2000, verbose=1)
  elapsed = time.time() - started_at
  print("Model took %0.2f seconds to train"%(elapsed))

  return model, fit_history

task1model, task1history = task1_run()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model took 504.50 seconds to train


In [None]:
def task1_evaluate(model):
  """Return the model's accuracy on the test split, expressed as a percentage."""
  # evaluate() yields (loss, accuracy) for a model compiled with one metric.
  _loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
  return accuracy * 100

print("Accuracy: %0.2f"%task1_evaluate(task1model)+"%")


Accuracy: 75.24%


### Model2 using adam optimizer
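- This model has 3 hidden dense layers (200, 100 and 50 units), each followed by ReLU and 10% dropout, plus a 2-unit softmax output layer. It is trained with the Adam optimizer.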

In [None]:
def get_model2():
  """Build and compile the feed-forward classifier trained with Adam.

  Architecture: 1200 input features -> Dense(200) -> Dense(100) -> Dense(50),
  each followed by ReLU and 10% dropout, then a 2-unit softmax output.

  Returns:
    A compiled Sequential model using the 'adam' optimizer, sparse categorical
    cross-entropy loss and the accuracy metric.
  """
  layer_stack = [
    Dense(200, input_dim=1200),
    Activation('relu'),
    Dropout(0.1),
    Dense(100),
    Activation('relu'),
    Dropout(0.1),
    Dense(50),
    Activation('relu'),
    Dropout(0.1),
    Dense(2, activation='softmax'),
  ]
  model = Sequential(layer_stack)
  model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
  )
  return model

In [None]:
def task2_run():
  """Train a fresh Model2 on the training split and print the elapsed wall-clock time.

  Returns:
    (model, history): the fitted model and the object returned by ``fit``.
  """
  model = get_model2()
  started_at = time.time()
  fit_history = model.fit(x_train, y_train, epochs=50, batch_size=2000, verbose=1)
  elapsed = time.time() - started_at
  print("Model took %0.2f seconds to train"%(elapsed))

  return model, fit_history

task2model, task2history = task2_run()

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Model took 514.92 seconds to train


#### Highest Accuracy(Model2)

In [None]:
def task2_evaluate(model):
  """Return the model's accuracy on the test split, expressed as a percentage."""
  # evaluate() yields (loss, accuracy) for a model compiled with one metric.
  _loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
  return accuracy * 100

print("Accuracy: %0.2f"%task2_evaluate(task2model)+"%")

Accuracy: 79.85%


#### F1 Score(Model2)

In [None]:
# Softmax outputs of Model2 for the test split: one row per sample, one column per class.
pred2 = task2model.predict(x_test,batch_size=200,verbose=1)
pred2



array([[9.7753239e-01, 2.2467602e-02],
       [9.8132312e-01, 1.8676842e-02],
       [1.0000000e+00, 5.2981446e-11],
       ...,
       [5.5673052e-02, 9.4432694e-01],
       [1.9539918e-01, 8.0460083e-01],
       [7.2344966e-02, 9.2765498e-01]], dtype=float32)

In [None]:
from sklearn.metrics import accuracy_score, f1_score
def get_ypred(pred):
  """Turn two-column class scores into hard 0/1 labels.

  Args:
    pred: Array-like of shape ``(n, 2)`` holding one score per class.

  Returns:
    Integer array of shape ``(n, 1)``: 0 where column 0 is strictly greater
    than column 1, otherwise 1 (a tie therefore maps to 1).
  """
  scores = np.asarray(pred)
  if scores.size == 0:
    # Nothing to classify; keep the (n, 1) column layout.
    return np.zeros((0, 1), dtype=int)
  # Vectorised form of "0 if row[0] > row[1] else 1" applied to every row.
  return np.where(scores[:, 0] > scores[:, 1], 0, 1).reshape(-1, 1)

# Hard 0/1 class labels derived from Model2's predicted scores.
y_pred2 = get_ypred(pred2)


In [None]:
# F1 of Model2 on the test split, printed as a percentage. average='weighted' asks
# scikit-learn for the per-class F1 scores averaged with class-support weights.
F1_score = f1_score(y_test,y_pred2,average='weighted')
print("F1 Score: ",F1_score*100)

F1 Score:  79.8974611727898
