### Loading Libraries

In [1]:
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')


import sklearn
from sklearn import preprocessing

### NLP Libraries
import string
import re
from nltk.tokenize import  word_tokenize 
import nltk
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score, classification_report
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
stpwords = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from sklearn.model_selection import train_test_split
pd.options.display.max_colwidth = 500


### Importing TensorFlow Libraries

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, LSTM, Flatten, Dropout, Bidirectional, Multiply, Lambda
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam
import keras.backend as K

## Importing Word2Vec Embeddings

import gensim
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from gensim.models import KeyedVectors
filename = '/content/drive/My Drive/GoogleNews-vectors-negative300.bin'
w2v_dictionary = KeyedVectors.load_word2vec_format(filename, binary=True)

# If the GloVe file is on your Google Drive and you are running this notebook on Colab, uncomment and run the code below; otherwise ignore it.


### Importing Glove Embeddings
#path = '/content/drive/My Drive/glove.6B.200d.txt'
### Loading Glove vectors
  # glove_dictionary = {}
  # with open(path) as file:
  #   for each_line in file:
  #       words_in_line, coeff_cients = each_line.split(maxsplit=1)
  #       coeff_cients = np.array(coeff_cients.split(),dtype = float)
  #       glove_dictionary[words_in_line] = coeff_cients


Mounted at /content/drive


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [16]:
def clean_message(text):
    '''Clean and normalise a raw text string before tokenization.

    Steps, in order:
      1. drop apostrophes so contractions stay a single token ("don't" -> "dont")
      2. drop cashtags ($XYZ), "<br />" tags, a leading "RT ", @mentions,
         #hashtags and anything starting with "http"
      3. replace bracketed spans "[...]" with a space
      4. lowercase, then replace every character outside a-z / 0-9 with a space
      5. drop English stopwords (module-level `stpwords`)
      6. lemmatize each remaining word (module-level `lemmatizer`)

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        Cleaned words joined by single spaces (empty string if nothing is left).

    NOTE(review): lowercasing previously happened *after* the a-z/0-9 filter,
    which turned every uppercase letter into a space ("What" -> " hat").
    If the pickled tokenizer was fit on text cleaned with the old ordering,
    confirm/re-fit it so training and inference preprocessing agree.
    '''
    text = re.sub("'", "", text)                 # keep contractions as one token
    text = re.sub(r'\$\w*', '', text)            # cashtags
    text = re.sub(r'<br />', '', text)           # HTML line breaks
    text = re.sub(r'^RT[\s]+', '', text)         # retweet marker (case-sensitive, so before lower())
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)   # mentions
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)   # hashtags
    text = re.sub(r'http\S+', '', text)          # links
    text = re.sub(r'\[.*?\]', ' ', text)         # bracketed spans

    # Lowercase BEFORE filtering to a-z0-9, otherwise capitals are erased.
    text = text.lower()
    text = re.sub(r'[^a-z0-9]', ' ', text)

    # Only letters, digits and spaces remain, so a whitespace split is enough;
    # it also discards the empty tokens produced by runs of spaces.
    kept_words = [word for word in text.split() if word not in stpwords]
    return " ".join(lemmatizer.lemmatize(word) for word in kept_words)
        
def exponent_neg_manhattan_distance(left, right):
    '''Return exp(-L1 distance) between `left` and `right`.

    The absolute differences are summed over axis 1 (axis kept), so the
    result is 1 for identical inputs and decays towards 0 as they diverge.
    '''
    l1_distance = K.sum(K.abs(left - right), axis=1, keepdims=True)
    return K.exp(-l1_distance)

def exponent_neg_euclidean_distance(left, right):
    '''Return exp(-L2 distance) between `left` and `right`.

    Squared differences are summed over axis 1 (axis kept), square-rooted,
    negated and exponentiated, giving 1 for identical inputs and values
    approaching 0 as the inputs move apart.
    '''
    squared_sum = K.sum(K.square(left - right), axis=1, keepdims=True)
    l2_distance = K.sqrt(squared_sum)
    return K.exp(-l2_distance)


### Loading the Trained Model, Tokenizer and Embedding Matrix

In [3]:
import pickle
def load_pickle_file(path):
  """Load and return the object stored in the pickle file at `path`.

  Parameters
  ----------
  path : str or path-like
      Location of the pickle file; opened in binary read mode.

  Returns
  -------
  object
      Whatever object was pickled into the file.

  Raises
  ------
  OSError
      If the file cannot be opened (e.g. it does not exist).

  SECURITY: `pickle.load` can execute arbitrary code embedded in the file.
  Only call this on files you created yourself or otherwise trust.
  """
  # The `with` block closes the file on exit, so no explicit close() is needed.
  with open(path, 'rb') as fp:
    return pickle.load(fp)

# Load the three pickled artifacts from Google Drive via load_pickle_file.
# `pickle_path` is reused for each file, so after this cell it holds the
# embedding-matrix path.
# NOTE(review): presumably these files were written by the training notebook
# and the three must come from the same training run -- TODO confirm.
pickle_path = '/content/drive/My Drive/quora_question_pairs_w2v_lstm.pkl'
model = load_pickle_file(pickle_path)
pickle_path = '/content/drive/My Drive/quora_question_pairs_w2v_tokenizer.pkl'
tokenizer = load_pickle_file(pickle_path)
pickle_path = '/content/drive/My Drive/quora_question_pairs_w2v_embedding_matrix.pkl'
embedding_matrix = load_pickle_file(pickle_path)

Keras model archive loading:
File Name                                             Modified             Size
variables.h5                                   2023-03-28 02:16:48    545438344
config.json                                    2023-03-28 02:16:46         5101
metadata.json                                  2023-03-28 02:16:46           64




Keras weights file (<HDF5 file "variables.h5" (mode r)>) loading:
...layers
......embedding
.........vars
............0
......embedding_1
.........vars
............0
......input_layer
.........vars
......input_layer_1
.........vars
......lambda
.........vars
......lstm
.........cell
............vars
...............0
...............1
...............2
.........vars
......lstm_1
.........cell
............vars
...............0
...............1
...............2
.........vars
...metrics
......mean
.........vars
............0
............1
......mean_metric_wrapper
.........vars
............0
............1
......precision
.........vars
............0
............1
......recall
.........vars
............0
............1
...optimizer
......vars
.........0
.........1
.........10
.........11
.........12
.........13
.........14
.........15
.........16
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars


In [48]:
### Input Questions

# The pair of raw questions to compare; edit these two strings to try other inputs.
q1, q2 = (
    'What is the best/most memorable thing youve ever eaten and why?',
    'What is the most delicious dish youve ever eaten and why?',
)

In [49]:
### Cleaning Question 1 and Question 2
# Run both questions through clean_message; q1 and q2 are overwritten with
# their cleaned text.
q1, q2 = (clean_message(question) for question in (q1, q2))

In [43]:
### Tokenizing the questions
# texts_to_sequences expects a LIST of texts. Passing a bare string makes it
# iterate character by character and return one sequence per character, which
# later produces a batch of len(text) rows instead of a single row.
# Wrapping each question in a list yields exactly one sequence per question.
# (Assumes `tokenizer` is the Keras Tokenizer loaded from the pickle above.)
q1 = tokenizer.texts_to_sequences([q1])
q2 = tokenizer.texts_to_sequences([q2])


In [44]:
### Padding the input questions

# Pad both tokenized questions to 681 positions.
# NOTE(review): 681 is presumably the sequence length the model was trained
# with -- TODO confirm against the training notebook.
q1, q2 = [pad_sequences(sequences, maxlen = 681) for sequences in (q1, q2)]

In [None]:
### Predicting if the given questions are similar using the selected model

# Feed the two padded inputs to the loaded model.
preds = model.predict([q1,q2])

# Report the verdict: outputs above 0.5 are treated as "similar".
print('The questions are similar' if preds > 0.5 else "The questions are not similar")