<a href="https://colab.research.google.com/github/Atomnp/realtime_text_similarity_backend/blob/main/tfidf_word2vec_sif.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [146]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from scipy.sparse import coo_matrix, lil_matrix
import pandas as pd
import numpy as np
import itertools
from typing import List
import warnings
warnings.filterwarnings("ignore")

In [15]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
# mounting your google drive to colab
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


# Loading the data

**Make a shortcut to [this](https://drive.google.com/drive/folders/1BGr0cWKiJwT_jNg9nRNAhWgy0mYPgw_K?usp=sharing) folder in your Google Drive**

In [113]:
# load dataset
# Path of the question file on the Google Drive mounted at /gdrive by the drive.mount cell.
dataset = r'/gdrive/MyDrive/minor_project_files/filtered.txt'
# Read the file into a DataFrame without a header row; the default NA markers are disabled
# and only '_' is treated as a missing value. Column 0 is what the preprocessing cell consumes.
# NOTE(review): read_fwf is the fixed-width reader; presumably the intent is one question per row — confirm.
questions = pd.read_fwf(dataset, header=None, delimiter = "\n", keep_default_na=False, na_values=['_'])

In [124]:
# Tokens that are dropped outright: every ASCII punctuation character plus the `` and ''
# tokens (presumably the tokenizer's opening/closing quote marks).
stoplist = list(string.punctuation) + ["``", "''"]

def preprocess(text):
    """Tokenize `text` and return the Porter stems of its lower-cased tokens.

    Tokens whose lower-cased form is in `stoplist` and tokens made only of
    digits are left out. `text` is converted with str() first, so non-string
    values are accepted.
    """
    stemmer = nltk.porter.PorterStemmer()
    stems = []
    for token in word_tokenize(str(text)):
        lowered = token.lower()
        if lowered in stoplist or token.isdigit():
            continue
        stems.append(stemmer.stem(lowered))
    return stems

In [123]:
processed = questions[0].apply(preprocess).to_list()

# Word2Vec Model (Train, Save and Load)

In [125]:
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    """Training callback that prints a loss figure at the end of every epoch."""

    def __init__(self):
        # Index of the epoch that is about to finish next.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the loss for the epoch that just ended.

        For epoch 0 the value reported by the model is printed as is; for later
        epochs the difference from the previously stored reading is printed.
        """
        latest_loss = model.get_latest_training_loss()
        if self.epoch == 0:
            shown_loss = latest_loss
        else:
            shown_loss = latest_loss - self.loss_previous_step
        print('Loss after epoch {}: {}'.format(self.epoch, shown_loss))
        # Remember this reading so the next epoch can report its own delta.
        self.loss_previous_step = latest_loss
        self.epoch += 1

In [130]:
# uncomment if you want to retrain the word2vec model
# it_copy, sentences = itertools.tee(sentences)
# model = Word2Vec(sentences=processed, size=100, window=5, min_count=1, workers=4, compute_loss=True, iter=50, callbacks=[callback()])
# model.save("/gdrive/MyDrive/minor_project_files/word2vec_50_iter.model")

Loss after epoch 0: 1622462.125
Loss after epoch 1: 1312306.625
Loss after epoch 2: 1184831.0
Loss after epoch 3: 1031265.75
Loss after epoch 4: 992155.0
Loss after epoch 5: 1004180.5
Loss after epoch 6: 1014405.0
Loss after epoch 7: 857419.0
Loss after epoch 8: 793878.0
Loss after epoch 9: 818756.0
Loss after epoch 10: 784788.0
Loss after epoch 11: 814961.0
Loss after epoch 12: 820393.0
Loss after epoch 13: 781218.0
Loss after epoch 14: 797458.0
Loss after epoch 15: 754276.0
Loss after epoch 16: 799526.0
Loss after epoch 17: 768215.0
Loss after epoch 18: 641368.0
Loss after epoch 19: 647118.0
Loss after epoch 20: 635604.0
Loss after epoch 21: 624394.0
Loss after epoch 22: 615802.0
Loss after epoch 23: 662708.0
Loss after epoch 24: 625012.0
Loss after epoch 25: 665296.0
Loss after epoch 26: 618182.0
Loss after epoch 27: 614952.0
Loss after epoch 28: 597666.0
Loss after epoch 29: 609306.0
Loss after epoch 30: 615836.0
Loss after epoch 31: 605878.0
Loss after epoch 32: 618628.0
Loss afte

In [131]:
# load already saved word2vec model
model = Word2Vec.load("/gdrive/MyDrive/minor_project_files/word2vec_50_iter.model")

In [132]:
# Query neighbours through the KeyedVectors object (`model.wv`), as the `model.wv[word]`
# lookups elsewhere in this notebook do. Calling most_similar on the Word2Vec model itself
# is deprecated in gensim 3.x and no longer available in gensim 4.
model.wv.most_similar('china')

[('taiwan', 0.7596267461776733),
 ('pakistan', 0.7264149188995361),
 ('japan', 0.7235797643661499),
 ('russia', 0.7100479602813721),
 ('afghanistan', 0.677221417427063),
 ('malaysia', 0.6441991329193115),
 ('iran', 0.6398093104362488),
 ('franc', 0.6356956958770752),
 ('israel', 0.6324349641799927),
 ('mexico', 0.6236512064933777)]

# Find sentence Embeddings for each sentence in the dataset

1.   Look up the word vectors from the word2vec/GloVe model
2.   Weight each word vector by its smooth inverse frequency (SIF)
3.   Save the weighted average word vector as the sentence embedding



In [170]:
# todo: get a proper word frequency for a word in a document set
# or perhaps just a typical frequency for a word from Google's n-grams
def get_word_frequency(word_text):
    """Return the frequency assumed for `word_text`.

    Placeholder implementation: the argument is ignored and the same low
    frequency is returned for every word.
    """
    # Low occurring frequency - probably not unrealistic for most words, improves vector values.
    assumed_frequency = 0.0001
    return assumed_frequency

In [231]:
def weighted_average(sentence: List[str], embedding_size=100, a: float = 1e-3):
    """Return the SIF-weighted average of the word vectors of `sentence`.

    Every token is looked up in the global `model.wv`, scaled by
    a / (a + get_word_frequency(token)) and accumulated; the sum is then
    divided by the number of tokens in `sentence`.
    """
    total = np.zeros(embedding_size)
    for token in sentence:
        # Smooth inverse frequency (SIF) weight of this token.
        sif_weight = a / (a + get_word_frequency(token))
        total = total + sif_weight * model.wv[token]
    return total / len(sentence)

In [232]:
pca = PCA()

def sentences_to_vec(sentences: List[List[str]], embedding_size=100, a=1e-3):
    """Embed a list of tokenized sentences and fit the module-level `pca` on them.

    Args:
        sentences: tokenized sentences; every token is looked up by `weighted_average`.
        embedding_size: length of the word vectors, forwarded to `weighted_average`.
        a: SIF smoothing constant, forwarded to `weighted_average`.

    Returns:
        A list holding one vector per input sentence.

    Side effects:
        Refits the global `pca` object, which `sentence_to_vec` reads afterwards.
    """
    # Forward the caller's settings: they used to be accepted but ignored, so
    # `weighted_average` always ran with its own defaults.
    sentence_set = [
        weighted_average(sentence, embedding_size=embedding_size, a=a)
        for sentence in sentences
    ]

    # First principal component of the averaged sentence vectors.
    pca.fit(np.array(sentence_set))
    u = pca.components_[0]
    # NOTE(review): `u` is 1-D, so this is an element-wise square and not the outer
    # product u·uᵀ that the "u x uT" wording suggests. Left unchanged on purpose:
    # `sentence_to_vec` applies the same operation and previously saved embeddings
    # were presumably produced with it — confirm before changing.
    u = np.multiply(u, np.transpose(u))

    # Zero-pad `u` up to `embedding_size` if it came out shorter.
    missing = embedding_size - len(u)
    if missing > 0:
        u = np.concatenate([u, np.zeros(missing)])

    # Resulting sentence vectors: vs - u * vs (element-wise).
    return [np.subtract(vs, np.multiply(u, vs)) for vs in sentence_set]

In [234]:
# sentences_vecs = np.asarray(sentences_to_vec(processed))
# np.save('/gdrive/MyDrive/minor_project_files/weighted_sentence_embeddings.npy', sentences_vecs)

In [235]:
sentences_vecs = np.load('/gdrive/MyDrive/minor_project_files/weighted_sentence_embeddings.npy', allow_pickle=True)
print(sentences_vecs[:1])

[[-9.80927748e-01 -2.79434575e-01 -5.66141313e-03 -1.61609953e-01
  -3.06725165e-01 -2.36819868e-01  3.75395380e-01  4.79206396e-01
   1.33139029e-01 -6.97573714e-01 -5.39033370e-01  2.65995558e-01
   4.17935940e-01 -2.03088195e-01  3.17525126e-03  1.81194688e-01
  -1.50001279e-01  7.83060354e-01  6.14964515e-01  1.05875483e-01
  -5.68448379e-02  6.09423031e-01  3.57881941e-01 -4.77975181e-01
  -9.36058938e-01  7.34957507e-01  4.21115536e-01  8.08080726e-02
  -6.37149807e-01  1.01189915e-01  1.36324577e-02  1.04379010e-01
  -5.95440226e-01  4.24879820e-02 -2.31063301e-02 -2.51275549e-01
  -2.36971548e-02 -6.24342715e-02 -3.17679840e-01 -8.77911832e-01
  -5.27627447e-01  4.40916404e-01  1.68490753e-01 -4.38635626e-02
  -2.33046105e-01 -8.95342083e-02  4.68331654e-01  3.28619166e-01
   5.35630283e-01  2.01373408e-01 -3.95593916e-01  1.88952985e-01
  -8.51291801e-04 -7.98133247e-02  1.80502431e-01 -5.99805941e-01
  -4.49169215e-02 -2.41230510e-01 -3.01091673e-01 -2.07823346e-01
  -8.00059

# Build Annoy Index for finding approximate nearest neighbours (and corresponding label)

In [236]:
!pip install annoy



In [237]:
import annoy
import pickle

class AnnoyIndex():
    """Wrapper around `annoy.AnnoyIndex` that maps neighbour ids back to labels.

    Labels are pickled into a sidecar file next to the saved index: the index
    path with its extension replaced by ".labels".
    """

    def __init__(self, dimension):
        """Create an empty index for vectors of length `dimension`."""
        self.dimension = dimension
        # 'angular' is the metric Annoy falls back to when none is given; naming it
        # explicitly avoids the deprecation warning about the implicit default.
        self.index = annoy.AnnoyIndex(self.dimension, 'angular')

    @staticmethod
    def _label_path(path):
        """Return the labels file belonging to index file `path`."""
        import os  # local import: `os` is not imported before this cell runs

        # splitext only strips the final extension, so dots in directory names
        # (or a leading "./") no longer truncate the path.
        return os.path.splitext(path)[0] + ".labels"

    def build(self, vectors, labels, number_of_trees=5):
        """Add `vectors` to the index and build it with `number_of_trees` trees.

        Item ids are the positions in `vectors`, so `labels[i]` is the label of
        `vectors[i]`. Vectors whose sum is NaN are not added.
        """
        self.vectors = vectors
        self.labels = labels

        for i, vec in enumerate(self.vectors):
            if not np.isnan(np.sum(vec)):
                self.index.add_item(i, vec)
        self.index.build(number_of_trees)

    def query(self, vector, k=10):
        """Return the labels of the `k` nearest neighbours of `vector`."""
        indices = self.index.get_nns_by_vector(list(vector), k)
        return [self.labels[i] for i in indices]

    def save(self, path):
        """Save the index to `path` and pickle the labels to the sidecar file."""
        label_path = self._label_path(path)
        print(label_path)
        with open(label_path, 'wb') as fp:
            pickle.dump(self.labels, fp)
        self.index.save(path)

    def load(self, path):
        """Replace the index with the one stored at `path` and load its labels."""
        label_path = self._label_path(path)
        self.index = annoy.AnnoyIndex(self.dimension, 'angular')
        # SECURITY: pickle.load can execute arbitrary code; only load label files
        # that were written by `save` from a trusted source.
        with open(label_path, "rb") as fp:
            self.labels = pickle.load(fp)
        self.index.load(path)

In [239]:
# Re-read the dataset as plain text, one stripped line per entry.
# Note: this rebinds `questions` from the DataFrame loaded earlier to a list of strings.
questions = []
with open(dataset, "r") as fp:
    questions = list(map(str.strip, fp))

In [240]:
# create annoy index from vectors
index = AnnoyIndex(dimension=len(sentences_vecs[0]))
index.build(sentences_vecs, questions)

In [241]:
index.save('/gdrive/MyDrive/minor_project_files/weighted_annoy_index.ann')

/gdrive/MyDrive/minor_project_files/weighted_annoy_index.labels


In [242]:
index.query(sentences_vecs[500])

['What are the civil law examples?',
 "Should I take up a PhD in applied math if the advisor i'm going to work with does not have any awards listed in his CV?",
 'What are the Indian deities?',
 'How is US education different from Indian education?',
 'What are the major aspects of the Victorian literature?',
 'What are the Class Definitions ?',
 'What are the sociological perspectives?',
 'What are the top 10 books for Christian apologetics?',
 'What is the penalty for espionage?',
 'What is the property tax rate in Granville, Ohio? How is it compared to the one of Wyoming?']

# Load Annoy Index and query sentences

In [243]:
#  load existing annoy index from file
loaded_index = AnnoyIndex(dimension=len(sentences_vecs[0]))
loaded_index.load('/gdrive/MyDrive/minor_project_files/weighted_annoy_index.ann')

In [244]:
loaded_index.query(sentences_vecs[500])

['What are the civil law examples?',
 "Should I take up a PhD in applied math if the advisor i'm going to work with does not have any awards listed in his CV?",
 'What are the Indian deities?',
 'How is US education different from Indian education?',
 'What are the major aspects of the Victorian literature?',
 'What are the Class Definitions ?',
 'What are the sociological perspectives?',
 'What are the top 10 books for Christian apologetics?',
 'What is the penalty for espionage?',
 'What is the property tax rate in Granville, Ohio? How is it compared to the one of Wyoming?']

In [277]:
def sentence_to_vec(sentence: List[str], embedding_size=100, a=1e-3):
    """Embed one tokenized sentence the way `sentences_to_vec` embeds the corpus.

    Args:
        sentence: list of tokens. Pass tokens, not a raw string: iterating a
            string would look up single characters.
        embedding_size: length of the word vectors.
        a: SIF smoothing constant.

    Returns:
        A 1-D numpy array of length `embedding_size`.

    Raises:
        ValueError: if `sentence` is empty or none of its tokens is in the
            word2vec vocabulary.
    """
    # Query text is arbitrary user input: skip tokens the model has never seen
    # instead of failing with KeyError on the first unknown word.
    known_words = [word for word in sentence if word in model.wv]
    if not known_words:
        raise ValueError("sentence contains no word known to the word2vec model")

    vs = np.zeros(embedding_size)
    for word in known_words:
        a_value = a / (a + get_word_frequency(word))  # smooth inverse frequency, SIF
        vs = np.add(vs, np.multiply(a_value, model.wv[word]))
    vs = np.divide(vs, len(known_words))  # weighted average over the known words

    # Reuse the principal component of the already fitted global `pca`. The earlier
    # `pca.transform([vs])` call was removed because its result was discarded.
    # NOTE(review): `pca` is only fitted inside `sentences_to_vec`; this fails on a
    # fresh kernel unless that function has been run first.
    u = pca.components_[0]
    # NOTE(review): element-wise square, not an outer product; kept identical to
    # `sentences_to_vec` so query vectors match the indexed vectors.
    u = np.multiply(u, np.transpose(u))

    # Zero-pad `u` up to `embedding_size` if it came out shorter.
    missing = embedding_size - len(u)
    if missing > 0:
        u = np.concatenate([u, np.zeros(missing)])

    # Resulting sentence vector: vs - u * vs (element-wise).
    return np.subtract(vs, np.multiply(u, vs))

In [290]:
def get_similar(input_question: str):
    """Return the labels of the indexed questions closest to `input_question`.

    Args:
        input_question: raw question text.

    Returns:
        The list produced by `index.query` for the question's embedding.
    """
    # Tokenize exactly like the corpus (`preprocess`: lower-casing, stoplist, digit
    # removal, stemming). Plain lower-cased tokenization produced punctuation and
    # unstemmed words that the corpus tokens never contain.
    tokens = preprocess(input_question)
    embedding = sentence_to_vec(tokens)
    return index.query(embedding)

In [296]:
#@title { run: "auto" }
query = "What are the best book" #@param {type:"string"}

print("Finding relevant items in the index...\n")
for similar in get_similar(query):
    print(similar)
print()
%time query_embedding = get_similar(query)

Finding relevant items in the index...

Could Julian Albert, played by Tom Felton, be Doctor Alchemy on Flash?
What are the best YA books?
What are the best life-changing books?
What are the best Bengali books?
What is your review of Shahrukh Khan (actor)?
What are the best Non-fiction books currently?
What are the best calculus books?
Is it possible to download movies from Netflix?
Is it possible to slip on a banana peel?
What are the best books in neuroscience?

CPU times: user 1.32 ms, sys: 5 ms, total: 6.31 ms
Wall time: 6.61 ms


# Running Flask on Colab

In [None]:
%%capture
!curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc | sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null
!echo "deb https://ngrok-agent.s3.amazonaws.com buster main" | sudo tee /etc/apt/sources.list.d/ngrok.list
!sudo apt update && sudo apt install ngrok
!pip install flask_ngrok flask-bootstrap
!pip install flask_restful flask_cors
!cat /gdrive/MyDrive/minor_project_files/ngrok_token | xargs ngrok authtoken

In [None]:
import sys
stdout = sys.stdout
stderr = sys.stderr

In [None]:
print("test")

test


In [None]:
from flask_ngrok import run_with_ngrok
from flask import Flask, render_template , request , jsonify
from flask_restful import Resource, Api
import os, logging, sys
from flask_cors import CORS, cross_origin

# sys.stdout = open("/gdrive/MyDrive/minor_project_files/test.txt", "w", buffering=1)
# sys.stderr = open("/gdrive/MyDrive/minor_project_files/test.txt", "a", buffering=1)

app = Flask(__name__)
cors = CORS(app, resources={r"/*": {"origins": "*"}})
# cors = CORS(app)
# app.config['CORS_HEADERS'] = 'Content-Type'
api = Api(app)

run_with_ngrok(app)


class Similarity(Resource):
    """REST resource: GET is a liveness check, POST currently answers with placeholder questions."""

    def get(self):
        """Tell the client that the server is up."""
        status = {"hello": "Server Online!"}
        return jsonify(status)

    def post(self):
        """Parse the JSON body, read its "question" entry and return a fixed placeholder list."""
        payload = request.get_json(force=True)
        qn = payload["question"]  # fails if the body has no "question" key
        # TODO: replace the placeholder with get_similar(qn) once the lookup is wired in.
        placeholder_questions = ["question 1", "question 2"]
        return placeholder_questions


api.add_resource(Similarity, "/")
app.run()

In [None]:
sys.stdout = stdout
sys.stderr = stderr