<a href="https://colab.research.google.com/github/Atomnp/realtime_text_similarity_backend/blob/main/tfidf_word2vec_sif.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Imports

In [480]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from scipy.sparse import coo_matrix, lil_matrix
import pandas as pd
import numpy as np
import itertools
from typing import List
import warnings
warnings.filterwarnings("ignore")

In [481]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [482]:
# mounting your google drive to colab
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


#Loading the data

**Make shortcut of [this](https://drive.google.com/drive/folders/1BGr0cWKiJwT_jNg9nRNAhWgy0mYPgw_K?usp=sharing) folder in your gdrive**

In [483]:
# load dataset
dataset = r'/gdrive/MyDrive/minor_project_files/filtered.txt'
questions = pd.read_fwf(dataset, header=None, delimiter = "\n", keep_default_na=False, na_values=['_'])

In [484]:
stoplist = nltk.corpus.stopwords.words('english') + list(string.punctuation) + list(["``", "''"])

def preprocess(text):
    # stemmer = nltk.porter.PorterStemmer()
    return [word.lower() for word in word_tokenize(str(text)) if word.lower() not in stoplist and not word.isdigit()]

In [485]:
processed = questions[0].apply(preprocess).to_list()

In [486]:
processed[:10]

[['space',
  'inserted',
  'first',
  'letter',
  'words',
  'text',
  'ms',
  'word',
  'eg',
  'q',
  'uora',
  'instead',
  'quora',
  'fix'],
 ["'s", 'like', 'work', 'care.com', 'first', 'job'],
 ['german', 'jews', 'treated', 'hitler', 'wwi'],
 ['sugar', 'bad', 'us'],
 ['deal', 'death', 'grandparent'],
 ["'s",
  'best',
  'available',
  'compact',
  'camera',
  'sony',
  'cyber-shot',
  'dsc-rx100',
  'anything',
  'better',
  'coming',
  'next',
  '2-3',
  'months'],
 ['best', 'way', 'build', 'email', 'list'],
 ['humanity', 'part', 'experiment', 'someone', "'s", 'terrarium'],
 ['unix', 'downloadable'],
 ['calculate', 'chemical', 'formula', 'ammonium', 'chlorate']]

# Word2Vec Model (Train, Save and Load)

In [487]:
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    '''Callback to print loss after each epoch.'''

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
      loss = model.get_latest_training_loss()
      if self.epoch == 0:
          print('Loss after epoch {}: {}'.format(self.epoch, loss))
      else:
          print('Loss after epoch {}: {}'.format(self.epoch, loss- self.loss_previous_step))
      self.epoch += 1
      self.loss_previous_step = loss

In [488]:
# uncomment if you want to retrain the word2vec model
# it_copy, sentences = itertools.tee(sentences)
model = Word2Vec(sentences=processed, size=100, window=5, min_count=1, workers=4, compute_loss=True, iter=5, callbacks=[callback()])
model.save("/gdrive/MyDrive/minor_project_files/word2vec_6_iter.model")

Loss after epoch 0: 1384321.375
Loss after epoch 1: 954061.625
Loss after epoch 2: 864531.25
Loss after epoch 3: 747768.75
Loss after epoch 4: 685184.5


In [489]:
# load already saved word2vec model
model = Word2Vec.load("/gdrive/MyDrive/minor_project_files/word2vec_6_iter.model")

In [490]:
model.most_similar('movie')

[('bollywood', 0.8338087797164917),
 ('scene', 0.8335925340652466),
 ('actor/actress', 0.8335633873939514),
 ('actor', 0.8323915004730225),
 ('movies', 0.8274307250976562),
 ('character', 0.8252178430557251),
 ('film', 0.8131483793258667),
 ('scenes', 0.81049644947052),
 ('dangal', 0.8089582920074463),
 ('moana', 0.8061360120773315)]

# Find sentence Embeddings for each sentence in the dataset

1.   Lookup their word vectors from word2vec/glove model
3.   Save the weighted average word vector as the sentence embedding



In [491]:
# todo: get a proper word frequency for a word in a document set
# or perhaps just a typical frequency for a word from Google's n-grams
def get_word_frequency(word_text):
    return 0.0001  # set to a low occurring frequency - probably not unrealistic for most words, improves vector values

In [492]:
def weighted_average(sentence: List[str], embedding_size=100, a: float = 1e-3):
    vs = np.zeros(embedding_size)  # add all word2vec values into one vector for the sentence
    for word in sentence:
      a_value = a / (a + get_word_frequency(word))  # smooth inverse frequency, SIF
      vs = np.add(vs, np.multiply(a_value, model.wv[word]))  # vs += sif * word_vector
    
    vs = np.divide(vs, 1 if not len(sentence) else len(sentence))  # weighted average
    return vs

In [493]:
pca = PCA()

def sentences_to_vec(sentences: List[List[str]], embedding_size=100, a=1e-3):
    global pca
    sentence_set = [weighted_average(sentence) for sentence in sentences]
    
    # calculate PCA of this sentence set
    pca.fit(np.array(sentence_set))
    u = pca.components_[0]  # the PCA vector
    u = np.multiply(u, np.transpose(u))  # u x uT

    # pad the vector?  (occurs if we have less sentences than embeddings_size)
    if len(u) < embedding_size:
        for i in range(embedding_size - len(u)):
            u = np.append(u, 0)  # add needed extension for multiplication below

    # resulting sentence vectors, vs = vs -u x uT x vs
    sentence_vecs = []
    for vs in sentence_set:
        sub = np.multiply(u, vs)
        sentence_vecs.append(np.subtract(vs, sub))
    
    return sentence_vecs

In [494]:
sentences_vecs = np.asarray(sentences_to_vec(processed))
np.save('/gdrive/MyDrive/minor_project_files/weighted_sentence_embeddings.npy', sentences_vecs)

In [495]:
sentences_to_vec(processed[:10])

[array([ 0.31594459,  0.40106817,  0.59025721, -0.43870918, -0.02076275,
        -0.17167177, -0.19591358, -0.07797367, -0.30058938,  0.14371658,
        -0.9541242 , -0.0528139 ,  0.5997642 ,  0.39000816,  0.59793771,
        -0.26782019, -0.16484385,  0.2281077 ,  0.20902857,  0.36153989,
         0.08792378, -0.09517225,  0.55455913, -0.63613776, -0.2599178 ,
         0.16059046, -0.12785832, -0.16673749,  0.58632598, -0.3339059 ,
         0.35777242,  0.76032453, -0.30836141,  0.69922111, -0.43529671,
        -0.23044712,  0.07017667, -0.12568322, -0.03233147, -0.20817648,
         0.13562453, -0.18154066, -0.02623359,  0.11542933,  0.25683025,
        -0.50192242, -0.27255214, -0.0762017 ,  0.44443971, -0.25678787,
         0.03267961,  0.52543996,  0.06749582, -0.15025536,  0.36131315,
        -0.28369705, -0.33288505,  0.46190689, -0.0567425 ,  0.03086267,
        -0.4536808 , -0.5421956 , -0.06390785, -0.18535432, -0.43087775,
         0.07913587,  0.12621043, -0.07726827,  0.6

In [496]:
sentences_vecs = np.load('/gdrive/MyDrive/minor_project_files/weighted_sentence_embeddings.npy', allow_pickle=True)
print(sentences_vecs[:1])

[[ 0.31042172  0.40169428  0.59744718 -0.43856834 -0.02071478 -0.17226439
  -0.19438551 -0.07729835 -0.30658148  0.14396599 -0.94728083 -0.05142065
   0.59916629  0.38714551  0.59576261 -0.26543345 -0.16577711  0.22819483
   0.20208678  0.35963947  0.0881106  -0.094598    0.55521333 -0.61088494
  -0.25851952  0.16010883 -0.12782484 -0.16869234  0.58953443 -0.33492707
   0.37037189  0.75698007 -0.30886377  0.70381236 -0.44366723 -0.23201376
   0.06865373 -0.12504767 -0.03232195 -0.20807538  0.13509005 -0.18024444
  -0.02654787  0.11524393  0.25672263 -0.49959309 -0.27221729 -0.07638517
   0.44965686 -0.26058598  0.03278556  0.51732425  0.06936882 -0.15206694
   0.36155243 -0.28369967 -0.33000972  0.46380316 -0.05679894  0.0305542
  -0.4465715  -0.54248235 -0.06226976 -0.18502148 -0.43053962  0.08309634
   0.12754802 -0.07720361  0.64237771 -0.47749611  0.48612772 -0.66506531
   0.71816226  0.11118747 -0.05823348  0.93530793 -0.66787954  0.04565644
   0.22648882 -0.12511906 -0.25323801  

#Build Annoy Index for finding approximate nearest neighbours (and corresponding label) 

In [497]:
!pip install annoy



In [498]:
import annoy
import pickle

class AnnoyIndex():
    def __init__(self, dimension):
        self.dimension = dimension
        self.index = annoy.AnnoyIndex(self.dimension)   
   
    def build(self, vectors, labels, number_of_trees=5):
        self.vectors = vectors
        self.labels = labels 

        for i, vec in enumerate(self.vectors):
          if not np.isnan(np.sum(vec)):
            self.index.add_item(i, vec)
        self.index.build(number_of_trees)
        
    def query(self, vector, k=10):
        indices = self.index.get_nns_by_vector(list(vector), k)                                           
        return [self.labels[i] for i in indices]
    
    def save(self, path):
        label_path=path.split(".")[0]+".labels"
        print(label_path)
        with open(label_path,'wb') as fp:
            pickle.dump(self.labels,fp)
        self.index.save(path)
    
    def load(self, path):
        label_path=path.split(".")[0]+".labels"
        self.index=annoy.AnnoyIndex(self.dimension)
        with open(label_path,"rb") as fp:
            self.labels=pickle.load(fp)
        self.index.load(path)

In [499]:
questions = []
with open(dataset, "r") as fp:
  questions=[line.strip() for line in fp.readlines()]

In [500]:
# create annoy index from vectors
index = AnnoyIndex(dimension=len(sentences_vecs[0]))
index.build(sentences_vecs, questions)

In [501]:
index.save('/gdrive/MyDrive/minor_project_files/weighted_annoy_index.ann')

/gdrive/MyDrive/minor_project_files/weighted_annoy_index.labels


In [502]:
index.query(sentences_vecs[500])

['What are the civil law examples?',
 'Where is the world headed over the next 5 years?',
 'What are some examples of civil cases?',
 'What are some things new employees should know going into their first day at Aetna?',
 'What are the differences between civil and criminal cases?',
 'What is the law of interaction? What are some examples of it?',
 'What are some examples of the Law of Detachment?',
 'Should I quit my job or not?',
 'Which is the best compiler for C & C++ programming?',
 'What are some examples of the Third Law of Thermodynamics?']

# Load Annoy Index and query sentences

In [503]:
#  load existing annoy index from file
loaded_index = AnnoyIndex(dimension=len(sentences_vecs[0]))
loaded_index.load('/gdrive/MyDrive/minor_project_files/weighted_annoy_index.ann')

In [504]:
loaded_index.query(sentences_vecs[500])

['What are the civil law examples?',
 'Where is the world headed over the next 5 years?',
 'What are some examples of civil cases?',
 'What are some things new employees should know going into their first day at Aetna?',
 'What are the differences between civil and criminal cases?',
 'What is the law of interaction? What are some examples of it?',
 'What are some examples of the Law of Detachment?',
 'Should I quit my job or not?',
 'Which is the best compiler for C & C++ programming?',
 'What are some examples of the Third Law of Thermodynamics?']

In [505]:
def sentence_to_vec(sentence: str, embedding_size=100, a=1e-3):
    vs = np.zeros(embedding_size)
    for word in sentence:
      a_value = a / (a + get_word_frequency(word))
      if word in model.wv:
          vs = np.add(vs, np.multiply(a_value, model.wv[word]))
    vs = np.divide(vs, 1 if not len(sentence) else len(sentence))

    # calculate PCA of this sentence
    # pca.transform([vs])
    u = pca.components_[0]  # the PCA vector
    u = np.multiply(u, np.transpose(u))  # u x uT

    # pad the vector?  (occurs if we have less sentences than embeddings_size)
    if len(u) < embedding_size:
        for i in range(embedding_size - len(u)):
            u = np.append(u, 0)  # add needed extension for multiplication below

    # resulting sentence vectors, vs = vs -u x uT x vs
    return np.subtract(vs, np.multiply(u, vs))

In [506]:
def get_similar(input_question: str):
    # get sentence embedding of the question
    to_transform = word_tokenize(input_question.lower())
    embedding = sentence_to_vec(to_transform)
    return index.query(embedding)

In [515]:
#@title { run: "auto" }
query = "should i buy the new book" #@param {type:"string"}

print("Finding relevant items in the index...\n")
for similar in get_similar(query):
    print(similar)
print()
%time query_embedding = get_similar(query)

Finding relevant items in the index...

Can I learn to make crystal meth using the IIN?
What is that one thing you will miss from living, once your life ends?
If I do not get into an IIM, what are my best options.  Which universities in India and abroad are best for me to do mba?
I am looking to buy new bike. Suzuki gixxer 155 or honda hornet 160r. Which one to buy?
How will demonetization affect India?
How can we solve racism?
Is it worth it to buy a MacBook Pro 2015 instead of the new MacBook 2016 model?
Is it better to buy a new or a used Porsche?
Would you prefer to receive assistance from a self-service machine or a real person?
What are some tips for buy a refurbished MacBook Pro?

CPU times: user 320 µs, sys: 1.81 ms, total: 2.13 ms
Wall time: 2.46 ms


#Running Flask on collab

In [None]:
%%capture
!curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc | sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null
!echo "deb https://ngrok-agent.s3.amazonaws.com buster main" | sudo tee /etc/apt/sources.list.d/ngrok.list
!sudo apt update && sudo apt install ngrok
!pip install flask_ngrok flask-bootstrap
!pip install flask_restful flask_cors
!cat /gdrive/MyDrive/minor_project_files/ngrok_token | xargs ngrok authtoken

In [None]:
import sys
stdout = sys.stdout
stderr = sys.stderr

In [None]:
print("test")

test


In [None]:
from flask_ngrok import run_with_ngrok
from flask import Flask, render_template , request , jsonify
from flask_restful import Resource, Api
import os, logging, sys
from flask_cors import CORS, cross_origin

# sys.stdout = open("/gdrive/MyDrive/minor_project_files/test.txt", "w", buffering=1)
# sys.stderr = open("/gdrive/MyDrive/minor_project_files/test.txt", "a", buffering=1)

app = Flask(__name__)
cors = CORS(app, resources={r"/*": {"origins": "*"}})
# cors = CORS(app)
# app.config['CORS_HEADERS'] = 'Content-Type'
api = Api(app)

run_with_ngrok(app)


class Similarity(Resource):
  # get endpoint to check server is up
    def get(self):
        return jsonify({"hello": "Server Online!"})

    def post(self):
        json_data = request.get_json(force=True)
        qn = json_data["question"]
        # similarity = get_similar(qn)
        # return list of questions
        x = ["question 1", "question 2"]
        return x


api.add_resource(Similarity, "/")
app.run()

In [None]:
sys.stdout = stdout
sys.stderr = stderr