<a href="https://colab.research.google.com/github/Atomnp/realtime_text_similarity_backend/blob/main/tfidf_word2vec_sif.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Imports

In [69]:
import pandas as pd
from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix, lil_matrix
import numpy as np
import itertools
import warnings
warnings.filterwarnings("ignore")

In [70]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [71]:
stop_words=["?","n't","'s"]
stop_words+=nltk.corpus.stopwords.words('english')

In [72]:
# mounting your google drive to colab
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


#Loading the data

**Make shortcut of [this](https://drive.google.com/drive/folders/1BGr0cWKiJwT_jNg9nRNAhWgy0mYPgw_K?usp=sharing) folder in your gdrive**

In [73]:
# load dataset
dataset = r'/gdrive/MyDrive/minor_project_files/filtered.txt'

# df= pd.read_csv(dataset, keep_default_na=False, na_values=['_'])
questions = []
with open(dataset,"r") as fp:
  questions=[x.strip().lower() for x in fp.readlines()]

In [74]:
# tokenize sentences
sentences = (word_tokenize(sentence) for sentence in questions)

#Word2Vec Model (Train, Save and Load)

In [75]:
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    '''Callback to print loss after each epoch.'''

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
      loss = model.get_latest_training_loss()
      if self.epoch == 0:
          print('Loss after epoch {}: {}'.format(self.epoch, loss))
      else:
          print('Loss after epoch {}: {}'.format(self.epoch, loss- self.loss_previous_step))
      self.epoch += 1
      self.loss_previous_step = loss

In [76]:
# uncomment if you want to retrain the word2vec model

it_copy, sentences = itertools.tee(sentences)

# min_count, window=3?? !!
model = Word2Vec(sentences=list(it_copy), size=100, window=5, min_count=1, workers=4, compute_loss=True, iter=6, callbacks=[callback()])
model_wv = model.wv
model.save("/gdrive/MyDrive/minor_project_files/word2vec.model")
# # model = FastText( window=3, min_count=1)  # instantiate
# # model.build_vocab(sentences=it_copy)
# # model.train(sentences=it_copy, total_examples=len(questions), epochs=10)  # train
# # model.save("/gdrive/MyDrive/minor_project_files/fasttext.model")

Loss after epoch 0: 1719031.25
Loss after epoch 1: 1352354.5
Loss after epoch 2: 1235897.25
Loss after epoch 3: 1051952.5
Loss after epoch 4: 997084.0
Loss after epoch 5: 1021777.0


In [77]:
# load already saved word2vec model
# model = FastText.load("/gdrive/MyDrive/minor_project_files/fasttext.model")
model = Word2Vec.load("/gdrive/MyDrive/minor_project_files/word2vec.model")
model_wv = model.wv

In [78]:
model.most_similar('china')

[('pakistan', 0.8222455978393555),
 ('russia', 0.8220350742340088),
 ('israel', 0.8166234493255615),
 ('taiwan', 0.8009499311447144),
 ('iran', 0.7991870641708374),
 ('japan', 0.7957135438919067),
 ('america', 0.7903860211372375),
 ('africa', 0.7748715281486511),
 ('bangladesh', 0.7601543664932251),
 ('nepal', 0.7534977793693542)]

^^ First train the model on the entire dataset

#TFIDF for finding important words in a sentence

In [79]:
def identity_tokenizer(text):
      return text

# lowercase !!
vect = TfidfVectorizer(stop_words=stop_words, use_idf=True, tokenizer=identity_tokenizer,lowercase=False)    
# copy the iterator so that the cell can be rerun (otherwise the iterator will be at the end)
it_copy, sentences = itertools.tee(sentences)
tfidf_matrix = vect.fit_transform(it_copy)
fv = vect.get_feature_names()

In [80]:
# implementation to find sentence embeddings (alternative 1) : lil_matrix! 2m4s!!

# cx = coo_matrix(tfidf_matrix)
cx = lil_matrix(tfidf_matrix)

#Find sentence Embeddings for each sentence in the dataset


1.   Find most important words
2.   Lookup their word vectors from word2vec/glove model
3.   Save the average word vector as the sentence embedding



In [81]:

# todo: get a proper word frequency for a word in a document set
# or perhaps just a typical frequency for a word from Google's n-grams
def get_word_frequency(word_text):
    return 0.0001  # set to a low occurring frequency - probably not unrealistic for most words, improves vector values


In [82]:
def weighted(arrlist, a: float = 1e-3):
    # print(arrlist.shape)
    vs = np.zeros(
            arrlist.shape[1]
        )  # add all word2vec values into one vector for the sentence
    for word in arrlist:
      a_value = a / (a + get_word_frequency(word))  # smooth inverse frequency, SIF
      vs = np.add(vs, np.multiply(a_value, word))  # vs += sif * word_vector
    vs = np.divide(vs, arrlist.shape[0])  # weighted average
    return vs

In [83]:
sem = []
to = cx.get_shape()[0]
# to = 2
for i in range(to):
  rx = cx.getrow(i).tocoo()

  sorted_by_tfidf = sorted([(fv[j],v) for k,j,v in zip(rx.row, rx.col, rx.data)], key=lambda x: x[1], reverse=True)
  # if i in [1,500,1000]:
    # print(sorted_by_tfidf)

  sorted_by_tfidf = list(filter(lambda x: x[0] in model_wv, sorted_by_tfidf))
  arrlist = np.array( list(map(lambda x: model_wv[x[0]], sorted_by_tfidf[:5])  ))
  if arrlist.shape[0]==0:
    continue

  # sem.append(np.mean(arrlist, axis=0))
  sem.append(weighted(arrlist))

sem = np.asarray(sem)
# sem.shape,sem
# np.save('/gdrive/MyDrive/minor_project_files/sentence_embeddings4.npy', sem)
# np.save('/gdrive/MyDrive/minor_project_files/glove_sentence_embeddings4.npy', sem)
# sem

In [84]:
from sklearn.decomposition import PCA
# calculate PCA of this sentence set
pca = PCA()
pca.fit(np.array(sem))
u = pca.components_[0]  # the PCA vector
u = np.multiply(u, np.transpose(u))  # u x uT

# pad the vector?  (occurs if we have less sentences than embeddings_size)
embedding_size=sem[0].shape[0]
if len(u) < embedding_size:
    for i in range(embedding_size - len(u)):
        u = np.append(u, 0)  # add needed extension for multiplication below

# resulting sentence vectors, vs = vs -u x uT x vs
sentence_vecs = []
for vs in sem:
    sub = np.multiply(u, vs)
    sentence_vecs.append(np.subtract(vs, sub))

np.save('/gdrive/MyDrive/minor_project_files/arora_sentence_embeddings.npy', sentence_vecs)


In [85]:
def tf(corpus):
    dic={}
    for document in corpus:
        for word in document:
            if word in dic:
                dic[word] = dic[word] + 1
            else:
                dic[word]=1
    for word,freq in dic.items():
        print(word,freq)
        dic[word]=freq/sum(map(len, (document for document in corpus)))
    return dic

In [86]:
sem = np.load('/gdrive/MyDrive/minor_project_files/arora_sentence_embeddings.npy', allow_pickle=True)
print(sem[:1])

[[-0.88608253  0.78968546  0.03952108 -0.78146953  0.60086666  0.17499598
   0.58080738 -0.06199354 -0.31637966  0.22862579  0.76320275 -0.449672
  -0.45812576  0.54439217 -0.63675463  0.00681493  0.20754722 -0.54744711
   0.31750661  0.23291712  0.48562898  0.32424532  0.29709069 -0.46910946
   0.57720112  0.13064435 -0.55985368  0.25449161 -0.0810781   0.22339983
  -0.33132478  1.2746254   0.17979356  0.71257594 -0.06345678  1.1007385
   0.93899507 -0.04438215 -0.01715147 -0.04527296 -1.43924853  0.67309493
  -0.15809807  0.23628887 -0.536597    0.16913407  0.1360056  -0.62645493
   0.75002241  0.54410961 -0.24294168  0.51600161 -0.13280483 -0.43569982
  -0.12644896  0.13436333 -0.5668966   0.21124034 -0.71325462 -0.0439922
   0.73073836  0.73185005 -0.39510581 -0.43979219 -0.86810379  0.48561704
   0.21382737  0.30997053  0.6713865   0.24731501 -0.25045833 -0.35981103
  -0.44597959  0.41055525  0.67702342 -0.1420384  -0.00857112 -0.12346481
   0.52801272  0.66797205  0.27854845  0.2

#Build Annoy Index for finding approximate nearest neighbours (and corresponding label) 

In [87]:
!pip install annoy



In [88]:
import annoy
import pickle
class AnnoyIndex():
    def __init__(self,dimension):
        self.dimension = dimension
        self.index = annoy.AnnoyIndex(self.dimension)   
   
    def build(self,vectors, labels, number_of_trees=5):
        self.vectors = vectors
        self.labels = labels 

        for i, vec in enumerate(self.vectors):
          if not np.isnan(np.sum(vec)):
            self.index.add_item(i, vec)
        self.index.build(number_of_trees)
        
    def query(self, vector, k=10):
        indices = self.index.get_nns_by_vector(
              list(vector), 
              k)                                           
        return [self.labels[i] for i in indices]
    def save(self,path):
        label_path=path.split(".")[0]+".labels"
        print(label_path)
        with open(label_path,'wb') as fp:
            pickle.dump(self.labels,fp)
        self.index.save(path)
    
    def load(self,path):
        label_path=path.split(".")[0]+".labels"
        self.index=annoy.AnnoyIndex(self.dimension)
        with open(label_path,"rb") as fp:
            self.labels=pickle.load(fp)
        self.index.load(path)

In [89]:
questions = []
with open(dataset,"r") as fp:
  questions=fp.readlines()

In [90]:
# create annoy index from vectors
index = AnnoyIndex(dimension=len(sem[0]))
index.build(sem, questions)

In [91]:
# index.save('/gdrive/MyDrive/minor_project_files/annoy_index.ann')
# index.save('/gdrive/MyDrive/minor_project_files/annoy_index_glove.ann')
index.save('/gdrive/MyDrive/minor_project_files/arora_sentence_embeddings.ann')

/gdrive/MyDrive/minor_project_files/arora_sentence_embeddings.labels


In [92]:
index.query(sem[500])
# print(questions[1])
# print(sem[1])

['What are the civil law examples?\n',
 'How can I increase my pay as a software engineer from mid 100K to over 200K a year, without working at a top tech company (Google, Facebook, and etc)?\n',
 'On which site can I buy the cheapest t-shirt?\n',
 'Which is the best smart phone between 8k to 13k?\n',
 'What is the hardest part of software deployment?\n',
 'Which graphic card is better NVidia Geforce GTX 950m (2 GB) or AMD Radeon m375 (4 gb)? Does the extra memory make that much difference in perfomance?\n',
 'What does the emoticon <3 mean?\n',
 'Have you read The Hunger Games?\n',
 'What are the most dangerous US cities? Why are they so dangerous?\n',
 'What are the effects of stress on the brain?\n']

In [93]:
#  load existing annoy index from file
loaded_index = AnnoyIndex(dimension=len(sem[0]))
loaded_index.load('/gdrive/MyDrive/minor_project_files/arora_sentence_embeddings.ann')

In [94]:
loaded_index.query(sem[499])

['Which book is best for electronic devices?\n',
 "Russia supports China-Pakistan Economic Corridor (CPEC). How is this going to impact India's relations with Russia?\n",
 'Why do police in the US use the Tonfa or ASP baton instead of nunchucks?\n',
 'Is Dunkirk movie based on a true story?\n',
 'What is the Lewis structure for KrF2? How is it determined?\n',
 'How good is a salary offer of 14k AED per month in Dubai compared to another offer of 1.5 lacs INR per month in Mumbai?\n',
 'Where is my Android app store revenue stored?\n',
 'What car engine oils can cause burning with white smoke when they are leaking?\n',
 'How can I retrieve a deleted message from my inbox on OkCupid?\n',
 'What are the best ways to get part time jobs in India?\n']

In [95]:
def get_similar(input_question:str):
    # get sentence embedding of the question
    to_transform = word_tokenize(input_question)
    matrix = vect.transform([to_transform])
    # print(matrix)
    cx = coo_matrix(matrix)

    sorted_by_tfidf = sorted([(fv[j],v) for i,j,v in zip(cx.row, cx.col, cx.data)], key=lambda x: x[1], reverse=True)
    
    # print(sorted_by_tfidf)
    filtered = list(filter(lambda x: x[0] in model_wv, sorted_by_tfidf))

    arrlist = np.array( list(map(lambda x: model_wv[x[0]], sorted_by_tfidf[:5])  ))

    sentence_embedding = np.mean(arrlist, axis=0)
    # print(sentence_embedding)
    return index.query(sentence_embedding)

In [107]:
#@title { run: "auto" }
query = "What are the best anime" #@param {type:"string"}

print("Finding relevant items in the index...")
print(get_similar(query))
# %time query_embedding = get_similar(query)


Finding relevant items in the index...
["Instagram reset my password but I don't know my email password to receive my new password. What do I do?\n", 'What are some good neutral colors for clothing?\n', 'How can a person change a bad habit?\n', 'What kind of music do you prefer?\n', 'What should be preference order when you opt for service in a Civil Service examination?\n', 'What foods should I try in India?\n', 'Can an arranged marriage turn into a love marriage?\n', 'Are we truly free?\n', 'How do you search on Quora?\n', 'Why do people use Quora when they could easily find the answer in a quick Google search?\n']


#Running Flask on collab

In [97]:
%%capture
!curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc | sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null
!echo "deb https://ngrok-agent.s3.amazonaws.com buster main" | sudo tee /etc/apt/sources.list.d/ngrok.list
!sudo apt update && sudo apt install ngrok
!pip install flask_ngrok flask-bootstrap
!pip install flask_restful flask_cors
!cat /gdrive/MyDrive/minor_project_files/ngrok_token | xargs ngrok authtoken

In [98]:
import sys
stdout = sys.stdout
stderr = sys.stderr

In [99]:
print("test")

test


In [None]:
from flask_ngrok import run_with_ngrok
from flask import Flask, render_template , request , jsonify
from flask_restful import Resource, Api
import os, logging, sys
from flask_cors import CORS, cross_origin

# sys.stdout = open("/gdrive/MyDrive/minor_project_files/test.txt", "w", buffering=1)
# sys.stderr = open("/gdrive/MyDrive/minor_project_files/test.txt", "a", buffering=1)

app = Flask(__name__)
cors = CORS(app, resources={r"/*": {"origins": "*"}})
# cors = CORS(app)
# app.config['CORS_HEADERS'] = 'Content-Type'
api = Api(app)

run_with_ngrok(app)


class Similarity(Resource):
  # get endpoint to check server is up
    def get(self):
        return jsonify({"hello": "Server Online!"})

    def post(self):
        json_data = request.get_json(force=True)
        qn = json_data["question"]
        # similarity = get_similar(qn)
        # return list of questions
        x = ["question 1", "question 2"]
        return x


api.add_resource(Similarity, "/")
app.run()

In [101]:
sys.stdout = stdout
sys.stderr = stderr