<a href="https://colab.research.google.com/github/Atomnp/realtime_text_similarity_backend/blob/main/tfidf_word2vec_sif.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [None]:
import pandas as pd
from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix, lil_matrix
import numpy as np
import itertools
import warnings
warnings.filterwarnings("ignore")

In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
# Fetch the NLTK data packages this notebook relies on: the 'punkt' tokenizer
# models and the 'stopwords' corpus (read via nltk.corpus.stopwords below).
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Stop-word list handed to the TF-IDF vectorizer: a few tokenizer artefacts
# ("?", "n't", "'s") followed by NLTK's English stop words.
stop_words = ["?", "n't", "'s"] + nltk.corpus.stopwords.words('english')

In [None]:
# Mount Google Drive at /gdrive so the dataset, models and indexes under
# /gdrive/MyDrive/minor_project_files are reachable (Colab only).
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


# Loading the data

**Make a shortcut to [this](https://drive.google.com/drive/folders/1BGr0cWKiJwT_jNg9nRNAhWgy0mYPgw_K?usp=sharing) folder in your Google Drive so that it is available as `MyDrive/minor_project_files`.**

In [None]:
# Path of the question corpus on the mounted Drive (one question per line).
dataset = r'/gdrive/MyDrive/minor_project_files/filtered.txt'

# Read every line, trimming surrounding whitespace and lower-casing it.
# The encoding is given explicitly because the corpus contains non-ASCII
# characters and the platform default encoding is not guaranteed to be UTF-8.
with open(dataset, "r", encoding="utf-8") as fp:
    questions = [line.strip().lower() for line in fp]

In [None]:
# Tokenize every question once and keep the result in a list.
# A list (rather than a one-shot generator) can be iterated any number of
# times, so cells that consume `sentences` stay re-runnable; the later
# `itertools.tee(sentences)` calls accept a list just as well.
sentences = [word_tokenize(sentence) for sentence in questions]

# Word2Vec model (train, save and load)

In [None]:
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    '''Training callback that prints the loss after each epoch.

    The value returned by ``model.get_latest_training_loss()`` is treated as a
    running total: for the first epoch it is printed as-is, and for every
    later epoch the reading from the previous epoch is subtracted first.
    '''

    def __init__(self):
        # Number of epochs completed so far (also used as the printed index).
        self.epoch = 0
        # Loss reading at the end of the previous epoch. Created here so the
        # attribute always exists; it is only read from the second epoch on.
        self.loss_previous_step = 0.0

    def on_epoch_end(self, model):
        '''Print the loss attributed to the epoch that just finished.

        Args:
            model: the model being trained; must provide
                ``get_latest_training_loss()``.
        '''
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            epoch_loss = loss
        else:
            epoch_loss = loss - self.loss_previous_step
        print('Loss after epoch {}: {}'.format(self.epoch, epoch_loss))
        self.epoch += 1
        self.loss_previous_step = loss

In [None]:
# Train a Word2Vec model on the tokenized questions and save it to Drive.
# This cell retrains every time it is run; skip it and use the loading cell
# below to reuse a previously saved model.

# tee() yields an independent copy so `sentences` can still be consumed later.
it_copy, sentences = itertools.tee(sentences)

# TODO(author): revisit min_count and window (window=3 was being considered).
# NOTE(review): `size` and `iter` look like the gensim 3.x keyword names
# (gensim 4 uses `vector_size` / `epochs`) — confirm the installed version.
model = Word2Vec(sentences=list(it_copy), size=100, window=5, min_count=1, workers=4, compute_loss=True, iter=6, callbacks=[callback()])
model_wv = model.wv
model.save("/gdrive/MyDrive/minor_project_files/word2vec.model")
# Disabled alternative, kept for reference: train a FastText model instead.
# # model = FastText( window=3, min_count=1)  # instantiate
# # model.build_vocab(sentences=it_copy)
# # model.train(sentences=it_copy, total_examples=len(questions), epochs=10)  # train
# # model.save("/gdrive/MyDrive/minor_project_files/fasttext.model")

Loss after epoch 0: 1708216.5
Loss after epoch 1: 1408493.5
Loss after epoch 2: 1206733.0
Loss after epoch 3: 1066334.0
Loss after epoch 4: 1033058.5
Loss after epoch 5: 946525.5


In [None]:
# Load the Word2Vec model previously saved to Drive and expose its word
# vectors as `model_wv`, the lookup used by the sentence-embedding cells.
# Disabled alternative: load the FastText model instead.
# model = FastText.load("/gdrive/MyDrive/minor_project_files/fasttext.model")
model = Word2Vec.load("/gdrive/MyDrive/minor_project_files/word2vec.model")
model_wv = model.wv

In [None]:
# Sanity check of the trained vectors: nearest neighbours of "china".
# The query goes through `model.wv` because calling `most_similar` directly on
# the Word2Vec model is deprecated in gensim 3.x and removed in gensim 4.
model.wv.most_similar('china')

[('russia', 0.8251861333847046),
 ('japan', 0.8227856159210205),
 ('pakistan', 0.8111938238143921),
 ('israel', 0.7959840893745422),
 ('america', 0.7884897589683533),
 ('taiwan', 0.7727714776992798),
 ('iran', 0.7588338851928711),
 ('philippines', 0.7560622096061707),
 ('africa', 0.7530892491340637),
 ('turkey', 0.7486889362335205)]

**Note:** train the Word2Vec model on the entire dataset first; the similarity query above relies on it.

# Alternative word embedding method: GloVe

In [None]:
# GloVe embeddings: maps each word to its vector (float32 numpy array).
embeddings_index = {}

# Each line of the GloVe text file is "<word> <v1> <v2> ...". The file is
# opened as UTF-8 explicitly so that reading does not depend on the platform
# default encoding.
with open('/gdrive/MyDrive/minor_project_files/glove.6B.100d.txt', encoding='utf-8') as file:
    for line in file:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# NOTE: this replaces the Word2Vec vectors bound to `model_wv` earlier; every
# later cell that looks words up in `model_wv` will use GloVe once this cell
# has run. Skip this cell to keep using Word2Vec.
model_wv = embeddings_index

# TF-IDF for finding important words in a sentence

In [None]:
def identity_tokenizer(text):
    """Return ``text`` unchanged.

    Passed to ``TfidfVectorizer`` as its tokenizer so that input which is
    already a sequence of tokens is used as-is.
    """
    return text

# The input is already tokenized (and was lower-cased when the corpus was
# read), so the vectorizer uses the identity tokenizer and lowercase=False.
vect = TfidfVectorizer(stop_words=stop_words, use_idf=True, tokenizer=identity_tokenizer, lowercase=False)
# Work on a copy of the iterable so that the cell can be rerun
# (a consumed iterator would otherwise be left at its end).
it_copy, sentences = itertools.tee(sentences)
tfidf_matrix = vect.fit_transform(it_copy)
# Vocabulary terms indexed by TF-IDF column. `get_feature_names()` was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer the newer
# `get_feature_names_out()` when available; list() keeps `fv` a plain list.
if hasattr(vect, "get_feature_names_out"):
    fv = list(vect.get_feature_names_out())
else:
    fv = vect.get_feature_names()

In [None]:
# Convert the TF-IDF matrix to LIL format for the row-by-row loop that builds
# the sentence embeddings (author's timing note for this variant: 2m4s).

# Disabled alternative: COO format.
# cx = coo_matrix(tfidf_matrix)
cx = lil_matrix(tfidf_matrix)

# Find sentence embeddings for each sentence in the dataset


1.   Find most important words
2.   Lookup their word vectors from word2vec/glove model
3.   Save the average word vector as the sentence embedding



In [None]:
# Number of highest-TF-IDF in-vocabulary words averaged into one embedding.
TOP_K_WORDS = 5

# One entry per sentence: the mean word vector, or None when none of the
# sentence's words has a vector in `model_wv`.
sem = []
to = cx.get_shape()[0]
for i in range(to):
    rx = cx.getrow(i).tocoo()

    # (word, tf-idf) pairs of this sentence, most important word first.
    sorted_by_tfidf = sorted(((fv[j], v) for j, v in zip(rx.col, rx.data)), key=lambda x: x[1], reverse=True)
    if i in (1, 500, 1000):
        # Spot-check a few rows.
        print(sorted_by_tfidf)

    in_vocab = [word for word, _ in sorted_by_tfidf if word in model_wv]
    top_vectors = [model_wv[word] for word in in_vocab[:TOP_K_WORDS]]
    sem.append(np.mean(top_vectors, axis=0) if top_vectors else None)

# Sentences without any known word get an all-NaN vector of the right length
# instead of a scalar NaN. This keeps `sem` a regular 2-D float array (a ragged
# list becomes an object array or fails to convert on recent NumPy), while the
# NaN check used when building the index still skips these rows.
embedding_dim = next((len(vec) for vec in sem if vec is not None), 0)
missing = np.full(embedding_dim, np.nan, dtype=np.float32)
sem = np.asarray([missing if vec is None else vec for vec in sem])

# NOTE: the active line writes the GloVe file name; switch to the commented
# line when `model_wv` holds the Word2Vec vectors.
# np.save('/gdrive/MyDrive/minor_project_files/sentence_embeddings4.npy', sem)
np.save('/gdrive/MyDrive/minor_project_files/glove_sentence_embeddings4.npy', sem)


[('care.com', 0.7546053809320284), ('first', 0.35224776891547877), ('job', 0.34440532246877414), ('work', 0.3314647551924048), ('like', 0.2792996925936627)]
[('law', 0.6021351916751914), ('civil', 0.5989092139444282), ('examples', 0.5279592450168362)]
[('pokémon', 0.5499085913046504), ('yet', 0.5206739448942341), ('working', 0.43720610120089487), ('go', 0.38136197989797543), ('india', 0.2998550477791224)]


In [None]:
# Load previously saved sentence embeddings from Drive and show the first one.
# NOTE(review): this reads sentence_embeddings4.npy, while the cell above
# currently writes glove_sentence_embeddings4.npy — confirm which is intended.
# allow_pickle=True lets np.load unpickle object arrays; only use it on files
# you trust.
sem = np.load('/gdrive/MyDrive/minor_project_files/sentence_embeddings4.npy', allow_pickle=True)
print(sem[:1])

[array([ 0.3117105 ,  0.798547  , -0.3517407 , -0.57282555,  1.2834091 ,
       -0.4639574 ,  0.8048502 ,  0.2604505 ,  0.2879932 ,  0.66775215,
        1.1243334 ,  0.58522564, -0.22775023, -0.49574035, -0.28327748,
        0.08783732, -0.991526  , -0.13409904, -0.7097201 ,  0.31242192,
       -0.021375  , -0.21674934,  0.52010095,  0.09773171, -0.39059073,
       -0.39624795, -0.52537084,  0.38627592,  0.01615157,  0.11585979,
       -0.522874  ,  0.44573665, -0.24741313,  0.67889136,  0.6839901 ,
        0.4969235 ,  0.61014324,  0.23550949, -0.08888514,  0.87053615,
        0.31962043, -0.8116109 ,  1.0099776 , -0.62597007, -0.06193497,
       -0.2162857 ,  1.1613632 , -0.6052805 ,  0.192239  , -0.26050115,
        0.6963515 ,  0.22888465, -0.787995  , -1.3464333 ,  0.21761307,
       -0.11376771,  0.57417274,  0.03277395,  0.5498637 ,  0.04191905,
        0.69846714,  0.66302496, -0.21729198, -0.11927372, -0.31712413,
       -0.23745637, -0.34448057, -0.05416854, -0.9802536 ,  1.3

# Build an Annoy index for finding approximate nearest neighbours (and their corresponding labels)

In [None]:
!pip install annoy

Collecting annoy
  Downloading annoy-1.17.0.tar.gz (646 kB)
[K     |████████████████████████████████| 646 kB 8.6 MB/s 
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.0-cp37-cp37m-linux_x86_64.whl size=391683 sha256=24ae1028da02cdd7d8455f102d2cb906a8670278710babb4e29c0890d5125c79
  Stored in directory: /root/.cache/pip/wheels/4f/e8/1e/7cc9ebbfa87a3b9f8ba79408d4d31831d67eea918b679a4c07
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.0


In [None]:
import annoy
import pickle
class AnnoyIndex():
    """Wrapper around ``annoy.AnnoyIndex`` that returns labels instead of ids.

    Items are added under their position in the ``vectors`` sequence, and the
    label at the same position in ``labels`` is returned by ``query``. Labels
    are persisted next to the index file in a pickle with a ``.labels``
    extension.
    """

    def __init__(self, dimension, metric="angular"):
        """Create an empty index.

        Args:
            dimension: length of the vectors that will be indexed.
            metric: Annoy distance metric. "angular" is Annoy's historical
                default; it is passed explicitly because relying on the
                implicit default is deprecated.
        """
        self.dimension = dimension
        self.metric = metric
        self.index = annoy.AnnoyIndex(self.dimension, self.metric)

    @staticmethod
    def _label_path(path):
        """Return the labels file path that belongs to index file ``path``.

        Only the final extension is replaced, so dots in directory names
        (e.g. ``/data/v1.2/index.ann``) are left untouched.
        """
        import os
        return os.path.splitext(path)[0] + ".labels"

    def build(self, vectors, labels, number_of_trees=5):
        """Add all usable vectors to the index and build it.

        Args:
            vectors: sequence of vectors; position ``i`` is the item id.
            labels: sequence aligned with ``vectors``.
            number_of_trees: forwarded to ``annoy``'s ``build``.
        """
        self.vectors = vectors
        self.labels = labels

        for i, vec in enumerate(self.vectors):
            # Skip entries containing NaN (sentences without an embedding).
            if not np.isnan(np.sum(vec)):
                self.index.add_item(i, vec)
        self.index.build(number_of_trees)

    def query(self, vector, k=10):
        """Return the labels of the ``k`` approximate nearest neighbours."""
        indices = self.index.get_nns_by_vector(list(vector), k)
        return [self.labels[i] for i in indices]

    def save(self, path):
        """Write the index to ``path`` and the labels next to it."""
        label_path = self._label_path(path)
        print(label_path)
        with open(label_path, 'wb') as fp:
            pickle.dump(self.labels, fp)
        self.index.save(path)

    def load(self, path):
        """Replace the current index and labels with those stored at ``path``."""
        label_path = self._label_path(path)
        self.index = annoy.AnnoyIndex(self.dimension, self.metric)
        with open(label_path, "rb") as fp:
            # SECURITY: pickle can execute arbitrary code while loading; only
            # load label files that this notebook wrote itself.
            self.labels = pickle.load(fp)
        self.index.load(path)

In [None]:
# Re-read the questions exactly as stored in the file (original casing,
# trailing newline kept); these raw lines are used as the index labels.
# The encoding is explicit so reading does not depend on the platform default.
with open(dataset, "r", encoding="utf-8") as fp:
    questions = fp.readlines()

In [None]:
# Create the Annoy index from the sentence embeddings, using the raw question
# lines as labels; the vector dimension is taken from the first embedding.
index = AnnoyIndex(dimension=len(sem[0]))
index.build(sem, questions)

In [None]:
# Persist the index (and its labels file) to Drive. Swap to the commented
# line to save under the GloVe-specific file name instead.
index.save('/gdrive/MyDrive/minor_project_files/annoy_index.ann')
# index.save('/gdrive/MyDrive/minor_project_files/annoy_index_glove.ann')

/gdrive/MyDrive/minor_project_files/annoy_index_glove.labels


In [None]:
# Spot check: nearest neighbours of the embedding of sentence 500.
index.query(sem[500])
# Debugging aids (disabled):
# print(questions[1])
# print(sem[1])

['What are the civil law examples?\n',
 'What are civil laws and what are some examples?\n',
 'Why is it so difficult to find basic information about Indian civil law over the internet?\n',
 'What is civil disobedience in law?\n',
 'What is the difference between legal and law?\n',
 'What are examples of law of demand?\n',
 'What is an example of the Law of Conservation of Matter?\n',
 'What are some examples of integrity being shown in law enforcement?\n',
 'What is public disclosure law and how it is applied?\n',
 'Why is maritime law so important?\n']

In [None]:
# Load an existing Annoy index (and its labels file) from Drive into a new
# wrapper; the dimension must match the vectors the index was built with.
loaded_index = AnnoyIndex(dimension=len(sem[0]))
loaded_index.load('/gdrive/MyDrive/minor_project_files/annoy_index.ann')

In [None]:
# Spot check of the reloaded index with the embedding of sentence 499.
loaded_index.query(sem[499])

['Could an extremely advanced civilization, in the far future, deactivate and store red and brown dwarfs to delay heat death?\n',
 'Why do I feel extreme anger and crave revenge for any slight, no matter how insignificant?\n',
 'I have no gyno (tested) yet my nipples are puffy and they look like female breasts. Why?\n',
 'Why do some Orthodox Jewish circumcisions involve the mohel using his mouth to draw blood?\n',
 'Are most women attracted to men with overly masculine faces, overly feminine faces, or in-between? What influences their preference?\n',
 "How do I stop my son who has Asperger's from destroying his bedroom walls and hurting people when he's in a rage?\n",
 'Do you think the climate of arrogant rudeness afforded by Internet anonymity will ever spill over to the real world?\n',
 'What are the sexiest pornstars?\n',
 'How do spiders mate?\n',
 'Can I commit mental disorder disability fraud at age 30 if I have 240k saved up from my job?\n']

In [None]:
def get_similar(input_question:str):
    """Return the indexed questions most similar to ``input_question``.

    The question is tokenized, its words are ranked by TF-IDF weight using the
    fitted vectorizer ``vect``, and the vectors of the top five words that
    exist in ``model_wv`` are averaged into a sentence embedding, which is
    then looked up in the global Annoy ``index``.

    Args:
        input_question: free-text question.

    Returns:
        The labels returned by ``index.query``; an empty list when none of the
        question's words is both in the TF-IDF vocabulary and in ``model_wv``.
    """
    # The corpus was stripped and lower-cased before the vectorizer (which
    # runs with lowercase=False) was fitted, so normalise the query the same
    # way; otherwise capitalised words would never match the vocabulary.
    to_transform = word_tokenize(input_question.strip().lower())
    cx = coo_matrix(vect.transform([to_transform]))

    # (word, tf-idf) pairs of the query, most important word first.
    sorted_by_tfidf = sorted(((fv[j], v) for j, v in zip(cx.col, cx.data)), key=lambda x: x[1], reverse=True)

    # Keep only words that have a vector. The lookup below must use this
    # filtered list; indexing `model_wv` with an unknown word raises KeyError.
    in_vocab = [word for word, _ in sorted_by_tfidf if word in model_wv]
    if not in_vocab:
        # Nothing to embed, so there is nothing meaningful to query with.
        return []

    sentence_embedding = np.mean([model_wv[word] for word in in_vocab[:5]], axis=0)
    return index.query(sentence_embedding)

In [None]:
#@title { run: "auto" }
# Colab form cell: edit the query in the form field to look up similar
# questions in the index.
query = "Should I buy the new macbook?" #@param {type:"string"}

print("Finding relevant items in the index...")
print(get_similar(query))
# Optional timing of the lookup (disabled):
# %time query_embedding = get_similar(query)


Finding relevant items in the index...
['Should I buy the new MacBook?\n', 'What is the cheapest way to buy a MacBook Pro?\n', 'How can I buy the new Macbook 12" M7 512GB model in India?\n', 'Should I buy a Macbook or a Macbook Pro?\n', 'Is it ok to buy a MacBook Air from Amazon?\n', 'Should I buy the new iPhone 7?\n', 'Should I buy the new MacBook 2016 or one from 2015?\n', 'Is a MacBook really worth buying more than a PC?\n', 'What are the cons of buying a refurbished MacBook Air or MacBook Pro?\n', 'Would you buy an iPod nano or iPod touch? Which one is more worth it?\n']


# Running Flask on Colab

In [None]:
%%capture
# Install ngrok from its apt repository plus the Flask extensions used below,
# then register the ngrok auth token. The token is read from a file on Drive
# so that it never appears in the notebook. %%capture hides the cell output.
!curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc | sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null
!echo "deb https://ngrok-agent.s3.amazonaws.com buster main" | sudo tee /etc/apt/sources.list.d/ngrok.list
!sudo apt update && sudo apt install ngrok
!pip install flask_ngrok flask-bootstrap
!pip install flask_restful flask_cors
!cat /gdrive/MyDrive/minor_project_files/ngrok_token | xargs ngrok authtoken

In [None]:
# Keep references to the current stdout/stderr so they can be restored by the
# last cell if the (optional) redirection in the Flask cell is enabled.
import sys
stdout = sys.stdout
stderr = sys.stderr

In [None]:
# Quick check that printed output shows up in the notebook.
print("test")

test


In [None]:
from flask_ngrok import run_with_ngrok
from flask import Flask, render_template , request , jsonify
from flask_restful import Resource, Api
import os, logging, sys
from flask_cors import CORS, cross_origin

# Optional: redirect stdout/stderr to a log file on Drive (disabled).
# sys.stdout = open("/gdrive/MyDrive/minor_project_files/test.txt", "w", buffering=1)
# sys.stderr = open("/gdrive/MyDrive/minor_project_files/test.txt", "a", buffering=1)

# Flask application with a flask_restful API on top.
app = Flask(__name__)
# CORS is enabled for every route and every origin.
# NOTE(review): "*" is very permissive — confirm this is intended.
cors = CORS(app, resources={r"/*": {"origins": "*"}})
# cors = CORS(app)
# app.config['CORS_HEADERS'] = 'Content-Type'
api = Api(app)

# Wrap the app with flask_ngrok (the run log below shows a public ngrok URL).
run_with_ngrok(app)


class Similarity(Resource):
    """REST resource exposing the question-similarity endpoint."""

    def get(self):
        """Health check: report that the server is up."""
        return jsonify({"hello": "Server Online!"})

    def post(self):
        """Handle a similarity request.

        Expects a JSON object with a "question" field. Returns a list of
        questions; a request without that field is answered with HTTP 400
        instead of failing with an unhandled KeyError (HTTP 500).
        """
        json_data = request.get_json(force=True)
        if not isinstance(json_data, dict) or "question" not in json_data:
            return {"message": "Request body must be a JSON object with a 'question' field."}, 400

        # TODO: replace the placeholder with the real lookup, e.g.
        # get_similar(json_data["question"]).
        x = ["question 1", "question 2"]
        return x


# Serve the Similarity resource at the root URL and start the server.
# app.run() blocks this cell until the server is interrupted.
api.add_resource(Similarity, "/")
app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://a520-35-227-155-192.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [02/Feb/2022 15:20:48] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [02/Feb/2022 15:20:49] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


In [None]:
# Restore the stdout/stderr objects saved before the Flask cell.
sys.stdout = stdout
sys.stderr = stderr