<a href="https://colab.research.google.com/github/Atomnp/realtime_text_similarity_backend/blob/main/tfidf_word2vec_sif.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Imports

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from scipy.sparse import coo_matrix, lil_matrix
import pandas as pd
import numpy as np
import itertools
from typing import List
import warnings
warnings.filterwarnings("ignore")

In [2]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import string
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# mounting your google drive to colab
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


#Loading the data

**Make shortcut of [this](https://drive.google.com/drive/folders/1BGr0cWKiJwT_jNg9nRNAhWgy0mYPgw_K?usp=sharing) folder in your gdrive**

In [4]:
# load dataset
dataset = r'/gdrive/MyDrive/minor_project_files/filtered.txt'
questions = pd.read_fwf(dataset, header=None, delimiter = "\n", keep_default_na=False, na_values=['_'])

In [5]:
stoplist = nltk.corpus.stopwords.words('english') + list(string.punctuation) + list(["``", "''"])

def preprocess(text):
    # stemmer = nltk.porter.PorterStemmer()
    return [word.lower() for word in word_tokenize(str(text)) if word.lower() not in stoplist and not word.isdigit()]

In [6]:
processed = questions[0].apply(preprocess).to_list()

In [7]:
processed[:10]

[['space',
  'inserted',
  'first',
  'letter',
  'words',
  'text',
  'ms',
  'word',
  'eg',
  'q',
  'uora',
  'instead',
  'quora',
  'fix'],
 ["'s", 'like', 'work', 'care.com', 'first', 'job'],
 ['german', 'jews', 'treated', 'hitler', 'wwi'],
 ['sugar', 'bad', 'us'],
 ['deal', 'death', 'grandparent'],
 ["'s",
  'best',
  'available',
  'compact',
  'camera',
  'sony',
  'cyber-shot',
  'dsc-rx100',
  'anything',
  'better',
  'coming',
  'next',
  '2-3',
  'months'],
 ['best', 'way', 'build', 'email', 'list'],
 ['humanity', 'part', 'experiment', 'someone', "'s", 'terrarium'],
 ['unix', 'downloadable'],
 ['calculate', 'chemical', 'formula', 'ammonium', 'chlorate']]

# Word2Vec Model (Train, Save and Load)

In [8]:
from gensim.models.callbacks import CallbackAny2Vec

class callback(CallbackAny2Vec):
    '''Callback to print loss after each epoch.'''

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
      loss = model.get_latest_training_loss()
      if self.epoch == 0:
          print('Loss after epoch {}: {}'.format(self.epoch, loss))
      else:
          print('Loss after epoch {}: {}'.format(self.epoch, loss- self.loss_previous_step))
      self.epoch += 1
      self.loss_previous_step = loss

In [9]:
# uncomment if you want to retrain the word2vec model
# it_copy, sentences = itertools.tee(sentences)
model = Word2Vec(sentences=processed, size=100, window=5, min_count=1, workers=4, compute_loss=True, iter=5, callbacks=[callback()])
model.save("/gdrive/MyDrive/minor_project_files/word2vec_6_iter.model")

Loss after epoch 0: 1376979.625
Loss after epoch 1: 948915.625
Loss after epoch 2: 806411.25
Loss after epoch 3: 789229.25
Loss after epoch 4: 672099.75


In [10]:
# load already saved word2vec model
model = Word2Vec.load("/gdrive/MyDrive/minor_project_files/word2vec_6_iter.model")

In [11]:
model.most_similar('movie')

[('actor/actress', 0.8493589162826538),
 ('bollywood', 0.8430700302124023),
 ('film', 0.8417855501174927),
 ('actor', 0.8351795673370361),
 ('scenes', 0.8346238136291504),
 ('movies', 0.834228515625),
 ('scene', 0.8281405568122864),
 ('actress', 0.8274914622306824),
 ('films', 0.8154959678649902),
 ('hollywood', 0.8133722543716431)]

# Find sentence Embeddings for each sentence in the dataset

1.   Lookup their word vectors from word2vec/glove model
3.   Save the weighted average word vector as the sentence embedding



In [12]:
from collections import Counter
wf = Counter([word for sentence in processed for word in sentence])

In [13]:
unique_words = sum(wf.values())
print(unique_words, wf["movie"])

1698817 2073


In [14]:
# todo: get a proper word frequency for a word in a document set
# or perhaps just a typical frequency for a word from Google's n-grams
def get_word_frequency(word_text):
    return wf[word_text]/unique_words  

In [15]:
def weighted_average(sentence: List[str], embedding_size=100, a: float = 1e-3):
    vs = np.zeros(embedding_size)  # add all word2vec values into one vector for the sentence
    for word in sentence:
      a_value = a / (a + get_word_frequency(word))  # smooth inverse frequency, SIF
      vs = np.add(vs, np.multiply(a_value, model.wv[word]))  # vs += sif * word_vector
    
    vs = np.divide(vs, 1 if not len(sentence) else len(sentence))  # weighted average
    return vs

In [16]:
pca = PCA()

def sentences_to_vec(sentences: List[List[str]], embedding_size=100, a=1e-3):
    global pca
    sentence_set = [weighted_average(sentence) for sentence in sentences]
    
    # calculate PCA of this sentence set
    pca.fit(np.array(sentence_set))
    u = pca.components_[0]  # the PCA vector
    u = np.multiply(u, np.transpose(u))  # u x uT

    # pad the vector?  (occurs if we have less sentences than embeddings_size)
    if len(u) < embedding_size:
        for i in range(embedding_size - len(u)):
            u = np.append(u, 0)  # add needed extension for multiplication below

    # resulting sentence vectors, vs = vs -u x uT x vs
    sentence_vecs = []
    for vs in sentence_set:
        sub = np.multiply(u, vs)
        sentence_vecs.append(np.subtract(vs, sub))
    
    return sentence_vecs

In [17]:
sentences_vecs = np.asarray(sentences_to_vec(processed))
np.save('/gdrive/MyDrive/minor_project_files/weighted_sentence_embeddings.npy', sentences_vecs)

In [None]:
sentences_to_vec(processed[:10])

In [19]:
sentences_vecs = np.load('/gdrive/MyDrive/minor_project_files/weighted_sentence_embeddings.npy', allow_pickle=True)
print(sentences_vecs[:1])

[[-0.04123472 -0.62800549  0.28561507 -0.33221452  0.02482206 -0.35611167
  -0.46764914  0.20503985  0.0881743   0.14388739 -0.24019077 -0.27652836
  -0.05590096  0.46147946 -0.1993378   0.18196125  0.02341565 -0.27972645
  -0.23335134  0.34333249 -0.16804745  0.58046789  0.32486065 -0.21757168
   0.09604735 -0.37448978  0.23501296 -0.04423243  0.1940462  -0.31905368
  -0.28384971  0.11100448 -0.08584574 -0.28831921  0.02981178  0.51389128
   0.49132991 -0.33462451 -0.18148645  0.28262717  0.04049196  0.37171954
  -0.05181293  0.40783267 -0.17894445 -0.3326019  -0.18167464 -0.01213471
   0.05711866 -0.1105276  -0.24635505 -0.08920633  0.20526904  0.34831443
   0.38948737  0.130034   -0.03987007  0.01378547  0.18242859  0.10977452
  -0.14724109 -0.05026363  0.09099263  0.11086938  0.23176335  0.29164669
   0.1466073  -0.0049128  -0.00975062 -0.53459251  0.17689424  0.27955076
   0.19985334  0.477654    0.12803252  0.01984706  0.72792506  0.06607801
  -0.14808793 -0.17412628 -0.17884588 

#Build Annoy Index for finding approximate nearest neighbours (and corresponding label) 

In [20]:
!pip install annoy

Collecting annoy
  Downloading annoy-1.17.0.tar.gz (646 kB)
[K     |████████████████████████████████| 646 kB 4.2 MB/s 
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.0-cp37-cp37m-linux_x86_64.whl size=391666 sha256=59228d97e26895bda30b12debd6dc2435c32a767dbc28236d563d500d2cf8d87
  Stored in directory: /root/.cache/pip/wheels/4f/e8/1e/7cc9ebbfa87a3b9f8ba79408d4d31831d67eea918b679a4c07
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.0


In [21]:
import annoy
import pickle

class AnnoyIndex():
    def __init__(self, dimension):
        self.dimension = dimension
        self.index = annoy.AnnoyIndex(self.dimension)   
   
    def build(self, vectors, labels, number_of_trees=5):
        self.vectors = vectors
        self.labels = labels 

        for i, vec in enumerate(self.vectors):
          if not np.isnan(np.sum(vec)):
            self.index.add_item(i, vec)
        self.index.build(number_of_trees)
        
    def query(self, vector, k=10):
        indices = self.index.get_nns_by_vector(list(vector), k)                                           
        return [self.labels[i] for i in indices]
    
    def save(self, path):
        label_path=path.split(".")[0]+".labels"
        print(label_path)
        with open(label_path,'wb') as fp:
            pickle.dump(self.labels,fp)
        self.index.save(path)
    
    def load(self, path):
        label_path=path.split(".")[0]+".labels"
        self.index=annoy.AnnoyIndex(self.dimension)
        with open(label_path,"rb") as fp:
            self.labels=pickle.load(fp)
        self.index.load(path)

In [22]:
questions = []
with open(dataset, "r") as fp:
  questions=[line.strip() for line in fp.readlines()]

In [23]:
# create annoy index from vectors
index = AnnoyIndex(dimension=len(sentences_vecs[0]))
index.build(sentences_vecs, questions)

In [24]:
index.save('/gdrive/MyDrive/minor_project_files/weighted_annoy_index.ann')

/gdrive/MyDrive/minor_project_files/weighted_annoy_index.labels


In [25]:
index.query(sentences_vecs[500])

['What are the civil law examples?',
 'What is a outstanding certification? What purpose does it serve?',
 'What is the difference between a criminal law and a civil law? Is murder a criminal offence or a civil offence?',
 'What are some examples of the Third Law of Thermodynamics?',
 'What is an example of the law of interaction?',
 'What would Stannis have done with Sansa if he had won the Battle of Blackwater?',
 'What is the parallelogram law of forces? What are some examples in how it is used?',
 'What are the best books on employment and labor law?',
 'Why do babies cry soon after they are delivered? What is the scientific reason behind it?',
 'Do babies also feel the pain of child birth?']

# Load Annoy Index and query sentences

In [26]:
#  load existing annoy index from file
loaded_index = AnnoyIndex(dimension=len(sentences_vecs[0]))
loaded_index.load('/gdrive/MyDrive/minor_project_files/weighted_annoy_index.ann')

In [27]:
loaded_index.query(sentences_vecs[500])

['What are the civil law examples?',
 'What is a outstanding certification? What purpose does it serve?',
 'What is the difference between a criminal law and a civil law? Is murder a criminal offence or a civil offence?',
 'What are some examples of the Third Law of Thermodynamics?',
 'What is an example of the law of interaction?',
 'What would Stannis have done with Sansa if he had won the Battle of Blackwater?',
 'What is the parallelogram law of forces? What are some examples in how it is used?',
 'What are the best books on employment and labor law?',
 'Why do babies cry soon after they are delivered? What is the scientific reason behind it?',
 'Do babies also feel the pain of child birth?']

In [28]:
def sentence_to_vec(sentence: str, embedding_size=100, a=1e-3):
    vs = np.zeros(embedding_size)
    for word in sentence:
      a_value = a / (a + get_word_frequency(word))
      if word in model.wv:
          vs = np.add(vs, np.multiply(a_value, model.wv[word]))
    vs = np.divide(vs, 1 if not len(sentence) else len(sentence))

    # calculate PCA of this sentence
    # pca.transform([vs])
    u = pca.components_[0]  # the PCA vector
    u = np.multiply(u, np.transpose(u))  # u x uT

    # pad the vector?  (occurs if we have less sentences than embeddings_size)
    if len(u) < embedding_size:
        for i in range(embedding_size - len(u)):
            u = np.append(u, 0)  # add needed extension for multiplication below

    # resulting sentence vectors, vs = vs -u x uT x vs
    return np.subtract(vs, np.multiply(u, vs))

In [29]:
def get_similar(input_question: str):
    # get sentence embedding of the question
    to_transform = word_tokenize(input_question.lower())
    embedding = sentence_to_vec(to_transform)
    return index.query(embedding)

In [32]:
#@title { run: "auto" }
query = "should i buy the new macbook" #@param {type:"string"}

print("Finding relevant items in the index...\n")
for similar in get_similar(query):
    print(similar)
print()
%time query_embedding = get_similar(query)

Finding relevant items in the index...

Is it better to buy a new or a used Porsche?
How can we solve racism?
How do I root the XOLO Q1010i?
Should I buy Battlefield 4 at Best Buy?
Why do people put ridiculous questions on Quora when they can just Google them? Huh, Huh, Huh :-/
What's the best way to meet new people in LA?
What is the difference between 4 wheel drive, 2 wheel drive and all wheel drive?
How do you make a milkshake?
What is your opinion about Narendra Modi's speech in the joint session of USA Congress?
Which are some professional photography jobs?

CPU times: user 513 µs, sys: 21 µs, total: 534 µs
Wall time: 544 µs


#Running Flask on collab

In [None]:
%%capture
!curl -s https://ngrok-agent.s3.amazonaws.com/ngrok.asc | sudo tee /etc/apt/trusted.gpg.d/ngrok.asc >/dev/null
!echo "deb https://ngrok-agent.s3.amazonaws.com buster main" | sudo tee /etc/apt/sources.list.d/ngrok.list
!sudo apt update && sudo apt install ngrok
!pip install flask_ngrok flask-bootstrap
!pip install flask_restful flask_cors
!cat /gdrive/MyDrive/minor_project_files/ngrok_token | xargs ngrok authtoken

In [None]:
import sys
stdout = sys.stdout
stderr = sys.stderr

In [None]:
print("test")

test


In [None]:
from flask_ngrok import run_with_ngrok
from flask import Flask, render_template , request , jsonify
from flask_restful import Resource, Api
import os, logging, sys
from flask_cors import CORS, cross_origin

# sys.stdout = open("/gdrive/MyDrive/minor_project_files/test.txt", "w", buffering=1)
# sys.stderr = open("/gdrive/MyDrive/minor_project_files/test.txt", "a", buffering=1)

app = Flask(__name__)
cors = CORS(app, resources={r"/*": {"origins": "*"}})
# cors = CORS(app)
# app.config['CORS_HEADERS'] = 'Content-Type'
api = Api(app)

run_with_ngrok(app)


class Similarity(Resource):
  # get endpoint to check server is up
    def get(self):
        return jsonify({"hello": "Server Online!"})

    def post(self):
        json_data = request.get_json(force=True)
        qn = json_data["question"]
        # similarity = get_similar(qn)
        # return list of questions
        x = ["question 1", "question 2"]
        return x


api.add_resource(Similarity, "/")
app.run()

In [None]:
sys.stdout = stdout
sys.stderr = stderr