<a href="https://colab.research.google.com/github/AI-Growth-Lab/Patent_p2p_similarity_w2v/blob/main/Patent_W2V_v2_version2_Example.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --upgrade gensim -q

[K     |████████████████████████████████| 24.1 MB 1.4 MB/s 
[?25h

In [2]:
# Show which NumPy release the kernel is running.
import numpy
numpy.__version__

'1.21.6'

In [3]:
# Print the gensim release the kernel is running (installed/upgraded by the pip cell at the top).
import gensim
print(gensim.__version__)

4.2.0


In [4]:
import numpy as np
import pandas as pd
import os
from gensim.models import Word2Vec

import json
import gensim
from nltk import sent_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
stopwords = nltk.corpus.stopwords.words('english')

import h5py
import tqdm

import itertools
import gensim, logging
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel
from gensim.matutils import sparse2full

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import gc
from multiprocessing import Pool
# from gensim.test.utils import datapath
from gensim.models.word2vec import Text8Corpus
from gensim.models.phrases import Phrases, ENGLISH_CONNECTOR_WORDS
from gensim.parsing.preprocessing import remove_stopwords



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
def ele0(x):
    """Return the item stored at index 0 of ``x``."""
    first = x[0]
    return first
    
# Loads from JSON
def json_l(x):
    """Decode a JSON string, falling back to an empty list.

    Parameters
    ----------
    x : str, bytes or bytearray
        JSON text. Any other value (for example ``None`` or a float NaN
        coming from a missing DataFrame cell) is treated as undecodable.

    Returns
    -------
    object
        The decoded value, or ``[]`` when ``x`` cannot be decoded.
    """
    try:
        return json.loads(x)
    except (ValueError, TypeError):
        # ValueError: malformed JSON text.
        # TypeError: json.loads was handed something that is not text at all.
        return []
        
# Preprocessing (sentence tokenizer + basic Gensim preprocessing: lowercases, tokenizes, de-accents (optional))
def prepro(x):
    """Split ``x`` into sentences and tokenise each one.

    Every sentence produced by ``sent_tokenize`` is passed to
    ``gensim.utils.simple_preprocess`` with ``min_len=1``.

    Returns a list with one token list per sentence.
    """
    return [
        gensim.utils.simple_preprocess(sentence, min_len=1)
        for sentence in sent_tokenize(x)
    ]
    
def bigrammer(x, bigram=None):
    """Apply the bigram phraser to every sentence of one document.

    Parameters
    ----------
    x : iterable
        The sentences of a single document, each a list of tokens.
    bigram : optional
        An already loaded phraser. When omitted, the phraser is loaded from
        ``'bigram.m'`` on every call; pass a loaded model when mapping over
        many documents to avoid re-reading the file each time.

    Returns
    -------
    list
        ``bigram[sentence]`` for each sentence, in order.
    """
    if bigram is None:
        bigram = gensim.models.phrases.Phraser.load('bigram.m')
    return [bigram[sentence] for sentence in x]


def tfidfer(para, model_tfidf=None, docs_dict=None):
    """Return the TF-IDF representation of one document.

    Parameters
    ----------
    para : iterable
        The sentences of a single document, each a list of tokens; they are
        flattened into one token stream before the bag-of-words is built.
    model_tfidf : optional
        An already loaded TF-IDF model. Loaded from ``'model_tfidf_11_2.m'``
        when omitted.
    docs_dict : optional
        An already loaded gensim ``Dictionary``. Loaded from
        ``'docs_dict_11_2.d'`` when omitted.

    Returns
    -------
    The result of ``model_tfidf[docs_dict.doc2bow(tokens)]``.
    """
    if model_tfidf is None:
        model_tfidf = gensim.models.TfidfModel.load('model_tfidf_11_2.m')
    if docs_dict is None:
        docs_dict = Dictionary.load('docs_dict_11_2.d')
    return model_tfidf[docs_dict.doc2bow(itertools.chain(*para))]


def preprocess_1(data):
    """Add the JSON-encoded, sentence-tokenised abstracts to ``data``.

    Reads ``data.appln_abstract_st_r`` and writes the column
    ``'appln_abstract_prepro'``. ``data`` is modified in place and returned.
    """
    tokenised = list(map(prepro, data.appln_abstract_st_r))
    data['appln_abstract_prepro'] = list(map(json.dumps, tokenised))
    return data


def preprocess_2(data):
    """Add the bigram-phrased abstracts to ``data``.

    Reads ``'appln_abstract_prepro'`` and writes the columns ``'no_json'``
    (decoded token lists) and ``'appln_abstract_prepro_bi'`` (JSON-encoded
    phrased token lists). ``data`` is modified in place and returned.
    """
    # Load the phraser once instead of once per row.
    bigram = gensim.models.phrases.Phraser.load('bigram.m')
    data['no_json'] = list(map(json.loads, data['appln_abstract_prepro']))
    phrased = [bigrammer(doc, bigram) for doc in data['no_json']]
    data['appln_abstract_prepro_bi'] = list(map(json.dumps, phrased))
    return data


def preprocess_3(data):
    """Add the JSON-encoded TF-IDF representation of every abstract to ``data``.

    Reads ``'appln_abstract_prepro_bi'`` and writes the columns
    ``'no_json_tf'`` (decoded token lists) and ``'tfidf'``. ``data`` is
    modified in place and returned.
    """
    # Load the model and dictionary once instead of once per row.
    model_tfidf = gensim.models.TfidfModel.load('model_tfidf_11_2.m')
    docs_dict = Dictionary.load('docs_dict_11_2.d')
    data['no_json_tf'] = list(map(json.loads, data['appln_abstract_prepro_bi']))
    weighted = [tfidfer(doc, model_tfidf, docs_dict) for doc in data['no_json_tf']]
    data['tfidf'] = list(map(json.dumps, weighted))
    return data

def get_data_path(filename, data_dir='/content'):
    """Build the path of a data file.

    Parameters
    ----------
    filename : str
        Name of the file inside ``data_dir``.
    data_dir : str, optional
        Directory holding the data files. Defaults to ``'/content'``.

    Returns
    -------
    str
        ``data_dir/filename``. When ``data_dir`` is not ``'.'`` and the
        environment variable ``DEEP_QUANT_ROOT`` is set, its value is
        prepended with ``os.path.join``. Because ``os.path.join`` discards
        earlier components once it meets an absolute one, the variable has
        no effect for an absolute ``data_dir`` such as the default.
    """
    path = os.path.join(data_dir, filename)
    root = os.environ.get('DEEP_QUANT_ROOT')
    if data_dir != '.' and root is not None:
        path = os.path.join(root, path)
    return path
    
def abstract_generator_nonmult(df):
    """Yield the flattened token list of every abstract in ``df``, once.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``'appln_abstract_prepro_bi'`` holding
        JSON-encoded lists of token lists (one inner list per sentence).

    Yields
    ------
    list
        All tokens of one abstract, sentences concatenated in order. A cell
        that ``json_l`` cannot decode yields an empty list.

    Notes
    -----
    Missing cells (``None``/NaN) are skipped, so fewer items than
    ``len(df)`` are produced when the column contains nulls. The generator
    makes a single pass over the column and then ends.
    """
    column = df['appln_abstract_prepro_bi']
    for raw in column.dropna():
        sentences = json_l(raw)
        yield list(itertools.chain.from_iterable(sentences))
    

In [7]:
# Load the patent sample straight from GitHub; the next cell reads its `appln_abstract` column.
data = pd.read_csv('https://raw.githubusercontent.com/AI-Growth-Lab/Patent_p2p_similarity_w2v/main/patent_dtatset_sample_1k.csv')

In [8]:
# Run gensim's remove_stopwords over every abstract and store the result
# in the column that preprocess_1 reads.
tokens_without_sw = list(map(remove_stopwords, data['appln_abstract']))
data['appln_abstract_st_r'] = tokens_without_sw

In [9]:
# preprocess_1 adds the 'appln_abstract_prepro' column to `data` and returns the same frame.
data_1 = preprocess_1(data)
# Write through get_data_path() so the file lands exactly where the corpus
# reader looks for it, independent of the current working directory.
data_1.to_csv(get_data_path('data.csv'))

In [10]:
# Corpus object over the file returned by get_data_path('data.csv').
# NOTE(review): Text8Corpus is pointed at the CSV itself, so the training text
# presumably contains every column of data.csv (raw text and JSON punctuation
# included), not just the token lists — confirm this is intended.
sentences = Text8Corpus(get_data_path('data.csv'))

In [11]:
# Learn bigram phrases from the corpus.
# NOTE(review): min_count=1 and threshold=1 look very permissive — confirm they are deliberate for the small sample.
bigram = Phrases(sentences, min_count=1, threshold=1, connector_words=ENGLISH_CONNECTOR_WORDS)

In [12]:
# Wrap the trained Phrases model in a Phraser.
# NOTE(review): this rebinds `bigram`, so re-running this cell alone feeds the Phraser back into Phraser().
bigram = gensim.models.phrases.Phraser(bigram)

In [13]:
# Persist the phraser under the file name that bigrammer() loads from.
bigram.save('bigram.m')

In [14]:
# Adds the 'no_json' and 'appln_abstract_prepro_bi' columns; preprocess_2 returns
# the frame it was given, so data_2 and data_1 are the same object.
data_2 = preprocess_2 (data_1)



In [15]:
# Overwrite the corpus file (now including the bigram columns) at the location
# the Text8Corpus reader uses, rather than relative to the working directory.
data_2.to_csv(get_data_path('data.csv'))

In [16]:
# Train 300-dimensional Word2Vec embeddings on the corpus.
# NOTE(review): no `seed` is passed and workers=14 is hard-coded, so repeated runs
# presumably give different vectors — confirm whether reproducibility matters here.
model_with_phrases = Word2Vec(sentences=sentences, vector_size=300, window=80, min_count=1, workers=14)

In [17]:
# Persist the trained model to 'w2v.m'.
model_with_phrases.save('w2v.m')

In [18]:
# Vocabulary of the trained model (the `index_to_key` list of its word vectors); used to build the Dictionary below.
voc = model_with_phrases.wv.index_to_key

In [19]:
# Build Gensim Dictionary from the Word2Vec vocabulary, passed as a single document
docs_dict = Dictionary([voc])
docs_dict.compactify()
# Saved under the file name that tfidfer() loads from.
docs_dict.save('docs_dict_11_2.d')

In [20]:
# Fit the TF-IDF model on the bag-of-words of every item yielded by `sentences`.
model_tfidf = TfidfModel((docs_dict.doc2bow(x) for x in sentences), id2word=docs_dict)
# Saved under the file name that tfidfer() loads from.
model_tfidf.save('model_tfidf_11_2.m')

In [21]:
# preprocess_3 adds the 'no_json_tf' and 'tfidf' columns and returns the same frame.
data_3 = preprocess_3(data_2)
# Keep writing to the one location resolved by get_data_path(), not to the
# current working directory.
data_3.to_csv(get_data_path('data.csv'))
# Lazy stream of flattened token lists, one per abstract.
iterator = abstract_generator_nonmult(data_3)

In [22]:
# TF-IDF weighted document matrix, produced lazily: one dense row of
# length len(docs_dict) per token list drawn from `iterator`.
docs_vecs = (
    sparse2full(model_tfidf[docs_dict.doc2bow(tokens)], len(docs_dict))
    for tokens in iterator
)

In [23]:
# Selected Word-Embeddings: one row per dictionary id, in id order.
emb_vecs_selftrained = np.vstack([model_with_phrases.wv[docs_dict.get(i)] for i in range(len(docs_dict))])
n_abstracts = len(data_2)
h5f = h5py.File('docvecs_23_4_test.h5', 'a')
# The file is opened in append mode, so a dataset left over from an earlier run
# would make create_dataset fail; drop it first so the cell can be re-run.
if 'weighted_tfidf_docvecs' in h5f:
    del h5f['weighted_tfidf_docvecs']
# Take the vector width from the embeddings instead of repeating the literal 300.
dataset = h5f.create_dataset('weighted_tfidf_docvecs', (n_abstracts, emb_vecs_selftrained.shape[1]))

In [24]:
# Generates document-vectors: (TF-IDF document matrix) x (embedding matrix).
pbar = tqdm.tqdm(total=n_abstracts)
start = 0
try:
    # np.vstack needs a real sequence; handing it a generator is deprecated and
    # rejected by newer NumPy releases. islice draws at most n_abstracts rows.
    a = np.vstack(list(itertools.islice(docs_vecs, n_abstracts)))
    pbar.update(len(a))
    b = np.dot(a, emb_vecs_selftrained)
    end = start + len(b)
    dataset[start:end] = b
    start = end
finally:
    # Always release the progress bar and the HDF5 handle, even on failure.
    pbar.close()
    h5f.close()

  
100%|██████████| 1000/1000 [00:01<00:00, 576.61it/s]


In [25]:
# Reopen the document vectors read-only and display the row at index 1.
# NOTE(review): this handle is not closed anywhere in the visible cells, and
# `dataset` is rebound to a different dataset further down.
h5f = h5py.File('docvecs_23_4_test.h5', 'r')
dataset = h5f['weighted_tfidf_docvecs']
dataset[1]

array([-0.05205272, -0.461966  ,  0.4014555 ,  0.5138215 , -0.31621286,
       -0.62124974, -0.25785035,  1.3022149 ,  0.14012174, -0.9412803 ,
        1.035978  ,  0.09522733, -1.055949  ,  1.2118465 , -0.43018413,
       -0.686788  ,  0.9686846 , -0.58684635, -0.1631586 , -0.9776744 ,
        0.01906457,  1.0731976 ,  0.8642982 ,  1.264511  , -0.2250568 ,
       -0.54605645, -0.51214844,  0.80090696, -0.6588723 , -0.67597836,
        1.7397033 , -0.19973068,  1.0296581 , -1.1289895 ,  0.7445261 ,
       -0.9251042 , -0.08644897, -1.0327184 ,  0.5954751 , -0.2610794 ,
       -0.6402455 , -0.17841025, -0.44318157, -1.2373002 , -0.13993335,
        0.03662474, -0.7218325 ,  0.11164966,  0.02642337, -0.08000155,
        0.0187408 ,  0.8784889 , -0.5367489 ,  0.4813019 ,  0.00459789,
       -0.35290617,  0.604631  , -0.6003416 , -0.41737077,  0.48500222,
        0.6599798 , -0.41417733,  0.49715832,  1.3559908 , -0.64288056,
        0.9960632 , -0.44202578, -0.3869542 , -0.92245907,  0.20

In [26]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.spatial` has been loaded.
import scipy.spatial.distance
# Cosine distance between the stored vectors at rows 9 and 8.
scipy.spatial.distance.cosine(dataset[9],dataset[8])

0.5613730847835541

In [27]:
# Annoy
!pip install annoy
from annoy import AnnoyIndex
t = AnnoyIndex(dataset.shape[1])  # Length of item vector that will be indexed

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting annoy
  Downloading annoy-1.17.1.tar.gz (647 kB)
[K     |████████████████████████████████| 647 kB 4.0 MB/s 
[?25hBuilding wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.1-cp37-cp37m-linux_x86_64.whl size=395183 sha256=b7ff3fb240c88dc4a3939c90c3357c98d50e21930d58966113a89689fb149e1d
  Stored in directory: /root/.cache/pip/wheels/81/94/bf/92cb0e4fef8770fe9c6df0ba588fca30ab7c306b6048ae8a54
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.1


  after removing the cwd from sys.path.


In [28]:
# Have the index built in the file 'vecs_build.annoy' (the file loaded again further down).
t.on_disk_build('vecs_build.annoy')

True

In [29]:
# Add every row of `dataset` to the index under its row number.
# NOTE(review): `dataset` must still be the document-vector dataset here; a later
# cell rebinds the name, so re-running this cell out of order would index the wrong data.
for i in tqdm.tqdm(range(dataset.shape[0])):
    t.add_item(i, dataset[i])

100%|██████████| 1000/1000 [00:00<00:00, 8882.40it/s]


In [30]:
# Build the index; 20 is presumably the number of trees — confirm against the Annoy docs.
t.build(20)

True

In [31]:
# Reopen the on-disk index for querying. The metric is named explicitly because
# relying on the implicit default triggers a deprecation warning; 300 must match
# the vector length the index was built with.
tt = AnnoyIndex(300, 'angular')
tt.load('vecs_build.annoy', prefault=False)

  """Entry point for launching an IPython kernel.


True

In [32]:
# Open the HDF5 file that will receive the similarity rows (mode 'a').
simH5 = h5py.File('similarities_23_2.h5', 'a')

In [33]:
# The file is opened in append mode, so a 'sims_50' dataset left over from an
# earlier run would make create_dataset fail; drop it so the cell can be re-run.
if 'sims_50' in simH5:
    del simH5['sims_50']
# Three columns per row; resizable along the first axis.
dataset = simH5.create_dataset('sims_50',(10000000,3), maxshape=(None,3), compression="gzip")

In [34]:
# Display a tenth of the number of indexed items (nothing is assigned).
round(t.get_n_items()/10)

100

In [35]:
# For every indexed item, fetch its nearest neighbours and store rows of
# (item, neighbour, distance) in `dataset`, buffered in batches of 1000 items.
pbar = tqdm.tqdm(total=tt.get_n_items())

blocks = []
start = 0
end = 0
counter = 0
n_items = tt.get_n_items()
try:
    for i in range(n_items):
        j = 100
        tresh = 0.0
        # Ask for more neighbours, 50 at a time, until the furthest one returned
        # is at distance >= 0.5 or the request size reaches 500.
        while tresh < 0.5:
            y = tt.get_nns_by_item(i, j, search_k=50, include_distances=True)
            tresh = y[1][-1]
            j += 50
            if j >= 500:
                break
        block = np.array(list(zip([i for _ in range(len(y[0]))],y[0],y[1])))
        end = end + len(block)
        blocks.append(block)
        counter += 1
        # Flush a full batch, and also whatever is buffered after the last item;
        # otherwise the final partial batch would never reach the file.
        if counter == 1000 or i == n_items - 1:
            blocks_np = np.vstack(blocks)
            if end >= dataset.shape[0]:
                dataset.resize((end+(end - dataset.shape[0]),3))
            dataset[start:end] = blocks_np
            blocks = []
            gc.collect()
            start = end
            pbar.update(counter)
            counter = 0
finally:
    # Always release the progress bar and the HDF5 handle, even on failure.
    pbar.close()
    simH5.close()

100%|██████████| 1000/1000 [00:00<00:00, 1143.01it/s]

In [36]:
def lookup(x):
    """Return the neighbours of item ``x`` as rows of ``(x, neighbour, distance)``.

    Queries the module-level index ``tree`` for ``j`` neighbours with
    ``search_k=5`` and stacks the result into a NumPy array.

    NOTE(review): neither ``tree`` nor ``j`` is defined next to this function;
    both are read from the global namespace — confirm they exist before calling.
    """
    neighbours, distances = tree.get_nns_by_item(x, j, search_k=5, include_distances=True)
    rows = zip([x] * len(neighbours), neighbours, distances)
    return np.array(list(rows))