In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
def top_n_vocab(vocab_path, n):
    '''
    It will take the vocabulary path and return the top n vocabulary.
    And Sort it by lexiographical order.
    '''
    df = pd.read_csv(vocab_path)
    vocab = df['word'][:n]
    vocab_list = vocab.tolist()
    vocab_list = [str(i) for i in vocab_list]
    vocab_list = sorted(vocab_list)
    return vocab_list[1:]



In [4]:
def create_index(vocab_list):
    '''
    It will take the vocab list and return a index dictionary from it.
    index_dict={token : lexiological rank}
    '''
    index_dict = dict()
    ind = 0
    for i in vocab_list:
        index_dict[i] = ind
        ind += 1
    return index_dict

In [5]:
def tokenize_corpus(batch_number):
    '''
    It will read the text file at corpus path and return the tokenized corpus.
    '''
    corpus_path ='/content/drive/MyDrive/DataSets/pdf_json2/lemmatized_batch_' + str(batch_number) +'.txt'
    with open(corpus_path, 'r') as f:
        corpus = f.readlines()
    corpus_token = list()
    for doc in corpus:
        corpus_token +=doc.split(' ')
    return corpus_token

In [6]:

def co_occurance_matrix(ind_dict, tokens, mat, window=4):
    '''
    This function will return the co-occurance matrix.
    It will take a dictionary with rank of each token and 
    token list. It will check if the word is available at 
    the dictionary, if available then it will take the index
    of the word and add corresponding value to the matrix.
    '''
    for i in tqdm(range(len(tokens))):
        try:
            ind = ind_dict[tokens[i]]
            k = window
            if i>len(tokens)-window:
                for word in tokens[i+1:len(tokens)]:
                    try:
                        ind2 = ind_dict[word]
                        mat[ind, ind2] += k
                        mat[ind2, ind] +=k
                    except:
                        continue
                    k -= 1
            else:
                for word in tokens[i+1:i+1+window]:
                    try:
                        ind2 = ind_dict[word]
                        mat[ind, ind2] += k
                        mat[ind2, ind] +=k
                    except(KeyError):
                        continue
                    k -= 1
        except:
            continue
    return mat

In [7]:
def corpus_cooc(k, index_dict):
    '''
    take index dictionary and run through all the batches and calculate the co occurance matrix.
    '''
    n = 14000
    mat = np.zeros((n,n))
    for batch_number in range(1,k):
        start_time = time.time()
        tokens = tokenize_corpus(batch_number)
        mat = co_occurance_matrix(index_dict, tokens, mat)
        end_time = time.time()
        print(f"for {batch_number}'th batch co occurance matrix took {round((end_time-start_time),2)} seconds ")
    return mat

In [8]:
vocab_path = '/content/drive/MyDrive/DataSets/pdf_json2/frequency_dataframe.csv'
n = 14000
vocab_list = top_n_vocab(vocab_path, n)


In [9]:
vocab_list[:5]

['aaa', 'aac', 'aad', 'aag', 'aat']

In [10]:
index_dict = create_index(vocab_list)
mat = corpus_cooc(10, index_dict)

100%|██████████| 13571052/13571052 [01:23<00:00, 162626.04it/s]


for 1'th batch co occurance matrix took 86.32 seconds 


100%|██████████| 13500995/13500995 [01:21<00:00, 166464.98it/s]


for 2'th batch co occurance matrix took 83.87 seconds 


100%|██████████| 13680026/13680026 [01:24<00:00, 162014.22it/s]


for 3'th batch co occurance matrix took 87.15 seconds 


100%|██████████| 13344876/13344876 [01:22<00:00, 161426.57it/s]


for 4'th batch co occurance matrix took 85.32 seconds 


100%|██████████| 13387568/13387568 [01:21<00:00, 165247.31it/s]


for 5'th batch co occurance matrix took 84.93 seconds 


100%|██████████| 13900885/13900885 [01:25<00:00, 161730.64it/s]


for 6'th batch co occurance matrix took 88.92 seconds 


100%|██████████| 13933662/13933662 [01:24<00:00, 165866.91it/s]


for 7'th batch co occurance matrix took 86.78 seconds 


100%|██████████| 13877469/13877469 [01:24<00:00, 163851.03it/s]


for 8'th batch co occurance matrix took 87.55 seconds 


100%|██████████| 13554844/13554844 [01:23<00:00, 161524.73it/s]


for 9'th batch co occurance matrix took 87.04 seconds 


In [None]:
np.save('/content/drive/MyDrive/DataSets/cooccur_mat.npy', mat)

In [None]:
mat = np.load('/content/drive/MyDrive/DataSets/cooccur_mat.npy')

In [None]:
mat.shape

(14000, 14000)

In [11]:
def correlation(mat):
    '''
    Calculate Correlation from the COALS paper.
    '''
    T = np.sum(mat)
    n = mat.shape[0]
    cor_mat = np.zeros((n,n))
    sum_w_jb = np.sum(mat, axis = 1)
    sum_w_aj = np.sum(mat, axis = 0)
    for i in tqdm(range(n)):
        for j in range(n):
            if np.sqrt(sum_w_jb[i]*sum_w_aj[j]*(T-sum_w_jb[i])*(T-sum_w_aj[j])) == 0:
                cor_mat[i,j] = 0
            else:
                cor_mat[i,j] = (T*mat[i,j] - sum_w_jb[i]*sum_w_aj[j])/np.sqrt(sum_w_jb[i]*sum_w_aj[j]*(T-sum_w_jb[i])*(T-sum_w_aj[j]))
            if cor_mat[i,j] <0 :
                cor_mat[i,j] = 0
    return np.sqrt(cor_mat)

In [12]:
cor_mat = correlation(mat)

100%|██████████| 14000/14000 [20:18<00:00, 11.49it/s]


In [13]:
np.save('/content/drive/MyDrive/DataSets/cor_mat.npy', cor_mat)

In [16]:
cor_mat = np.load('/content/drive/MyDrive/DataSets/cor_mat.npy')

In [17]:
from sklearn.utils.extmath import randomized_svd
def corel_svd(m):
    U, S, V = randomized_svd(m, n_components = 50, n_iter = 7)
    return U

In [18]:
vec = corel_svd(cor_mat)
#runtime crashed



In [19]:
np.save('/content/drive/MyDrive/DataSets/svd_vecs.npy', vec)

In [20]:
from gensim.models import word2vec
import logging

In [21]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
corpus = open('/content/drive/MyDrive/DataSets/pdf_json2/corpus_fin.txt', 'r')
stream = word2vec.LineSentence(corpus)
model = word2vec.Word2Vec(size = 50, hs = 1, sg = 1, window = 4, workers = 8, max_final_vocab=15000)
model.build_vocab(stream)
wordVectors = vec.astype('float32')
model.wv.vectors = wordVectors 
model.train(stream, total_examples = model.corpus_count, epochs = 5)
model.save("wordvector.model")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
2022-06-03 12:20:08,200 : INFO : EPOCH 2 - PROGRESS: at 42.24% examples, 73412 words/s, in_qsize 15, out_qsize 0
2022-06-03 12:20:09,464 : INFO : EPOCH 2 - PROGRESS: at 42.35% examples, 73423 words/s, in_qsize 15, out_qsize 0
2022-06-03 12:20:10,589 : INFO : EPOCH 2 - PROGRESS: at 42.42% examples, 73427 words/s, in_qsize 16, out_qsize 0
2022-06-03 12:20:11,629 : INFO : EPOCH 2 - PROGRESS: at 42.50% examples, 73437 words/s, in_qsize 15, out_qsize 0
2022-06-03 12:20:12,798 : INFO : EPOCH 2 - PROGRESS: at 42.58% examples, 73420 words/s, in_qsize 15, out_qsize 0
2022-06-03 12:20:13,855 : INFO : EPOCH 2 - PROGRESS: at 42.67% examples, 73420 words/s, in_qsize 15, out_qsize 0
2022-06-03 12:20:14,901 : INFO : EPOCH 2 - PROGRESS: at 42.74% examples, 73420 words/s, in_qsize 15, out_qsize 0
2022-06-03 12:20:15,935 : INFO : EPOCH 2 - PROGRESS: at 42.81% examples, 73433 words/s, in_qsize 15, out_qsize 0
2022-06-03 12:20:17,128 : INFO 

In [27]:
keywords = ['virus', 'data', 'disease', 'protein', 'treatment']
for keyword in keywords:
    print()
    print(f'Top 5 most similar words to the root word: {keyword}')
    print(model.wv.most_similar(keyword, topn=5))


Top 5 most similar words to the root word: virus
[('viruses', 0.8978314399719238), ('poliovirus', 0.8665664196014404), ('paramyxovirus', 0.8472728133201599), ('adenoviruses', 0.8442724943161011), ('adenovirus', 0.8329476118087769)]

Top 5 most similar words to the root word: data
[('collate', 0.8093036413192749), ('anonymized', 0.8036049008369446), ('compile', 0.7802772521972656), ('datasets', 0.7765321135520935), ('analysed', 0.7749858498573303)]

Top 5 most similar words to the root word: disease
[('diseases', 0.901971697807312), ('noncommunicable', 0.8086387515068054), ('chronic', 0.7940521836280823), ('infectious', 0.7901886105537415), ('communicable', 0.7900318503379822)]

Top 5 most similar words to the root word: protein
[('proteins', 0.9585700035095215), ('polypeptide', 0.8674160838127136), ('glycoprotein', 0.8411242365837097), ('subunits', 0.8410493731498718), ('phosphoprotein', 0.8408096432685852)]

Top 5 most similar words to the root word: treatment
[('therapy', 0.91638201