In [None]:
#Data browsing
import glob
from google.colab import files
from google.colab import drive

#Data handling
import pandas as pd
from tqdm import tqdm
import re

#HTML pull
import requests
from lxml.html import fromstring

#Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument #Load in model and TD splits up sentences in lists of words.
import nltk
from nltk.tokenize import RegexpTokenizer #Tokenizer

import numpy as np
import scipy.spatial as sp
import logging
import  random
random.seed(sum([ord(c) for c in "KNAB"]))
import matplotlib.pyplot as plt
import getpass

# Pull data from Drive

In [None]:
# Mount Google Drive at /content/drive so the "My Drive" paths used in the following cells resolve.
drive.mount('/content/drive')

In [None]:
# List the contents of the CleanData folder on the mounted Drive (shell command via IPython's `!`).
!ls "/content/drive/My Drive/Knab/Data/CleanData/"

In [None]:
# Read the cleaned article table from the mounted Drive folder into a DataFrame.
clean_article_csv = "/content/drive/My Drive/Knab/Data/CleanData/clean_article_data.csv"
df_clean_article = pd.read_csv(clean_article_csv)

In [None]:
# Preview only the first rows instead of rendering the whole article table;
# use df_clean_article.shape if the row/column counts are needed.
df_clean_article.head()

# Start building the doc2vec model

In [None]:
# Pull the article bodies (the TEXT column) out of the DataFrame as an array.
text_column = df_clean_article['TEXT']
data = text_column.values

In [None]:
# Convert every article into a TaggedDocument: its lower-cased word tokens plus its
# position in `data` (as a string) as the single tag.
# The r'\w+' pattern keeps runs of word characters (letters, digits, underscore) and
# drops everything else, e.g. punctuation and whitespace.
tokenizer = RegexpTokenizer(r'\w+')
split_data = []
for position, raw_text in enumerate(data):
    tokens = tokenizer.tokenize(raw_text.lower())
    split_data.append(TaggedDocument(words=tokens, tags=[str(position)]))

In [None]:
#ADD SUBSAMPLING AND NEGATIVE SAMPLING
# Hyper-parameters for the Doc2Vec model trained below.
max_epochs = 600 #number of training epochs (passes over the corpus)
vec_size = 320 #vector size
alpha = 0.025 #initial learning rate
pretrained_emb = "/content/drive/My Drive/Knab/Algos/wikipedia-320.txt" #1097047 word vectors

# Emit gensim's progress messages (INFO level) with a timestamp.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha = 0.00025,
                min_count = 2,
                hs = 0,    #1 turns on hierarchical softmax; this is rarely turned on as negative sampling is in general better
                negative = 5, #number of negative samples; 5 is a good value
                sample = 1e-6, #this is the sub-sampling threshold to downsample frequent words; 1e-5 is usually good for DBOW, and 1e-6 for DMPV
                # NOTE(review): `pretrained_emb` is not a documented parameter of stock gensim Doc2Vec
                # (presumably it comes from a forked gensim build) - confirm that the installed gensim
                # really loads these vectors instead of ignoring or rejecting the argument.
                pretrained_emb=pretrained_emb,
                dm = 1, #0 = DBOW; 1 = DMPV
                dbow_words = 0, #1 turns on updating of word embeddings
                window = 9,
                dm_concat = 0, #1 = concatenate input word vectors for DMPV; 0 = sum/average input word vectors. 
                dm_mean = 1,  #1 = average input word vectors; 0 = sum input word vectors.
                epochs = max_epochs,
                seed = sum([ord(c) for c in "KNAB"]),
                workers = 4   #keyword was misspelled `worker`, so the thread count never took effect; should be set to 1 for reproducibility
                )
  
# Build the vocabulary from the tagged documents, then train for the configured number of epochs.
model.build_vocab(split_data)
model.train(split_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)

# Saved relative to the current working directory of the runtime.
model.save("d2v.model")
print("Model Saved")

In [None]:
# Retrieve the five training documents most similar to the chosen article.
article = "0"  # tag of the query article (tags are row positions stored as strings)
similar_doc = model.docvecs.most_similar(positive=[article], topn=5)

print(similar_doc)


In [None]:
# Show the query article's title followed by one line per retrieved neighbour
# ("<title> <similarity score>"). Iterating over similar_doc instead of indexing
# entries 0-4 by hand keeps the cell working when topn changes or fewer results come back.
title_lines = [df_clean_article['TITLE'][int(article)]]
title_lines += [df_clean_article['TITLE'][int(doc_tag)] + " " + str(score)
                for doc_tag, score in similar_doc]
print('\n '.join(title_lines))  # "\n " keeps the established layout: neighbour lines start with a space

In [None]:
# Same printout as the preceding cell: the query article's title, then one
# "<title> <similarity score>" line per entry in similar_doc. Looping over the
# results (rather than indexing entries 0-4) avoids an IndexError when fewer than
# five neighbours are returned.
title_lines = [df_clean_article['TITLE'][int(article)]]
title_lines += [df_clean_article['TITLE'][int(doc_tag)] + " " + str(score)
                for doc_tag, score in similar_doc]
print('\n '.join(title_lines))  # "\n " keeps the established layout: neighbour lines start with a space

# Creating similarity matrices using some similarity measure

In [None]:
# Infer a vector for every training article, scale it to unit length and collect the
# results row by row; the cosine and Pearson similarity matrices are derived from that matrix.
seed_value = sum([ord(c) for c in "KNAB"])
n_docs = len(split_data)
inferred_matrix = np.zeros([n_docs, vec_size])  # one row per article, vec_size columns
for row, tagged_doc in enumerate(split_data):
  model.random.seed(seed_value)  #reset the model's RNG before each inference so every call starts from the same state
  doc_vector = model.infer_vector(tagged_doc.words)  #inferred vector for this article's tokens
  inferred_matrix[row] = doc_vector / np.linalg.norm(doc_vector, axis=0)  #store the unit-length vector
sim_matrix = inferred_matrix.dot(inferred_matrix.T)  #dot products of unit vectors = cosine similarity
corrsim_matrix = np.corrcoef(inferred_matrix)  #Pearson correlation between article rows

url_list = df_clean_article['URL'].values

In [None]:
# Persist the Pearson similarity matrix to Drive as an .npz archive
# (an array passed positionally is stored under numpy's default key "arr_0").
pearson_npz_path = '/content/drive/My Drive/Knab/Data/CleanData/CB_pearson.npz'
np.savez(pearson_npz_path, corrsim_matrix)

In [None]:
# Write the article URLs to a CSV file on Drive (default to_csv settings, so the index is written too).
url_csv_path = "/content/drive/My Drive/Knab/Data/CleanData/url_CB.csv"
df_url = pd.DataFrame(url_list)
df_url.to_csv(url_csv_path)

# Providing the opportunity to add a new article

In [None]:
# Embed one new article with the trained model and recompute both similarity matrices
# with that article appended as the last row.
new_article = np.array([getpass.getpass('Copy the text of the new article here: ')]) #provide text of new article
new_data = [TaggedDocument(words=tokenizer.tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(new_article)]
model.random.seed(sum([ord(c) for c in "KNAB"])) #Force the same seed
new_inferred_vector = model.infer_vector(new_data[0].words)  #retrieve inferred vector for the new article from the model
new_mags = np.linalg.norm(new_inferred_vector, axis=0)
new_unit_vec = new_inferred_vector / new_mags   #normalizing the inferred vector
# np.vstack returns a NEW array. Its result used to be discarded, so the matrices below
# were recomputed without the new article. The stacked matrix gets its own name so that
# re-running this cell does not append the article to inferred_matrix a second time.
inferred_matrix_with_new = np.vstack((inferred_matrix, new_unit_vec))

sim_matrix = inferred_matrix_with_new.dot(inferred_matrix_with_new.T) #cosine similarity (rows are unit vectors)
corrsim_matrix = np.corrcoef(inferred_matrix_with_new) #pearson similarity (correlation)