In [None]:
import pandas as pd
import json
import en_core_web_sm
import spacy
import gensim.parsing.preprocessing
import pprint

In [None]:
# Relative path of the JSON input file read by the loading cell; being
# relative, it resolves against the kernel's current working directory.
input_path = '../../data/data_2021-02-01 22:27:13.862993.json'

In [None]:
# Parse the whole JSON input file into memory.
with open(input_path, encoding="utf-8") as json_file:
    data = json.load(json_file)

# Flatten the records under the 'papers' key into a table and keep only its
# 'keywords' column (one entry per paper).
papers_table = pd.json_normalize(data['papers'])
keywords_df = papers_table['keywords']

print("No of papers: {}".format(len(keywords_df)))

In [None]:
# Load spaCy's small English pipeline; its vocab is what process() uses to
# build Doc objects.
nlp = spacy.load("en_core_web_sm")
# Stop words taken from the pipeline's language defaults; process() drops any
# token found in this collection.
STOPWORDS = nlp.Defaults.stop_words

In [None]:
def process(flat_words, lemmatize=True, min_word_len=2):
  """Normalise a flat list of words into lower-cased, filtered tokens.

  Each word is first passed through gensim's `strip_non_alphanum`, then the
  cleaned words are wrapped in a spaCy `Doc` built directly on `nlp.vocab`.

  Args:
    flat_words: iterable of word strings.
    lemmatize: if True, use each token's `lemma_` (lower-cased); otherwise
      use the token's lower-cased text (`lower_`).
    min_word_len: tokens shorter than this many characters are discarded.

  Returns:
    List of tokens that are not in `STOPWORDS` and are at least
    `min_word_len` characters long, in their original order.
  """
  cleaned_words = [
      gensim.parsing.preprocessing.strip_non_alphanum(word)
      for word in flat_words
  ]
  # NOTE(review): the Doc is constructed from raw words and the nlp pipeline
  # is never run on it; depending on the installed spaCy version `lemma_`
  # may be empty for such a Doc -- confirm lemmas are actually populated.
  doc = spacy.tokens.Doc(nlp.vocab, words=cleaned_words)

  if lemmatize:
    normalised = (token.lemma_.lower() for token in doc)
  else:
    normalised = (token.lower_ for token in doc)

  return [
      candidate
      for candidate in normalised
      if candidate not in STOPWORDS and len(candidate) >= min_word_len
  ]


In [None]:
def process_corpus(keywords_df, lemmatize=True, min_word_len=2):
  """Turn every paper's keyword list into one list of normalised tokens.

  For each paper, every keyword phrase is split on single spaces and the
  words of all its keywords are concatenated into one flat list, which is
  then normalised by `process`.

  Args:
    keywords_df: object exposing `.tolist()` that yields, per paper, an
      iterable of keyword strings (in this notebook: the 'keywords' column).
    lemmatize: forwarded to `process`.
    min_word_len: forwarded to `process`.

  Returns:
    A list with one token list per paper, in the same order as the input,
    so position i still refers to paper i.
  """
  processed_corpus = []

  # [[kw11, kw12, ...], ..., [kwn1, kwn2, ...]] -- one keyword list per paper.
  for paper_keywords in keywords_df.tolist():
    # Flatten multi-word keywords: ["deep learning", "nlp"] ->
    # ["deep", "learning", "nlp"].
    paper_words = [
        word
        for keyword in paper_keywords
        for word in keyword.split(" ")
    ]
    # Forward the caller's options; they used to be ignored in favour of
    # hard-coded (True, 2).
    processed_corpus.append(process(paper_words, lemmatize, min_word_len))

  return processed_corpus


In [None]:
from gensim import corpora

# Tokenise the keywords of every paper, then build the gensim dictionary
# that maps each distinct token to an integer id.
processed_corpus = process_corpus(keywords_df)
dictionary = corpora.Dictionary(processed_corpus)

# Bag-of-words representation of every paper, via the dictionary's doc2bow.
bow_corpus = list(map(dictionary.doc2bow, processed_corpus))


In [None]:
from gensim import models

# Fit the TF-IDF model on the bag-of-words corpus; documents are later
# re-weighted with it through `tfidf[...]`.
tfidf = models.TfidfModel(bow_corpus)

In [None]:
from gensim import similarities

# Build the similarity index over the TF-IDF vectors of all papers; the
# number of features is the size of the dictionary's token -> id mapping.
index = similarities.SparseMatrixSimilarity(tfidf[bow_corpus], num_features=len(dictionary.token2id))

In [None]:
from collections import defaultdict

def compute_similarities(processed_corpus, threshold=0.9):
  """Collect, for every document, the other documents it is similar to.

  Each document is converted to a TF-IDF vector and queried against the
  module-level `index` (using the module-level `dictionary` and `tfidf`).

  Args:
    processed_corpus: sequence of token lists, one per document.
    threshold: minimum score (inclusive) for a match to be kept.

  Returns:
    A defaultdict(list) mapping a document's position to a list of
    `(other_position, score)` tuples sorted by descending score. The
    document itself is never listed, and documents without any match have
    no key.
  """
  # Named so it does not shadow the gensim `similarities` module.
  matches_by_doc = defaultdict(list)

  for query_idx, query_tokens in enumerate(processed_corpus):
    query_vector = tfidf[dictionary.doc2bow(query_tokens)]
    scores = index[query_vector]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    for other_idx, score in ranked:
      if other_idx != query_idx and score >= threshold:
        matches_by_doc[query_idx].append((other_idx, score))

  return matches_by_doc


In [None]:
# Compute similarities between every pair of papers: with a threshold of 0.0
# every other paper whose score is >= 0.0 is kept (self-matches are excluded
# by compute_similarities).
sims = compute_similarities(processed_corpus, 0.0)

In [None]:
from collections import Counter
from math import floor
from tqdm import tqdm

# Group every unordered pair of papers into similarity buckets of width 0.1
# (bucket b covers [b/10, (b+1)/10); the last bucket holds scores of 1.0)
# and keep, per bucket, the running mean number of tokens the two papers share.
N_BUCKETS = 11

avg_kws_thr = [0] * N_BUCKETS  # running mean of shared tokens, per bucket
last_mean = [0] * N_BUCKETS    # number of pairs folded into each mean so far

for paper_idx, neighbours in tqdm(sims.items()):
  # Build the query paper's token multiset once, not once per neighbour.
  curr_paper_counts = Counter(processed_corpus[paper_idx])
  for other_idx, score in neighbours:
    # Only handle a pair from its lower-indexed side so it is counted once.
    if other_idx <= paper_idx:
      continue
    # Clamp defensively so a score marginally outside [0, 1] cannot index
    # out of range (or wrap around via a negative index).
    thr = min(max(floor(score * 10), 0), N_BUCKETS - 1)
    # Multiset intersection: a token repeated in both papers counts
    # min(count_a, count_b) times.
    other_counts = Counter(processed_corpus[other_idx])
    n_common = sum((curr_paper_counts & other_counts).values())
    last_mean[thr] += 1
    # Incremental mean: mean_n = mean_{n-1} + (x_n - mean_{n-1}) / n
    avg_kws_thr[thr] += (n_common - avg_kws_thr[thr]) / last_mean[thr]

for thr in range(N_BUCKETS):
  # Label the bucket by its actual score range; the old "0.{}%" label printed
  # "0.10%" for the top bucket and mislabelled a 0-1 score as a percentage.
  if thr == N_BUCKETS - 1:
    label = "of 1.0"
  else:
    label = "in [{:.1f}, {:.1f})".format(thr / 10, (thr + 1) / 10)

  if last_mean[thr] == 0:
    # Do not report the initial 0 as if it were a measured average.
    print("a similarity {} between papers: no paper pairs".format(label))
  else:
    print("a similarity {} between papers corresponds to {:.2f} common words on average ({} pairs)".format(
        label, avg_kws_thr[thr], last_mean[thr]))