<a href="https://colab.research.google.com/github/AlirezaPNouri/Storytelling/blob/main/doc2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This code is design to generate the document embedding for comparison with other techniques such as Bert, my proposed method and Jaccard Index approach
Author: Alireza Nouri

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import string
import requests
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument 
from nltk.tokenize import word_tokenize
import copy

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
#constant variables
dataset_covid_link = 'https://raw.githubusercontent.com/AlirezaPNouri/Storytelling/main/datasets/short_parsed_covid_data.csv'
dataset_NYT_link = 'https://raw.githubusercontent.com/AlirezaPNouri/Storytelling/main/datasets/short_parsed_NYT_data.csv'
dataset_percentage = .6
MAX_DATA = 100
MAX_SENTENCE_LENGTH = 350
MIN_SENTENCE_LENGTH = 200
doc_list =[item for item in range(MAX_DATA)]
jump = 3 # jump size for gradient decent
MAX_FEATURE_SIZE = 20 # max size of the gradient descent vector
range_n_clusters = [2, 3, 4, 5, 6] #number of different clustering to compare the performance of the model over different clustering
threshold_cluster_overlapping = 0.7 # to consider two clsuters are similar
stopwords = ['!','@','#','$','%','^','&','*','?','-','_','.','i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves','he','him','his','himself','she','her','hers','herself','it','its','itself','they','them','their','theirs','themselves','what','which','who','whom','this','that','these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same','so','than','too','very','s','t','can','will','just','don','should','now', 'of', 'within']


In [4]:
# download the dataset as a zip file from the git repo
def dataset_downloader(str_):
  """
  This function download a dataset
  Arguments:
    str_: the name of the dataset. It can be covid or NYT
  Returns:
    df: a pandas dataframe
  """
  if str_ == 'covid':
    url = dataset_NYT_link
  elif str_ == 'NYT':
    url = dataset_covid_link
  else:
    print('dataset is not choose correctly!')

  print('Downloading dataset...')
  res = requests.get(url, allow_redirects=True)
  with open('small_dataset_NYT.csv','wb') as file:
      file.write(res.content)
  print('Download is done!')
  df = pd.read_csv("/content/small_dataset_NYT.csv", header= None, skiprows=1)
  if str_ == 'NYT':
    df.columns = ['id', 'title', 'content', 'publish_time', 'author']
  elif str_ == 'covid':
    df.columns = ['id', 'title', 'content', 'author', 'publish_time']
  
  df = df.dropna(how='any', axis=0)
  print('dataset size after removing non-value cells is {}'.format(df.shape))
  # reduce the size of dataset to dataset_percentage*dataset.shape
  df = df.sample(frac=dataset_percentage)
  print('The new size of dataset is {} and the columns are {}'.format(df.shape, df.columns.values ))
  # remove the content that has less than MIN_SENTENCE_LENGTH words 
  df = df[df['content'].apply(lambda x : len(x.split(' ')))>=MIN_SENTENCE_LENGTH]

  # keep the first 500 words in content
  df['content'] = df['content'].apply(lambda x : ' '.join(x.split(' ')[:MAX_SENTENCE_LENGTH]))
  return df


In [5]:
def special_char_remover(str_):
  """
  This function removes any special characters from a text
  Arguments:
    str_: a string
  Returns:
    A string
  """
  return ' '.join(''.join(w for w in m if w.isalnum()) for m in str_.split(' '))

In [6]:
#### Create a list of dataframe out of the dataset. Each dataframe belongs to a timestamp #############
#######################################################################################################
# use 10 days as time interval
def timestamps_generator(df_):
  """
  This function collect data related to each timestamp separately in a dataframe
  Arguments:
    df_: a pandas dataframe
  Returns:
    df_list: a list of all timestamps data [df_1, df_2, ...]
  """
  time_stamp = set()
  df_list= list()
  for index, row in df_.iterrows():
    time_stamp.add(row['publish_time'][0:4])
  for time_interval in time_stamp:
    new_df = df_[df_['publish_time'].str.slice(0, 4)== time_interval]
    if new_df.shape[0]>5: #minimum number of doc in a timestamp
      df_list.append(new_df)
  return df_list

In [7]:
def time_fixer(str_):
  """
  This function convert a date from x/x/xxxx to 0x0xxxxx
  Arguments:
    str_: a date in a string format
  Returns:
    a string without any / and all months and days are in two digit
  """
  t_slot = str_.split('/')
  t_slot[0] = t_slot[0] if len(t_slot[0]) == 2 else '0'+t_slot[0]
  t_slot[1] = t_slot[1] if len(t_slot[1]) == 2 else '0'+t_slot[1]
  return ''+t_slot[0]+t_slot[1]+t_slot[2]

In [8]:
def find_similarity(Embed_):
  """
  This function calculate the cosine similarity between each pair of words in a document and sort them as a new vector
  Arguments:
    Embed_: a dictionary of all doc and embedding seperated into their timestamps {timestamp1 {doc1{token1: embedding, token2: embedding}}}
  Returns:
    neighbor_ts_doc: a dictionary of all timestamps, docs and the neighbor similarity for each word
  """
  tmp_collector = dict()
  neighbor_ts_doc = dict()
  for ts_ in Embed_.keys():
    temp = Embed_[ts_]
    final_list = dict()
    for doc in temp.keys():
      if doc in tmp_collector.keys():
        final_list[doc] = tmp_collector[doc]
      else:
        neighbor_dict = dict()
        track_dict = dict()
        
        for f_word, f_emb in temp[doc].items():
          temp_list = list()
          
          for s_word, s_emb in temp[doc].items():
            if (s_word, f_word) in track_dict.keys():
              temp_list.append(track_dict[(s_word, f_word)])
            else:
              track_dict[(f_word, s_word)] = cosine_similarity([f_emb],[s_emb])[0][0]
              temp_list.append(track_dict[(f_word, s_word)])
          
          neighbor_dict[f_word] =sorted(temp_list, reverse=True)[1:] # ignore the cosine similarity between a word and itself

        final_list[doc] = neighbor_dict
        tmp_collector[doc] = neighbor_dict

      print('Document {} is done!'.format(doc))
    neighbor_ts_doc[ts_] = final_list
  return neighbor_ts_doc

In [9]:
def stopword_remover(all_dict_):
  final_dict_w_stopword = dict()
  for ts_ in all_dict_.keys():
    total_docs_clusters = all_dict_[ts_]
    #remove stopwords
    total_docs_clusters_wo_stopwords = dict()
    for doc in total_docs_clusters.keys():
      temp_doc = {}
      for cluster_rank in total_docs_clusters[doc].keys():
        temp_clustering = {}
        for ins in total_docs_clusters[doc][cluster_rank].keys():
          temp_cluster = {}
          for items in total_docs_clusters[doc][cluster_rank][ins].items():
            if items[0] not in stopwords:
              temp_cluster[items[0]] = items[1]
          temp_clustering[ins] = temp_cluster
        temp_doc[cluster_rank] = temp_clustering
      total_docs_clusters_wo_stopwords[doc] = temp_doc
    final_dict_w_stopword[ts_] = total_docs_clusters_wo_stopwords
  return final_dict_w_stopword

In [10]:
def get_df_size(list_of_df_):
  tmp_list = list()
  for df_ in list_of_df_:
    tmp_list.append(len(list(df_['id'])))
  return tmp_list

In [11]:
def merging_df(list_df_):
  new_df_list = list()
  for n in range(len(list_df_)-1):
    frames = [list_df_[n], list_df_[n+1]]
    new_df_list.append(pd.concat(frames))
  new_df_list.append(list_df_[n+1])
  return new_df_list

In [12]:
df = dataset_downloader('NYT')
list_of_dfs= list()
df['content'] = df['content'].apply( lambda x: special_char_remover(x)) # remove special characters
df['publish_time'] = df['publish_time'].apply(lambda x: time_fixer(x))
list_of_dfs = timestamps_generator(copy.deepcopy(df))
print('before merging dfs: ' ,get_df_size(list_of_dfs))
list_of_timestamps = [list(x['id']) for x in list_of_dfs ]
list_of_dfs = merging_df(copy.deepcopy(list_of_dfs))  # merging two coonsecuative df to each other and made a new one 
print('after merging dfs: ',get_df_size(list_of_dfs))

Downloading dataset...
Download is done!
dataset size after removing non-value cells is (5713, 5)
The new size of dataset is (3428, 5) and the columns are ['id' 'title' 'content' 'publish_time' 'author']
before merging dfs:  [118, 268, 54, 107, 42, 66, 117, 23, 76, 184, 19, 65, 181, 126, 117, 34, 100, 108, 126, 139, 128]
after merging dfs:  [386, 322, 161, 149, 108, 183, 140, 99, 260, 203, 84, 246, 307, 243, 151, 134, 208, 234, 265, 267, 128]


In [13]:
dic_of_ind = dict()
for index, row in df.iterrows():
  dic_of_ind[row['id']] = index



In [14]:

def doc2Vec_generator(document_):
  """
  This function receives a list of sentences as a doc and returns an embedding vector for it
  """
  return document_


In [15]:
def pre_processing(sentence_, stopwords_):
  words = word_tokenize(sentence_)
  ps = PorterStemmer()
  rootWord = list()
  for w in words:
      if w not in stopwords_:
        rootWord.append(ps.stem(w))
  return ' '.join( rootWord)

In [16]:
# doc = ["I love data science",
#         "I love coding in python",
#         "I love building NLP tool",
#         "This is a good phone",
#         "This is a good TV",
#         "This is a good laptop"]




# Tokenization of each document
tokenized_doc = []
for index, row in df.iterrows():
  tokenized_doc.append(word_tokenize(pre_processing(row['content'].lower(), stopwords)))

# Convert tokenized document into gensim formated tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]
# Train the model
model = Doc2Vec(vector_size= 150, window = 5, min_count = 2, workers =4, epochs= 100)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)


In [17]:
embedding_dict= dict()
for ts,data_ in zip(range(len(list_of_dfs)), list_of_dfs):
  temp_dict = dict()
  data_['embedding_vector'] = data_['content'].apply(lambda x: model.infer_vector(pre_processing(x.lower(), stopwords)))
  for index, row in data_.iterrows():
    temp_dict[row['id']] = row['embedding_vector']
  embedding_dict[ts] = temp_dict

In [21]:
def similarity_generator(total_doc_word_embedding):
  """
  doc_matrix is a dictionary of each doc and other doc similarity to this doc based ondoc2vec             doc_embedding ={doc0: {1: int(9*euclidian_distance +1 ), 2:int(9*euclidian_distance +1 )}, doc1}  

  """
  total_dict_ = dict()
  for ts_ in total_doc_word_embedding.keys():
    doc_word_embedding = total_doc_word_embedding[ts_]
    doc_embedding = dict()
    for first_doc in doc_word_embedding.keys():
      temp_dict = dict()
      for second_doc in doc_word_embedding.keys(): 
        if first_doc != second_doc:
          temp_dict[second_doc] =1.0 - cosine_similarity([doc_word_embedding[first_doc]],[doc_word_embedding[second_doc]])[0][0]
      doc_embedding[first_doc] = temp_dict
      print('Document {} is done!'.format(first_doc) )
    total_dict_[ts_] = doc_embedding
  return total_dict_

In [None]:
Embed_dict = dict() # {timestamp1 {doc1{(number_of_tokens+1, 768)}, doc2{}}, timestamp2{}}

Embed_dict= similarity_generator(embedding_dict)

In [22]:
def path_finder(start_, all_d):
  r= 0
  res = list()
  res.append(start_)
  while(r<len(all_d[0])-1):
    if start_ not in list_of_ts_[r]:
      r +=1
    min_val = min(all_d[r][start_].values())
    min_ind = [x for x,y in all_d[r][start_].items() if y==min_val]
    res.append(min_ind[0])
    if start_ in list_of_ts_[r]:
      del all_d[r][start_]
      for w in all_d[r].keys():
        del all_d[r][w][start_]
    if r == 20:
      break
    if start_ in list_of_ts_[r+1]:
      del all_d[r+1][start_]
      for w in all_d[r+1].keys():
        del all_d[r+1][w][start_]
    if r == 20:
      break

    start_ = min_ind[0]
  return res

In [23]:

new_list = dict()
for v in range(len(list_of_timestamps)):
  new_list[v] = list_of_timestamps[v]
list_of_ts_ = new_list.copy()
full_story = list()
for ni in list_of_ts_[0]:
  full_story.append(path_finder(ni, copy.deepcopy(Embed_dict)))

for yu in full_story:
  print(len(yu), yu)

84 [50818, 685613, 503496, 127530, 590069, 101537, 596584, 666265, 690647, 552330, 582561, 729099, 516437, 644520, 518446, 627503, 623656, 707186, 546626, 518434, 551505, 654426, 533979, 676479, 611299, 769272, 729071, 510283, 23931, 571901, 678711, 670534, 746046, 513577, 581199, 556465, 671078, 596567, 50766, 650318, 630862, 51280, 50588, 504799, 505147, 509786, 683384, 524277, 587415, 732991, 715429, 702542, 23782, 743182, 696043, 647300, 684253, 735707, 734230, 51260, 568554, 731480, 717411, 24218, 79918, 98942, 51075, 640648, 565179, 737561, 581438, 600365, 589521, 613415, 607215, 569970, 495579, 775332, 621355, 535040, 775717, 772794, 24169, 140341]
55 [623632, 598747, 601455, 222902, 599311, 693884, 508463, 566593, 590542, 512112, 642016, 568830, 183954, 160315, 588685, 187436, 553864, 624183, 550925, 639893, 666715, 98538, 699424, 744562, 509405, 674684, 502681, 650597, 707787, 675835, 23586, 550867, 139865, 139897, 95452, 648240, 51278, 51238, 566669, 719154, 773502, 685799, 6

In [24]:
pickle.dump(Embed_dict, open('doc_2vec_NYT_Embed_dict.p', 'wb')) # store embeddings
pickle.dump(df, open('doc_2vec_NYT_list_of_dfs.p', 'wb')) # store list_of_dfs
pickle.dump(full_story, open('doc_NYT_covid_full_story.p', 'wb'))