<a href="https://colab.research.google.com/github/AlirezaPNouri/Storytelling/blob/main/doc2Vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This code is design to generate the document embedding for comparison with other techniques such as Bert, my proposed method and Jaccard Index approach
Author: Alireza Nouri

In [39]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import string
import requests
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [20]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
#constant variables
dataset_covid_link = 'https://raw.githubusercontent.com/AlirezaPNouri/Storytelling/main/datasets/short_parsed_covid_data.csv'
dataset_NYT_link = 'https://raw.githubusercontent.com/AlirezaPNouri/Storytelling/main/datasets/short_parsed_NYT_data.csv'
dataset_percentage = .6
MAX_DATA = 100
MAX_SENTENCE_LENGTH = 350
MIN_SENTENCE_LENGTH = 200
doc_list =[item for item in range(MAX_DATA)]
jump = 3 # jump size for gradient decent
MAX_FEATURE_SIZE = 20 # max size of the gradient descent vector
range_n_clusters = [2, 3, 4, 5, 6] #number of different clustering to compare the performance of the model over different clustering
threshold_cluster_overlapping = 0.7 # to consider two clsuters are similar
stopwords = ['i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves','he','him','his','himself','she','her','hers','herself','it','its','itself','they','them','their','theirs','themselves','what','which','who','whom','this','that','these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same','so','than','too','very','s','t','can','will','just','don','should','now', 'of', 'within']


In [2]:
# download the dataset as a zip file from the git repo
def dataset_downloader(str_):
  """
  This function download a dataset
  Arguments:
    str_: the name of the dataset. It can be covid or NYT
  Returns:
    df: a pandas dataframe
  """
  if str_ == 'covid':
    url = dataset_NYT_link
  elif str_ == 'NYT':
    url = dataset_covid_link
  else:
    print('dataset is not choose correctly!')

  print('Downloading dataset...')
  res = requests.get(url, allow_redirects=True)
  with open('small_dataset_NYT.csv','wb') as file:
      file.write(res.content)
  print('Download is done!')
  df = pd.read_csv("/content/small_dataset_NYT.csv", header= None, skiprows=1)
  if str_ == 'NYT':
    df.columns = ['id', 'title', 'content', 'publish_time', 'author']
  elif str_ == 'covid':
    df.columns = ['id', 'title', 'content', 'author', 'publish_time']
  
  df = df.dropna(how='any', axis=0)
  print('dataset size after removing non-value cells is {}'.format(df.shape))
  # reduce the size of dataset to dataset_percentage*dataset.shape
  df = df.sample(frac=dataset_percentage)
  print('The new size of dataset is {} and the columns are {}'.format(df.shape, df.columns.values ))
  # remove the content that has less than MIN_SENTENCE_LENGTH words 
  df = df[df['content'].apply(lambda x : len(x.split(' ')))>=MIN_SENTENCE_LENGTH]

  # keep the first 500 words in content
  df['content'] = df['content'].apply(lambda x : ' '.join(x.split(' ')[:MAX_SENTENCE_LENGTH]))
  return df


In [4]:
def special_char_remover(str_):
  """
  This function removes any special characters from a text
  Arguments:
    str_: a string
  Returns:
    A string
  """
  return ' '.join(''.join(w for w in m if w.isalnum()) for m in str_.split(' '))

In [5]:
#### Create a list of dataframe out of the dataset. Each dataframe belongs to a timestamp #############
#######################################################################################################
# use 10 days as time interval
def timestamps_generator(df_):
  """
  This function collect data related to each timestamp separately in a dataframe
  Arguments:
    df_: a pandas dataframe
  Returns:
    df_list: a list of all timestamps data [df_1, df_2, ...]
  """
  time_stamp = set()
  df_list= list()
  for index, row in df_.iterrows():
    time_stamp.add(row['publish_time'][0:4])
  for time_interval in time_stamp:
    new_df = df_[df_['publish_time'].str.slice(0, 4)== time_interval]
    if new_df.shape[0]>5: #minimum number of doc in a timestamp
      df_list.append(new_df)
  return df_list

In [6]:
def time_fixer(str_):
  """
  This function convert a date from x/x/xxxx to 0x0xxxxx
  Arguments:
    str_: a date in a string format
  Returns:
    a string without any / and all months and days are in two digit
  """
  t_slot = str_.split('/')
  t_slot[0] = t_slot[0] if len(t_slot[0]) == 2 else '0'+t_slot[0]
  t_slot[1] = t_slot[1] if len(t_slot[1]) == 2 else '0'+t_slot[1]
  return ''+t_slot[0]+t_slot[1]+t_slot[2]

In [7]:
def find_similarity(Embed_):
  """
  This function calculate the cosine similarity between each pair of words in a document and sort them as a new vector
  Arguments:
    Embed_: a dictionary of all doc and embedding seperated into their timestamps {timestamp1 {doc1{token1: embedding, token2: embedding}}}
  Returns:
    neighbor_ts_doc: a dictionary of all timestamps, docs and the neighbor similarity for each word
  """
  tmp_collector = dict()
  neighbor_ts_doc = dict()
  for ts_ in Embed_.keys():
    temp = Embed_[ts_]
    final_list = dict()
    for doc in temp.keys():
      if doc in tmp_collector.keys():
        final_list[doc] = tmp_collector[doc]
      else:
        neighbor_dict = dict()
        track_dict = dict()
        
        for f_word, f_emb in temp[doc].items():
          temp_list = list()
          
          for s_word, s_emb in temp[doc].items():
            if (s_word, f_word) in track_dict.keys():
              temp_list.append(track_dict[(s_word, f_word)])
            else:
              track_dict[(f_word, s_word)] = cosine_similarity([f_emb],[s_emb])[0][0]
              temp_list.append(track_dict[(f_word, s_word)])
          
          neighbor_dict[f_word] =sorted(temp_list, reverse=True)[1:] # ignore the cosine similarity between a word and itself

        final_list[doc] = neighbor_dict
        tmp_collector[doc] = neighbor_dict

      print('Document {} is done!'.format(doc))
    neighbor_ts_doc[ts_] = final_list
  return neighbor_ts_doc

In [8]:
def stopword_remover(all_dict_):
  final_dict_w_stopword = dict()
  for ts_ in all_dict_.keys():
    total_docs_clusters = all_dict_[ts_]
    #remove stopwords
    total_docs_clusters_wo_stopwords = dict()
    for doc in total_docs_clusters.keys():
      temp_doc = {}
      for cluster_rank in total_docs_clusters[doc].keys():
        temp_clustering = {}
        for ins in total_docs_clusters[doc][cluster_rank].keys():
          temp_cluster = {}
          for items in total_docs_clusters[doc][cluster_rank][ins].items():
            if items[0] not in stopwords:
              temp_cluster[items[0]] = items[1]
          temp_clustering[ins] = temp_cluster
        temp_doc[cluster_rank] = temp_clustering
      total_docs_clusters_wo_stopwords[doc] = temp_doc
    final_dict_w_stopword[ts_] = total_docs_clusters_wo_stopwords
  return final_dict_w_stopword

In [9]:
def get_df_size(list_of_df_):
  tmp_list = list()
  for df_ in list_of_df_:
    tmp_list.append(len(list(df_['id'])))
  return tmp_list

In [10]:
def merging_df(list_df_):
  new_df_list = list()
  for n in range(len(list_df_)-1):
    frames = [list_df_[n], list_df_[n+1]]
    new_df_list.append(pd.concat(frames))
  new_df_list.append(list_df_[n+1])
  return new_df_list

In [11]:
df = dataset_downloader('covid')
df['content'] = df['content'].apply( lambda x: special_char_remover(x)) # remove special characters
df['publish_time'] = df['publish_time'].apply(lambda x: time_fixer(x))
list_of_dfs = timestamps_generator(df)
print('before merging dfs: ' ,get_df_size(list_of_dfs))
list_of_timestamps = [list(x['id']) for x in list_of_dfs ]
list_of_dfs = merging_df(list_of_dfs)  # merging two coonsecuative df to each other and made a new one 
print('after merging dfs: ',get_df_size(list_of_dfs))

Downloading dataset...
Download is done!
dataset size after removing non-value cells is (2439, 5)
The new size of dataset is (1463, 5) and the columns are ['id' 'title' 'content' 'author' 'publish_time']
before merging dfs:  [8, 10, 21, 13, 22, 23, 15, 28, 14, 19, 18, 15, 12, 23, 8, 16, 8, 18, 21, 11, 13, 30, 20, 15, 31, 18, 19, 11, 18, 14, 10, 7, 22, 13, 12, 13, 18, 15, 16, 13, 21, 17, 24, 24, 14, 21, 14, 16, 13, 14, 22, 7, 12, 11, 25, 11, 25, 20, 19, 21, 22, 15, 22, 17, 6, 23, 11, 25, 12, 19, 8, 27, 9, 17, 14, 11, 11, 20, 18, 16, 16, 14, 23, 9, 19, 14, 24]
after merging dfs:  [18, 31, 34, 35, 45, 38, 43, 42, 33, 37, 33, 27, 35, 31, 24, 24, 26, 39, 32, 24, 43, 50, 35, 46, 49, 37, 30, 29, 32, 24, 17, 29, 35, 25, 25, 31, 33, 31, 29, 34, 38, 41, 48, 38, 35, 35, 30, 29, 27, 36, 29, 19, 23, 36, 36, 36, 45, 39, 40, 43, 37, 37, 39, 23, 29, 34, 36, 37, 31, 27, 35, 36, 26, 31, 25, 22, 31, 38, 34, 32, 30, 37, 32, 28, 33, 38, 24]


In [15]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument 
from nltk.tokenize import word_tokenize
def doc2Vec_generator(document_):
  """
  This function receives a list of sentences as a doc and returns an embedding vector for it
  """
  return document_


In [38]:
def pre_processing(sentence_, stopwords_):
  words = word_tokenize(sentence_)
  ps = PorterStemmer()
  rootWord = list()
  for w in words:
      if w not in stopwords_:
        rootWord.append(ps.stem(w))
  result = ' '.join( rootWord)

In [33]:
# doc = ["I love data science",
#         "I love coding in python",
#         "I love building NLP tool",
#         "This is a good phone",
#         "This is a good TV",
#         "This is a good laptop"]

#lemmatizing, removing stopwords and stemming must be done before going any steps further



# Tokenization of each document
tokenized_doc = []
for index, row in df.iterrows():
  tokenized_doc.append(word_tokenize(row['content'].lower()))

# Convert tokenized document into gensim formated tagged data
tagged_data = [TaggedDocument(d, [i]) for i, d in enumerate(tokenized_doc)]
# Train the model
model = Doc2Vec(vector_size= 150, window = 5, min_count = 2, workers =4, epochs= 100)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
df['embedding_vector'] = df['content'].apply(lambda x: model.infer_vector(x)) 

In [None]:
embedding_dict= dict()
for ts,data_ in zip(range(len(list_of_dfs)), list_of_dfs):
  temp_dict = dict()
  data_['embedding_vector'] = data_['content'].apply(lambda x: model.infer_vector(pre_processing(x)))
  for index, row in data_.iterrows():
    temp_dict[index] = row['embedding_vector']
  embedding_dict[ts] = temp_dict

In [45]:
def similarity_generator(total_doc_word_embedding):
  """
  doc_matrix is a dictionary of each doc and other doc similarity to this doc based ondoc2vec             doc_embedding ={doc0: {1: int(9*euclidian_distance +1 ), 2:int(9*euclidian_distance +1 )}, doc1}  

  """
  total_dict_ = dict()
  for ts_ in total_doc_word_embedding.keys():
    doc_word_embedding = total_doc_word_embedding[ts_]
    doc_embedding = dict()
    for first_doc in doc_word_embedding.keys():
      temp_dict = dict()
      for second_doc in doc_word_embedding.keys(): 
        if first_doc != second_doc:
          temp_dict[second_doc] =cosine_similarity([doc_word_embedding[first_doc]],[doc_word_embedding[second_doc]])[0][0]
      doc_embedding[first_doc] = temp_dict
      print('Document {} is done!'.format(first_doc) )
    total_dict_[ts_] = doc_embedding
  return total_dict_

In [46]:
Embed_dict = dict() # {timestamp1 {doc1{(number_of_tokens+1, 768)}, doc2{}}, timestamp2{}}

Embed_dict= similarity_generator(embedding_dict)

Document 2166 is done!
Document 1610 is done!
Document 1793 is done!
Document 1816 is done!
Document 282 is done!
Document 390 is done!
Document 165 is done!
Document 1916 is done!
Document 2076 is done!
Document 1717 is done!
Document 539 is done!
Document 1489 is done!
Document 2206 is done!
Document 2285 is done!
Document 1885 is done!
Document 950 is done!
Document 290 is done!
Document 835 is done!
Document 2076 is done!
Document 1717 is done!
Document 539 is done!
Document 1489 is done!
Document 2206 is done!
Document 2285 is done!
Document 1885 is done!
Document 950 is done!
Document 290 is done!
Document 835 is done!
Document 1383 is done!
Document 2388 is done!
Document 1057 is done!
Document 1815 is done!
Document 1056 is done!
Document 2025 is done!
Document 476 is done!
Document 711 is done!
Document 2316 is done!
Document 1567 is done!
Document 1268 is done!
Document 240 is done!
Document 1804 is done!
Document 1744 is done!
Document 658 is done!
Document 1083 is done!
Doc

In [47]:
print(Embed_dict[0][2166])

{1610: 0.99190104, 1793: 0.98670876, 1816: 0.9877736, 282: 0.98519546, 390: 0.9853779, 165: 0.9768639, 1916: 0.9791419, 2076: 0.98008704, 1717: 0.9914881, 539: 0.98315895, 1489: 0.9802011, 2206: 0.9671827, 2285: 0.9824266, 1885: 0.9731506, 950: 0.9838557, 290: 0.98545337, 835: 0.9606161}


In [52]:
def path_finder(start_, all_d):
  r= 0
  res = list()
  res.append(start_)
  while(r<len(all_d[0])-1):
    if start_ not in list_of_ts_[r]:
      r +=1
    min_val = min(all_d[r][start_].values())
    min_ind = [x for x,y in all_d[r][start_].items() if y==min_val]
    res.append(min_ind[0])
    if start_ in list_of_ts_[r]:
      del all_d[r][start_]
      for w in all_d[r].keys():
        del all_d[r][w][start_]
    if r == 20:
      break
    if start_ in list_of_ts_[r+1]:
      del all_d[r+1][start_]
      for w in all_d[r+1].keys():
        del all_d[r+1][w][start_]
    if r == 20:
      break

    start_ = min_ind[0]
  return res

In [53]:
import copy
new_list = dict()
for v in range(len(list_of_timestamps)):
  new_list[v] = list_of_timestamps[v]
list_of_ts_ = new_list.copy()
full_story = list()
for ni in list_of_ts_[0]:
  full_story.append(path_finder(ni, copy.deepcopy(Embed_dict)))

for yu in full_story:
  print(len(yu), yu)

KeyError: ignored