# Γιάννης Δαλιάνης
# 1115201700027
# Homework 4
<!-- # Άσκηση 1 -->

<!-- [Source](https://github.com/bentrevett/pytorch-sentiment-analysis) -->

## Imports

In [None]:
import os
import re
import pandas as pd
import time
import json

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from google.colab import drive
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import torch

## A GPU can be added by going to the menu and selecting:
## Runtime -> Change runtime type -> Hardware accelerator: GPU

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into minutes and seconds.

    Args:
        start_time: Timestamp in seconds at which the interval began.
        end_time: Timestamp in seconds at which the interval ended.

    Returns:
        A ``(minutes, seconds)`` tuple of ints; ``seconds`` is what remains
        after the whole minutes are taken out, truncated to an int.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    leftover_seconds = int(total_seconds - whole_minutes * 60)
    return whole_minutes, leftover_seconds
# Timestamp of the start of the run; read again by the final timing cell.
start_timeTOTAL = time.time()

# Select the torch device: CUDA when torch reports it as available, else CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

## Load Dataset

In [None]:
drive.mount('/content/gdrive', force_remount=True)

# Folder with one JSON file per paper, and the CSV cache built from it.
# The cache is written to the same path it is looked up at, so a later run
# finds it instead of re-parsing every JSON file.
corpus_dir = '/content/gdrive/My Drive/giannikos/comm_use_subset/'
cache_csv = '/content/gdrive/My Drive/giannikos/dataframe.csv'

if os.path.isfile(cache_csv):
    print("File exist")
    df2 = pd.read_csv(cache_csv)
    # to_csv below stores the index as an unnamed leading column; discard it.
    df2 = df2.drop(['Unnamed: 0'], axis=1)
else:
    print("File not exist")

    totalList = list()
    for num, filename in enumerate(os.listdir(corpus_dir)):
        # Progress indicator: print the running count every 300 files.
        if num % 300 == 0:
            print(num)

        with open(corpus_dir + filename, encoding='utf-8') as json_file:
            data = json.load(json_file)

        article_title = data['metadata']['title']
        abstract_texts = [p['text'] for p in data['abstract']]
        body_texts = [p['text'] for p in data['body_text']]

        # Abstract paragraphs followed by body paragraphs, each one followed
        # by a single space.
        wholeText = "".join(text + " " for text in abstract_texts + body_texts)

        totalList.append([
            filename,
            article_title,
            wholeText,
            article_title + ". " + wholeText,
            # Title first, then every abstract and body paragraph.
            [article_title] + abstract_texts + body_texts,
        ])

    df2 = pd.DataFrame(totalList, columns=['fileName', 'fileTitle', 'fileText', 'combinedTitle_and_Text', 'paragraphs'])
    df2.to_csv(cache_csv)

# Make sure the text columns hold str values before the string cleaning below.
df2['fileTitle'] = df2['fileTitle'].astype(str)
df2['fileText'] = df2['fileText'].astype(str)
df2['combinedTitle_and_Text'] = df2['combinedTitle_and_Text'].astype(str)
# df2 = df2[:6000]

## Preprocessing

Heavy text preprocessing isn't essential here, because we are working with scientific papers and almost every word, number, etc. is important for the content of the papers. URLs and special characters have been removed.

The `paragraphs` column is saved to the CSV as a string instead of a list, so use `literal_eval` to turn it back into a list.

In [None]:
def cleanText(text):
    r"""Remove tweet-style markup, URLs and escape residue from a Series of strings.

    Args:
        text: pandas Series whose values are strings.

    Returns:
        A new Series with the following applied in order: "RT" followed by
        whitespace removed; hashtags and mentions removed; URLs removed; the
        contractions 're / 't / 'd expanded to " are" / " not" / " would";
        and the literal two-character sequences backslash-n, backslash-u and
        backslash-x replaced by a space.
    """
    # Every pattern below is a regular expression. `regex=True` is passed
    # explicitly because Series.str.replace matches the pattern literally by
    # default from pandas 2.0 on, which would silently disable these rules.
    text = text.str.replace(r'RT[\s]+', '', regex=True)                                 # Removing RT
    text = text.str.replace(r'#.*?(?=\s|$)', '', regex=True)                            # remove hashtags and mentions
    text = text.str.replace(r'@.*?(?=\s|$)', '', regex=True)
    text = text.replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)  # remove urls
    # Cut everything from a remaining "https://" onwards. Raw string: the
    # previous non-raw '\/' was an invalid escape sequence.
    text = text.apply(lambda x: re.split(r'https://.*', str(x))[0])
    text = text.str.replace(r"\'re", " are", regex=True)                                # Change 're to 'are'
    text = text.str.replace(r"\'t", " not", regex=True)                                 # Change 't to 'not'
    text = text.str.replace(r"\'d", " would", regex=True)                               # Change 'd to 'would'
    # These match a literal backslash followed by n / u / x in the text.
    text = text.replace(r'\\n', ' ', regex=True)
    text = text.replace(r'\\u', ' ', regex=True)
    text = text.replace(r'\\x', ' ', regex=True)
    return text

# Run the same cleaning over each text column, in place.
for text_column in ('fileTitle', 'fileText', 'combinedTitle_and_Text'):
    df2[text_column] = cleanText(df2[text_column])

df2.head()

## Queries

In [None]:
# Natural-language questions; the cells below tokenise each one and look up
# the most similar documents for it.
# NOTE(review): some spellings ("coronoviruses", "Wuhuan", "Coronovirus") look
# like typos — confirm whether they are intentional before changing them, as
# the strings are fed to the models verbatim.
quers = [
    "What are the coronoviruses?",
    "What was discovered in Wuhuan in December 2019?",
    "What is Coronovirus Disease 2019?",
    "What is COVID-19?",
    "What is caused by SARS-COV2?",
    "How is COVID-19 spread?",
    "Where was COVID-19 discovered?",
    "How does coronavirus spread?"
]

## Implementation description

The two embedding techniques used are doc2vec and SBert. For each one, we identify papers similar to our question based on titles only, and also based on titles combined with the full text. The 'paragraphs' column is used for returning passages that answer our queries. The QA BERT model doesn't work with large corpora.

Results are better when we train our models on large text corpora, because scientific papers are very long, so they should be examined thoroughly when it comes to their similarity with the given question.

SBert works better than doc2vec because it follows a context-dependent word embedding method, which takes the content of the papers into account more clearly. SBert is also far quicker than doc2vec. As SBert is context-dependent, it returns more similar papers. SBert returns more suitable papers for most of our questions.

When using the full corpora, we print more indicative passages to see how good our results are. Especially SBert takes into account words of the same semantic family.

## doc2vec on titles only

Word2vec gives a numeric representation for each word that is able to capture relations between words, such as Paris and France. This is part of a wider concept in machine learning — feature vectors.

Doc2Vec is an extension of Word2Vec. We build a tagged sentence corpus: each sentence is represented as a TaggedDocument containing a list of the words in it and a tag associated with it. Our text needs to be tokenized first.

In [None]:
# Tokenise every title, then wrap each token list in a TaggedDocument whose
# single tag is the row's position in the column.
df2["toked"] = df2["fileTitle"].apply(word_tokenize)

tagged_data = [
    TaggedDocument(tokens, [position])
    for position, tokens in enumerate(df2["toked"])
]
tagged_data[:6]

In [None]:
## Train doc2vec model
model = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 100)

## Print model vocabulary
# gensim 4.0 removed `wv.vocab` (accessing it raises AttributeError) in favour
# of `wv.key_to_index`; use whichever mapping this gensim version provides.
if hasattr(model.wv, 'key_to_index'):
    vocab = model.wv.key_to_index
else:
    vocab = model.wv.vocab

# Only the first six entries are printed, as a quick sanity check.
num=0
for i, j in vocab.items():
    if num>5:
        break
    print(i, j)
    num+=1

In [None]:
for question in quers:
    # Infer a vector for the tokenised question and ask the model for the
    # documents most similar to it.
    question_vector = model.infer_vector(word_tokenize(question))
    neighbours = model.docvecs.most_similar(positive = [question_vector])

    print("\n\n======================\n\n")
    print("Query:", question)
    print("\nTop 5 most similar text:")

    for rank in range(5):
        # Each result is a (document tag, similarity) pair; the tag is the
        # row position used when the corpus was tagged.
        doc_position, similarity = neighbours[rank]
        print( "Similarity", similarity )
        print( "Article Title:\n", df2['fileTitle'].iloc[doc_position])
        print()

## doc2vec on combined Titles and Corpus

In [None]:
# Tokenise the combined title + text of every paper, then wrap each token
# list in a TaggedDocument whose single tag is the row's position.
df2["toked1"] = df2["combinedTitle_and_Text"].apply(word_tokenize)

tagged_data = [
    TaggedDocument(tokens, [position])
    for position, tokens in enumerate(df2["toked1"])
]
tagged_data[:6]

In [None]:
## Train doc2vec model
model = Doc2Vec(tagged_data, vector_size = 20, window = 2, min_count = 1, epochs = 100)

# gensim 4.0 removed `wv.vocab` (accessing it raises AttributeError) in favour
# of `wv.key_to_index`; use whichever mapping this gensim version provides.
if hasattr(model.wv, 'key_to_index'):
    vocab = model.wv.key_to_index
else:
    vocab = model.wv.vocab

# Only the first six entries are printed, as a quick sanity check.
num=0
for i, j in vocab.items():
    if num>5:
        break
    print(i, j)
    num+=1

In [None]:
for question in quers:
    # Infer a vector for the tokenised question and ask the model for the
    # documents most similar to it.
    question_vector = model.infer_vector(word_tokenize(question))
    neighbours = model.docvecs.most_similar(positive = [question_vector])

    print("\n\n======================\n\n")
    print("Query:", question)
    print("\nTop 5 most similar text:")

    for rank in range(5):
        # Each result is a (document tag, similarity) pair; the tag is the
        # row position used when the corpus was tagged.
        doc_position, similarity = neighbours[rank]
        print( "Similarity", similarity )
        print( "Article Title:\n", df2['fileTitle'].iloc[doc_position])
        print()

## Time Needed

In [None]:
# Stop the run-wide timer and report the elapsed time as minutes and seconds.
end_timeTOTAL = time.time()
mins, secs = epoch_time(start_time=start_timeTOTAL, end_time=end_timeTOTAL)
print(f'Total Time: {mins}m {secs}s')