# Γιάννης Δαλιάνης
# 1115201700027
# Homework 4
<!-- # Άσκηση 1 -->

<!-- [Source](https://github.com/bentrevett/pytorch-sentiment-analysis) -->

## Imports

In [None]:
import os
import re
import pandas as pd
import time
import json

from google.colab import drive
from scipy.spatial.distance import cosine
from ast import literal_eval
import torch

In [None]:
!pip install transformers==3

In [None]:
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer

## A GPU can be added by going to the menu and selecting:
## Runtime -> Change runtime type -> Hardware accelerator: GPU

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval between two timestamps into minutes and seconds.

    Args:
        start_time: Timestamp (in seconds) at which the interval began.
        end_time: Timestamp (in seconds) at which the interval ended.

    Returns:
        A ``(minutes, seconds)`` tuple of ints; both parts are truncated
        toward zero by ``int()``.
    """
    total_seconds = end_time - start_time
    whole_minutes = int(total_seconds / 60)
    # What is left once the whole minutes have been taken out.
    leftover_seconds = int(total_seconds - (whole_minutes * 60))
    return whole_minutes, leftover_seconds
# Start the notebook-wide stopwatch; it is read again when the total time is reported.
start_timeTOTAL = time.time()

# Select the compute device once: CUDA when torch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

## Load Dataset

In [None]:
# Locations on the mounted Google Drive.
DATA_DIR = '/content/gdrive/My Drive/DI/Colab Notebooks/ex4/comm_use_subset/'
# Where the parsed dataset is cached after the first run.
CACHE_CSV = '/content/gdrive/My Drive/DI/Colab Notebooks/ex4/dataframe.csv'
# Location the notebook originally looked at; still honoured (and tried first)
# so existing setups keep loading the same file as before.
LEGACY_CACHE_CSV = '/content/gdrive/My Drive/dataframe.csv'

drive.mount('/content/gdrive', force_remount=True)

# Previously the cache was looked up in one place but written to another, so
# the file written by this cell was never found again. Check both locations.
cached_csv = next(
    (path for path in (LEGACY_CACHE_CSV, CACHE_CSV) if os.path.isfile(path)),
    None,
)

if cached_csv is not None:
    print("File exist")
    df2 = pd.read_csv(cached_csv)
    # The CSV is written with its index, which comes back as 'Unnamed: 0'.
    df2 = df2.drop(columns=['Unnamed: 0'], errors='ignore')
else:
    print("File not exist")

    totalList = []
    for num, filename in enumerate(os.listdir(DATA_DIR)):
        # Lightweight progress indicator.
        if num % 300 == 0:
            print(num)

        with open(os.path.join(DATA_DIR, filename), encoding='utf-8') as json_file:
            data = json.load(json_file)

        article_title = data['metadata']['title']
        abstract_paragraphs = [p['text'] for p in data['abstract']]
        body_paragraphs = [p['text'] for p in data['body_text']]

        # Every paragraph followed by a single space (abstract first, then body).
        wholeText = "".join(
            paragraph + " " for paragraph in abstract_paragraphs + body_paragraphs
        )

        totalList.append([
            filename,
            article_title,
            wholeText,
            article_title + ". " + wholeText,
            # Title + abstract + body, kept as separate passages.
            [article_title] + abstract_paragraphs + body_paragraphs,
        ])

    df2 = pd.DataFrame(
        totalList,
        columns=['fileName', 'fileTitle', 'fileText', 'combinedTitle_and_Text', 'paragraphs'],
    )
    df2.to_csv(CACHE_CSV)

# Normalise the text columns to str. 'paragraphs' is included so that it holds
# the string form of the list on BOTH paths (freshly built or read from CSV);
# later cells parse it back with literal_eval.
for text_column in ('fileTitle', 'fileText', 'combinedTitle_and_Text', 'paragraphs'):
    df2[text_column] = df2[text_column].astype(str)

## Preprocessing

Heavy text preprocessing isn't essential here, because we are working with scientific papers and almost every word, number, etc. is important to their content. Only URLs and special characters are removed.

The `paragraphs` column is saved to the CSV as a string rather than a list, so `literal_eval` is used to convert it back into a list.

In [None]:
def cleanText(text):
    """Strip noise from a pandas Series of strings and expand a few contractions.

    Every pattern is applied explicitly as a regular expression
    (``regex=True``); recent pandas versions otherwise treat the pattern
    passed to ``Series.str.replace`` as a literal string, which would turn
    these rules into silent no-ops.

    Args:
        text: pandas Series holding the strings to clean.

    Returns:
        A new Series of cleaned strings.
    """
    # Twitter-style artefacts: "RT " markers, then #hashtags and @mentions
    # (each one up to the next whitespace or end of string).
    for pattern in (r'RT[\s]+', r'#.*?(?=\s|$)', r'@.*?(?=\s|$)'):
        text = text.str.replace(pattern, '', regex=True)

    # URLs.
    text = text.replace(r'http\S+', '', regex=True).replace(r'www\S+', '', regex=True)
    # Drop anything from a remaining "https://" onwards and coerce each value
    # to str. The step above already removes every "http..." run, so this
    # acts as a safeguard.
    text = text.apply(lambda x: re.split(r'https://.*', str(x))[0])

    # Contractions.
    # NOTE(review): the 't rule also fires on "n't" (e.g. "don't" -> "don not").
    text = text.str.replace(r"\'re", " are", regex=True)    # 're -> are
    text = text.str.replace(r"\'t", " not", regex=True)     # 't  -> not
    text = text.str.replace(r"\'d", " would", regex=True)   # 'd  -> would

    # Literal escape prefixes (a backslash followed by n, u or x) become a space.
    for pattern in (r'\\n', r'\\u', r'\\x'):
        text = text.replace(pattern, ' ', regex=True)
    return text

# Run the cleaning pass over each free-text column, overwriting it in place.
for text_column in ('fileTitle', 'fileText', 'combinedTitle_and_Text'):
    df2[text_column] = cleanText(df2[text_column])

# Quick visual check of the cleaned frame.
df2.head()

## Queries

In [None]:
# Natural-language questions used as search queries; the SBERT cells iterate
# over this list and embed each entry.
# NOTE(review): some entries contain misspellings ("coronoviruses", "Wuhuan",
# "Coronovirus"). They are left verbatim because they are runtime inputs to the
# embedder — confirm whether they are intentional.
quers = [
    "What are the coronoviruses?",
    "What was discovered in Wuhuan in December 2019?",
    "What is Coronovirus Disease 2019?",
    "What is COVID-19?",
    "What is caused by SARS-COV2?",
    "How is COVID-19 spread?",
    "Where was COVID-19 discovered?",
    "How does coronavirus spread?"
]

## Implementation description

The two embedding techniques used are doc2vec and SBERT. For each one, we identify papers similar to our question based on titles only, and also based on titles combined with the full text. The `paragraphs` column is used for returning passages that answer our queries. The BERT QA model doesn't work with very large texts.

Results are better when we train our models on large text corpora, because scientific papers are very large, so they should be thoroughly examined as it comes to their similarity with the given question.

SBERT works better than doc2vec because it uses a context-dependent word embedding method, which captures the content of the papers more clearly. SBERT is also by far quicker than doc2vec. Because SBERT is context-dependent, it returns more similar papers, and it returns more suitable papers for most of our questions.

When using the full corpora, we print more indicative passages to see how good our results are. Especially SBert takes into account words of the same semantic family.

## SBERT with passage finding on titles

Sentence-BERT uses a Siamese network like architecture to provide 2 sentences as an input. These 2 sentences are then passed to BERT models and a pooling layer to generate their embeddings. Then use the embeddings for the pair of sentences as inputs to calculate the cosine similarity.

Sentence-BERT (SBERT) is inspired by the transformer model, a sequence model that dispenses of both convolutions and recurrence and uses attention instead to incorporate sequential information into sequence representation. This booming family includes BERT (and its extensions), GPT (1 and 2) and the XL-flavored transformers. These models generate contextual embeddings of input tokens (commonly sub-word units), each infused with information of its neighborhood, but are not aimed at generating a rich embedding space for input sequences. Sentence-BERT aims to adapt the BERT architecture by using siamese and triplet network structures to derive semantically meaningful sentence embeddings that can be compared using cosine-similarity.

Passage finding is a small QA task that returns the part of the text that answers the specific question. Some fellow students suggested splitting every document into sentences and returning the article based on one of these sentences; that sentence would then also be the passage. I didn't follow this path because it amounts to using the same solution for both the first and the second exercise. The QA model, though, cannot be applied to very large texts, and that is why the `paragraphs` column is used.

Passages from titles only are expected to be more inaccurate.

https://www.sbert.net/examples/applications/semantic-search/README.html

In [None]:
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

# The QA model and its tokenizer are both loaded from the same checkpoint name.
QA_CHECKPOINT = 'bert-large-uncased-whole-word-masking-finetuned-squad'

tokenizer = BertTokenizer.from_pretrained(QA_CHECKPOINT)
modelBertQA = BertForQuestionAnswering.from_pretrained(QA_CHECKPOINT)

In [None]:
def answer_question(question, answer_text):
    '''
    Print the span of `answer_text` that the QA model selects as the answer
    to `question`.

    Uses the module-level `tokenizer` and `modelBertQA`.

    Args:
        question: The question string.
        answer_text: The passage string that is expected to contain the answer.

    Returns:
        0 if the encoded question + passage is longer than 512 tokens (the
        model is not run and nothing is printed), otherwise 1. When 1 is
        returned, the passage is printed only if the selected span neither
        starts with "[CLS]" nor ends with "[SEP]".
    '''
    # ======== Tokenize ========
    input_ids = tokenizer.encode(question, answer_text)

    # Reject over-long inputs up front (presumably 512 is the model's maximum
    # input length); the caller decides how to shorten the passage.
    if len(input_ids) > 512:
        return 0

    # ======== Set Segment IDs ========
    # Segment A is the question up to and including the first [SEP] token;
    # everything after it is segment B (the passage).
    num_seg_a = input_ids.index(tokenizer.sep_token_id) + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0] * num_seg_a + [1] * num_seg_b

    # ======== Evaluate ========
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        start_scores, end_scores = modelBertQA(
            torch.tensor([input_ids]),                   # Tokens of question + passage.
            token_type_ids=torch.tensor([segment_ids]),  # Question vs. passage marker.
            return_dict=False,
        )

    # ======== Reconstruct Answer ========
    # Highest-scoring start/end positions, as plain ints for indexing/slicing.
    answer_start = int(torch.argmax(start_scores))
    answer_end = int(torch.argmax(end_scores))

    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Start with the first token, then append the rest of the span. If the end
    # position precedes the start, the slice is empty and only the start token
    # is kept.
    answer = tokens[answer_start]
    for token in tokens[answer_start + 1:answer_end + 1]:
        if token.startswith('##'):
            # Sub-word piece: glue it onto the previous token.
            answer += token[2:]
        else:
            answer += ' ' + token

    # Spans that begin at [CLS] or end at [SEP] are not shown.
    if not answer.startswith("[CLS]") and not answer.endswith("[SEP]"):
        print('Passage: "' + answer + '"')

    return 1

In [None]:
from sentence_transformers import util

# Alternative checkpoint tried earlier: 'bert-base-nli-mean-tokens'
embedder = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Encode a plain list of strings (not a pandas Series) and place the embeddings
# on the device selected at the top of the notebook, so the cell also runs on
# a CPU-only runtime instead of failing on a hard-coded 'cuda'.
titles = df2['fileTitle'].tolist()
corpus_embeddings = embedder.encode(titles, convert_to_tensor=True)
corpus_embeddings = corpus_embeddings.to(device)

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(5, len(titles))
for query in quers:

    query_embedding = embedder.encode(query, convert_to_tensor=True)
    query_embedding = query_embedding.to(device)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar titles:")

    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
    # One query was submitted, so its results are the first (only) entry.
    top_hits = hits[0]

    for hit in top_hits:
        print("  ", titles[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))

    # Run passage extraction on the best-matching title, if there is one.
    if top_hits:
        answer_question(query, titles[top_hits[0]['corpus_id']])

## SBERT with passage finding on combined titles and corpus

In [None]:
# The same checkpoint is loaded by an earlier cell; loading it again costs time
# and memory, so only load here when the objects are not already present.
if 'modelBertQA' not in globals() or 'tokenizer' not in globals():
    modelBertQA = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

It also returns the most suitable passage (if one exists) from every paragraph of the paper.

In [None]:
# Alternative checkpoint tried earlier: 'bert-base-nli-mean-tokens'
embedder = SentenceTransformer('paraphrase-distilroberta-base-v1')

# Encode a plain list of strings (not a pandas Series) and place the embeddings
# on the device selected at the top of the notebook, so the cell also runs on
# a CPU-only runtime instead of failing on a hard-coded 'cuda'.
documents = df2['combinedTitle_and_Text'].tolist()
corpus_embeddings = embedder.encode(documents, convert_to_tensor=True)
corpus_embeddings = corpus_embeddings.to(device)

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(5, len(documents))
for query in quers:

    query_embedding = embedder.encode(query, convert_to_tensor=True)
    query_embedding = query_embedding.to(device)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar text:")

    hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=top_k)
    # One query was submitted, so its results are the first (only) entry.
    top_hits = hits[0]

    for hit in top_hits:
        print("  ", documents[hit['corpus_id']], "(Score: {:.4f})".format(hit['score']))

    print("\nFor more similar text:")

    if not top_hits:
        continue

    # Paragraphs of the best-matching paper. The column holds the string form
    # of a list when loaded from CSV; accept a real list as well.
    best_paragraphs = df2['paragraphs'].iloc[top_hits[0]['corpus_id']]
    if isinstance(best_paragraphs, str):
        best_paragraphs = literal_eval(best_paragraphs)

    for paragraph in best_paragraphs:
        # answer_question returns 0 when the input is too long for the model;
        # in that case retry on each half of the paragraph (split by characters).
        if answer_question(query, paragraph) == 0:
            middle = len(paragraph) // 2
            answer_question(query, paragraph[:middle])
            answer_question(query, paragraph[middle:])

We see how often the passages are similar to the questions.

## Time Needed

In [None]:
# Stop the notebook-wide stopwatch and report the elapsed wall-clock time.
end_timeTOTAL = time.time()
mins, secs = epoch_time(end_time=end_timeTOTAL, start_time=start_timeTOTAL)
print('Total Time: {}m {}s'.format(mins, secs))