## Task 2 - CORD 19 Information Retrieval

In this notebook we fetch the papers published in 2019 or 2020, create the corresponding BoW vector for each paper, and find the sentences in those papers that are relevant to a query.

### Step 1: Prepare a csv file containing only papers published in 2019 and 2020

__IMPORTANT:__ We provide you with this particular csv file, therefore you do not need to run the following script. If you do not want to wait for a long time, please skip this step!

In [None]:
import pandas as pd
import os
import json
import numpy as np
# Removes all cite spans and ref spans including the brackets
# INPUT: json_file['body_text'][i]
# OUTPUT: plain text in a string without any references or annotations
def removeCitsRefs(json_paragraph):
    """Blank out the citation and reference spans of one paragraph.

    Every character covered by an entry of ``cite_spans`` or ``ref_spans``
    is overwritten with a space, so the returned string has the same length
    as the original text and untouched characters keep their offsets.

    Args:
        json_paragraph: One element of ``json_file['body_text']``: a dict
            with a ``'text'`` string and optional ``'cite_spans'`` /
            ``'ref_spans'`` lists whose items carry ``'start'`` and
            ``'end'`` character offsets.

    Returns:
        The paragraph text with all span characters replaced by spaces.
    """
    chars = list(json_paragraph['text'])
    text_len = len(chars)

    # A missing (or null) span list is treated as "no spans".
    spans = list(json_paragraph.get('cite_spans') or [])
    spans.extend(json_paragraph.get('ref_spans') or [])

    for span in spans:
        # Clamp to the text bounds: an offset past the end used to raise
        # IndexError and a negative start silently wrapped around to the
        # tail of the text.
        start = max(span['start'], 0)
        end = min(span['end'], text_len)
        if start < end:
            chars[start:end] = " " * (end - start)
    return ''.join(chars)


# INPUT: json_file['body_text']
# OUTPUT: Plain Text string
def getText(json_body):
    """Concatenate the cleaned text of every paragraph of a paper body.

    Args:
        json_body: ``json_file['body_text']``, a list of paragraph dicts as
            accepted by ``removeCitsRefs``.

    Returns:
        One string in which each cleaned paragraph is followed by a newline
        (the empty string for an empty body).
    """
    # Join once instead of growing a string with ``+=`` for every paragraph.
    return "".join(removeCitsRefs(paragraph) + "\n" for paragraph in json_body)


# PaperId, Title + Body,
def getFileText(json_file):
    """Build the ``[paper_id, text]`` record of one parsed paper.

    The text is the title taken from ``json_file['metadata']``, followed by
    a space and two newlines, followed by the cleaned body produced by
    ``getText`` from ``json_file['body_text']``.
    """
    title = json_file['metadata']['title']
    cleaned_body = getText(json_file['body_text'])
    record = [json_file['paper_id'], title + " \n\n" + cleaned_body]
    return record


def createDataFrame(json_paths):
    """Load the given paper JSON files into a two-column DataFrame.

    Args:
        json_paths: Iterable of paths to JSON files, each holding the
            ``paper_id``, ``metadata`` and ``body_text`` entries read by
            ``getFileText``.

    Returns:
        A DataFrame with the columns ``paper_id`` and ``text``, one row per
        path, in input order.
    """
    records = []
    for json_path in json_paths:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, which is not UTF-8 everywhere.
        with open(json_path, 'r', encoding='utf-8') as f:
            article_json = json.load(f)
        records.append(getFileText(article_json))
    return pd.DataFrame(records, columns=['paper_id', 'text'])


# Get data
path_csv = "./data/papers19-20.csv"
path_metadata = './data/metadata.csv'

# Get metadata
metadata = pd.read_csv(path_metadata, low_memory=False)
# Drop every row that lacks a sha identifier or a publish time: without
# them a paper can neither be dated nor matched to its JSON file.
metadata = metadata.dropna(subset=['publish_time', 'sha'])
print("Got metadata")

# Keep only rows whose publish_time mentions 2019 or 2020. A single mask
# keeps the original row order and cannot duplicate a row.
is_recent = metadata['publish_time'].astype(str).str.contains('2019|2020', regex=True)
metadata_papers = metadata[is_recent]
print("Got metadata from 2019 and 2020")

# Get the paths of all files below ./data. Rows are collected in a plain
# list first: DataFrame.append copied the whole frame on every call and no
# longer exists in pandas 2.x.
records = []
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        records.append({'name': filename, 'path': os.path.join(dirname, filename)})
files = pd.DataFrame(records, columns=["name", "path"])
print("Got file paths")

# Drop irrelevant files that are not json. The test is a literal suffix
# match; the former str.contains(".json") was a regular expression in which
# "." matched any character.
files = files[files['name'].str.endswith(".json")].reset_index(drop=True)
print("Got all file paths")
#Erase extensions
def getName(string):
    """Return the part of ``string`` that precedes its first ``"."``.

    A name without any dot is returned unchanged; the previous character
    loop fell off the end and returned ``None`` in that case.
    """
    return string.partition(".")[0]
files['file_name'] = files['name'].map(getName)
print("Delete extensions")

# Mark papers from 2019 or 2020.
# NOTE(review): the CORD-19 metadata can list several hashes in one `sha`
# cell, separated by ';'. Each cell is split so that such papers are still
# matched by file name; single-hash cells behave exactly as before.
recent_shas = {
    sha.strip()
    for cell in metadata_papers['sha'].astype(str)
    for sha in cell.split(';')
}
files['sha_flag'] = files['file_name'].isin(recent_shas)
path_files = list(files[files['sha_flag']]['path'])


print("Cleaned paths without .json extension")
df_text = createDataFrame(path_files)
print("Created Dataframe")

# index=False: the positional index carries no information here.
df_text.to_csv(path_csv, index=False)
print("Saved DataFrame to ",os.path.abspath(path_csv))

### Step 2: Compute BoW vectors of the papers

__IMPORTANT:__ Since computing the BoW vectors takes a lot of time, we provide them to you. If you do not want to wait for a long time, please skip this step.

Import relevant libraries

In [None]:
import nltk
import pandas as pd
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from gensim import corpora
import os
import pickle
# Fetch the NLTK resources the cells below rely on: the stop-word list,
# WordNet (lemmatizer), the POS tagger model and the tokenizer model.
for resource in ('stopwords', 'wordnet', 'averaged_perceptron_tagger', 'punkt'):
    nltk.download(resource)

Define the paths to get the relevant objects

In [None]:
# Get data
# Input: CSV with the 2019/2020 papers (the file Step 1 writes as `path_csv`).
PATH_CSV = os.path.abspath("./data/papers19-20.csv")
# Not referenced by any of the visible cells.
PATH_INDEX = './data/similarities.index'
# Output: the gensim dictionary built from the preprocessed corpus.
PATH_DICTIONARY = './data/dictionary19-20.dict'
# Not referenced by any of the visible cells.
PATH_TFIDF = './data/tfidf.pkl'
# Output: the pickled list of BoW vectors.
PATH_PICKLE = './data/bow_corpus19-20.pickle'

Get the papers' data and tokenize it, then find the POS tags of the words (the lemmatization does not work without them) and lemmatize them.
Filter out the stop words and punctuation, as well as words that appear only once.

In [None]:
# Load the papers written by Step 1
data = pd.read_csv(PATH_CSV)
print("Got csv")
papers = data['text'].values

# Characters treated as punctuation by the filtering step further down
punctuation = ",.?!()-_\"\'\\\n\r\t;:+*<>@#§^$%&|/"

# Tokenize: one list of lower-cased tokens per paper
processed = []
for document in papers:
    tokens = word_tokenize(document)
    processed.append([token.lower() for token in tokens])
print("Tokenized")

def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word`` tagged on its own.

    The word is tagged in isolation with ``nltk.pos_tag``; the first letter
    of the resulting tag selects adjective (J), noun (N), verb (V) or
    adverb (R). Any other tag falls back to noun.
    """
    tag = nltk.pos_tag([word])[0][1]
    first_letter = tag[0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unmapped tag resolve to noun.
    return wordnet.NOUN


# Lemmatize
lemmatizer = WordNetLemmatizer()
# get_wordnet_pos tags every word in isolation, so a lemma depends only on
# the word itself: compute it once per distinct token rather than once per
# occurrence (POS tagging is by far the slowest step here).
lemma_cache = {}
lemmatized = []
for doc in processed:
    lemmas = []
    for w in doc:
        if w not in lemma_cache:
            lemma_cache[w] = lemmatizer.lemmatize(w, get_wordnet_pos(w))
        lemmas.append(lemma_cache[w])
    lemmatized.append(lemmas)
processed = lemmatized
print("Lemmatized")

# Filter
# Load the stop words once into a set; the list used to be re-read from the
# corpus and scanned linearly for every single token.
stop_words = set(stopwords.words('english'))
# `w not in punctuation` is a substring test against the punctuation string,
# kept exactly as before.
processed = [[w for w in doc if (w not in stop_words) and (w not in punctuation)] for doc in processed]
print("Filtered")

# Compute frequency of every token over the whole corpus
frequency = defaultdict(int)
for document in processed:
    for token in document:
        frequency[token] += 1
print("Frequencies computed")
# Get only words with frequency >1
processed_corpus = [[w for w in document if frequency[w] > 1] for document in processed]

Create the dictionary and get the BoW vectors. In case anything happens, save them.

In [None]:
# Build the token <-> id mapping from the filtered corpus and persist it
dictionary = corpora.Dictionary(processed_corpus)
dictionary.save(PATH_DICTIONARY)

# Create BoW vectors, one per paper
bow_corpus = []
for text in processed_corpus:
    bow_corpus.append(dictionary.doc2bow(text))
print("BoW vectors are created")

# Keep the vectors on disk so that Step 3 can reuse them
with open(PATH_PICKLE, "wb") as f:
    pickle.dump(bow_corpus, f)

### Step 3: Find the relevant sentences

Import libraries 

In [None]:
import pandas as pd
from gensim.corpora import Dictionary
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from gensim import corpora, similarities
from gensim import models
from nltk.corpus import wordnet
import nltk
import os
import pickle


Function for computing the bow_vectors of the relevant papers 

In [None]:
def getBowVectors(papers,dictionary):
    """Preprocess raw paper texts and convert them to BoW vectors.

    Each text is tokenized and lower-cased, every token is lemmatized using
    a POS tag computed for the token in isolation, stop words and
    punctuation are removed, and tokens occurring only once across
    ``papers`` are dropped before the conversion.

    Args:
        papers: Iterable of paper texts (strings).
        dictionary: Object providing ``doc2bow(tokens)``, e.g. the loaded
            gensim dictionary.

    Returns:
        A list with one ``dictionary.doc2bow`` result per paper, in input
        order.
    """
    # Tokenize
    punctuation = ",.?!()-_\"\'\\\n\r\t;:+*<>@#§^$%&|/"
    processed = [[w.lower() for w in word_tokenize(document)] for document in papers]


    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

        return tag_dict.get(tag, wordnet.NOUN)


    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    # The tag is computed per isolated word, so a lemma depends only on the
    # word: cache it instead of re-tagging every occurrence.
    lemma_cache = {}

    def lemma_of(word):
        """Return the cached lemma of ``word``, computing it on first use."""
        if word not in lemma_cache:
            lemma_cache[word] = lemmatizer.lemmatize(word, get_wordnet_pos(word))
        return lemma_cache[word]

    processed = [[lemma_of(w) for w in doc] for doc in processed]
    # Filter
    # Load the stop words once into a set; they used to be re-read and
    # scanned linearly for every token. The punctuation check stays a
    # substring test against the punctuation string.
    stop_words = set(stopwords.words('english'))
    processed = [[w for w in doc if (w not in stop_words) and (w not in punctuation)] for doc in processed]
    # Compute frequency over the papers passed in (not the whole corpus)
    frequency = defaultdict(int)
    for document in processed:
        for token in document:
            frequency[token] += 1
    # Get only words with frequency >1
    processed = [[w for w in document if frequency[w] > 1] for document in processed]
    bow_vectors = [dictionary.doc2bow(text) for text in processed]
    return bow_vectors

Set up the paths and parameters

In [None]:

#Number of relevant papers to fetch from corpus
n_papers = 10
# Despite its name, this value caps how many top tf-idf keywords are taken
# per paper in the sentence-extraction cell; it does not limit the number
# of sentences.
n_sentences = 10
#Get data
# CSV with the 2019/2020 papers.
PATH_CSV = os.path.abspath("./data/papers19-20.csv")
# Not referenced by any of the visible cells.
PATH_INDEX = './data/similarities.index'
# Saved gensim dictionary.
PATH_DICTIONARY = './data/dictionary19-20.dict'
# Not referenced by any of the visible cells.
PATH_TFIDF = './data/tfidf.pkl'
# Destination CSV for the extracted sentences.
PATH_SENTENCES = './data/query_results.csv'
# Pickled BoW vectors of the corpus.
PATH_PICKLE = './data/bow_corpus19-20.pickle'

Get the corpus' bow_vectors

In [None]:
# Restore the pickled list of BoW vectors.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# when it comes from a trusted source.
with open(PATH_PICKLE,"rb") as file :
    bow_corpus = pickle.load(file)

Get papers csv file

In [None]:
# Load the papers; the 'text' column holds the text of each paper.
data = pd.read_csv(PATH_CSV)
print("Got csv")
# Kept as a pandas Series (no .values here).
papers = data['text']

Load the dictionary and create the tf-idf

In [None]:

# Load dictionary
dictionary = Dictionary.load(PATH_DICTIONARY)
print("Loaded dictionary")
# Fit the tf-idf model on the BoW corpus loaded from the pickle.
tfidf = models.TfidfModel(bow_corpus, id2word=dictionary, normalize=True, slope=0.25)
# Similarity Index
# gensim's `Similarity` is used on purpose instead of `MatrixSimilarity`.
# NOTE(review): the original rationale was left unfinished; presumably it is
# because `Similarity` keeps its index in shards on disk while
# `MatrixSimilarity` holds the whole matrix in memory -- confirm.
index = similarities.Similarity(None, tfidf[bow_corpus], num_features=len(dictionary))

__QUERY__

In [None]:
# Free-text query; it is lower-cased and split into words by process_query.
query = "risk factor corona virus 2019"


Process the query 

In [None]:

def process_query(query):
    """Turn a free-text query into a tf-idf weighted BoW vector.

    The query is lower-cased, split on whitespace, mapped to term ids with
    the module-level ``dictionary`` and weighted with the module-level
    ``tfidf`` model.

    NOTE(review): the corpus tokens were lemmatized in Step 2 but the query
    words are not, so inflected query words may miss their dictionary
    entry -- consider applying the same lemmatization here.
    """
    # split() without an argument copes with repeated spaces, tabs and
    # newlines; split(" ") produced empty or fused tokens for those.
    tokens = query.lower().split()
    return tfidf[dictionary.doc2bow(tokens)]
bow_query = process_query(query)

Get documents similar to the query 

In [None]:
# Score the query against the index, then pair each score with its
# position and order the pairs from the highest score to the lowest.
scores = index[bow_query]
similars = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)

For each paper, find the keywords and the sentences in which they occur. Save them in a pandas DataFrame.

In [None]:
# No visible cell defines `relevant_papers` or `bow_papers`, so the loop
# below raised NameError. They are derived here from the ranking: the
# n_papers best-scoring rows of `data` and their BoW vectors.
top_doc_ids = [doc_id for doc_id, _ in similars[:n_papers]]
relevant_papers = data.iloc[top_doc_ids]
bow_papers = getBowVectors(relevant_papers['text'], dictionary)

# Rows are collected in a list and turned into a frame once at the end;
# DataFrame.append copied the frame on every call and no longer exists in
# pandas 2.x.
rows = []

#look deeper in each paper
for i, bow_paper in enumerate(bow_papers):
    # Extract the most important keywords of the document: the terms with
    # the highest tf-idf weight (at most n_sentences of them).
    ranked_terms = sorted(tfidf[bow_paper], key=lambda tup: -tup[-1])
    keywords = [dictionary[term_id] for term_id, _ in ranked_terms[:n_sentences]]

    # Get paper and split it into sentences (column 0 is the paper id,
    # column 1 the text)
    tmp_paper_id = relevant_papers.iloc[i, 0]
    tmp_paper = relevant_papers.iloc[i, 1]

    # Find the sentences containing those words. Keywords are lower-case,
    # so compare against the lower-cased sentence; otherwise a keyword at
    # the start of a sentence would be missed. The sentence itself is
    # stored unmodified, at most once.
    for sent in tmp_paper.split('.'):
        lowered = sent.lower()
        if any(k in lowered for k in keywords):
            rows.append([tmp_paper_id, sent])

df = pd.DataFrame(rows, columns=['paper_id', 'sentence'])

Save pandas dataframe 

In [None]:
# Show the collected sentences, then write them to PATH_SENTENCES.
print(df)
# No index=False here, so the row index is written as the first CSV column.
df.to_csv(PATH_SENTENCES)
print("Relevant sentences are saved into ",PATH_SENTENCES)