# Kaggle COVID 19 CORD-19-research-challenge
https://www.kaggle.com/allen-institute-for-ai/CORD-19-research-challenge

## 1. Prepare Environment and download data

In [0]:
# connect to drive
%cd "/content/drive/My Drive"
%mkdir covid19
%cd covid19

# clone repo for codes
%mkdir code 
%mkdir model
%cd code 

!git clone https://github.com/dmis-lab/bioasq-biobert.git
!git clone https://github.com/facebookresearch/DrQA.git
%cd bioasq-biobert
!pip install -r requirements.txt

# download data
!pip install kaggle
%cd ..
%cd ..
%mkdir data 
%cd data
%cd /root
!mkdir .kaggle
%cd .kaggle
! echo -e '{"username":"ari994","key":"0a0aa1fca6f950db4ee7c046894640b7"}' >> kaggle.json
%cd "/content/drive/My Drive/covid19/data"
!kaggle datasets download allen-institute-for-ai/CORD-19-research-challenge
!unzip CORD-19-research-challenge.zip
%cd /content/drive/My Drive/covid19/code/DrQA
!pip install -r requirements.txt
!python setup.py develop

In [0]:
# Relative paths used below (the JSON folder and the output text files) resolve
# against this data directory.
%cd "/content/drive/My Drive/covid19/data"
import os
import json 
from tqdm import tqdm

# Folder holding one JSON file per paper, relative to the data directory.
path_to_jsons = "2020-03-13/comm_use_subset/comm_use_subset"

# File names inside that folder; this list is what the extract_* functions iterate over.
jsons = os.listdir(path_to_jsons)

def extract_abstracts(jsons, out="abstracts.txt"):
  """Write one "title<TAB>abstract" line for every paper that has both.

  Args:
    jsons: iterable of JSON file names located in the module-level
      `path_to_jsons` directory.
    out: path of the tab-separated output file (overwritten).

  Papers with an empty title or an empty abstract list are skipped.
  Prints the number of papers written.
  """
  n_processed = 0
  # Context managers close every handle, including when a paper fails to parse.
  with open(out, "w", encoding="utf-8") as f:
    for name in tqdm(jsons):
      with open(path_to_jsons+"/"+name, encoding="utf-8") as src:
        paper = json.load(src)
      # Collapse all whitespace runs: a tab or newline inside the title would
      # otherwise break the one-record-per-line, tab-separated layout.
      t = " ".join(paper['metadata']['title'].split())
      if len(paper['abstract']) == 0 or len(t) == 0:
        continue
      n_processed += 1

      abst = ""
      for a in paper['abstract']:
        # Same whitespace normalisation for every abstract paragraph.
        abst += " ".join(a['text'].split())+" "
      f.write(t+"\t"+abst+"\n")
  print(n_processed)

def extract_text(jsons, out="text.txt"):
  """Write the abstract of every titled paper as "id|title|text" lines.

  Abstract paragraphs are accumulated into a chunk; once a chunk reaches
  `thresh_words` space-separated tokens it is written out and a new chunk is
  started. The remaining chunk of each paper is always written, so every
  titled paper yields at least one line. Chunk ids are "<paper_id>_<n>" with
  n starting at 1.

  Args:
    jsons: iterable of JSON file names located in the module-level
      `path_to_jsons` directory.
    out: path of the output file (overwritten).

  Papers with an empty title are skipped. Prints the number of papers written.
  """
  n_processed = 0
  thresh_words = 300
  # Context managers close every handle, including when a paper fails to parse.
  with open(out, "w", encoding="utf-8") as f:
    for name in tqdm(jsons):
      with open(path_to_jsons+"/"+name, encoding="utf-8") as src:
        paper = json.load(src)
      # "|" separates fields and "\n" separates records, so neither may appear
      # inside the title: replace pipes and collapse all whitespace runs.
      t = " ".join(paper['metadata']['title'].replace("|", " ").split())
      if len(t) == 0:
        continue
      n_processed += 1

      text = ""
      ix = 1
      for a in paper['abstract']:
        # Collapse whitespace so an embedded newline cannot split a record.
        text += " ".join(a['text'].split())+" "
        if len(text.split(" ")) >= thresh_words:
          f.write(paper['paper_id']+"_"+str(ix)+"|"+t+"|"+text+"\n")
          text = ""
          ix += 1
      # Body-text chunking was commented out in the original and stays disabled:
      # for b in paper['body_text']:
      #   text += b['text'].strip()
      #   if len(text.split(" ")) >= thresh_words:
      #     f.write(paper['paper_id']+"_"+str(ix)+"|"+t+"|"+text+"\n")
      #     text = ""
      #     ix += 1
      f.write(paper['paper_id']+"_"+str(ix)+"|"+t+"|"+text+"\n")
  print(n_processed)

# The abstract-only TSV export is disabled; only the chunked text file is generated.
# extract_abstracts(jsons)
extract_text(jsons)

In [0]:
# Convert the extracted text to the DrQA document retriever format
def convert_to_drqa_retriver(text_fn, out_fn="/content/drive/My Drive/covid19/data/data_drqa_retriver.json"):
  """Convert "id|title|text" lines into JSON lines of the form {"id", "text"}.

  Each output record has id "<id> | <title>" and the stripped text as "text".

  Args:
    text_fn: path of the pipe-separated input file.
    out_fn: path of the JSON-lines output file (overwritten). Defaults to the
      location the rest of the notebook reads from.

  Lines with fewer than three fields (for example blank lines) are skipped.
  Prints the number of records written, and the number skipped if any.
  """
  n_line = 0
  n_skipped = 0
  with open(text_fn, encoding="utf-8") as f, open(out_fn, "w", encoding="utf-8") as fn:
    for line in f:
      # maxsplit=2: only the first two "|" are field separators, so a pipe
      # inside the text no longer truncates the content.
      fields = line.rstrip("\n").split("|", 2)
      if len(fields) < 3:
        n_skipped += 1
        continue
      n_line += 1
      d_id = fields[0] + " | " + fields[1]
      fn.write(json.dumps({"id": d_id, "text": fields[2].strip()})+"\n")
  print(n_line,"processed.")
  if n_skipped:
    print(n_skipped,"malformed line(s) skipped.")

# Convert text.txt (the default output name of extract_text) into the JSON-lines
# file that the build_db.py step further down takes as input.
convert_to_drqa_retriver("/content/drive/My Drive/covid19/data/text.txt")

## 3. Train Document Ranker

### 3.1 Tf-Idf based DrQA ranker

https://github.com/facebookresearch/DrQA/tree/master/scripts/retriever

In [0]:
# Run the DrQA retriever scripts from their own directory.
%cd "/content/drive/My Drive/covid19/code/DrQA/scripts/retriever"
# Delete any existing doc.db before it is rebuilt
# (presumably build_db.py will not overwrite an existing file - TODO confirm).
!rm -f "/content/drive/My Drive/covid19/data/doc.db"
# Build the document database from the JSON-lines file written by convert_to_drqa_retriver.
!python build_db.py "/content/drive/My Drive/covid19/data/data_drqa_retriver.json" "/content/drive/My Drive/covid19/data/doc.db" --num-workers 6
# Output directory for the TF-IDF model built from doc.db.
!mkdir "/content/drive/My Drive/covid19/model/drqa_doc_ranker"
!python build_tfidf.py "/content/drive/My Drive/covid19/data/doc.db" "/content/drive/My Drive/covid19/model/drqa_doc_ranker" --num-workers 6
#test doc ranker
%cd "/content/drive/My Drive/covid19/code/DrQA/scripts/retriever"
!python interactive.py --model "/content/drive/My Drive/covid19/model/drqa_doc_ranker/doc-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz"

### 3.2 Lucene Ranker

In [0]:
%cd "/content/drive/My Drive/covid19/data/"
# Path of the JSON-lines file written by convert_to_drqa_retriver.
# NOTE(review): nothing else in this cell uses it; the Lucene ranker itself is not implemented here.
data = "/content/drive/My Drive/covid19/data/data_drqa_retriver.json"



## 4. Prepare data for QnA

In [0]:
# download data and config
%cd "/content/drive/My Drive/covid19/data"

# Fetch the gdown.pl helper script, which is used below to download files from Google Drive links.
!wget https://raw.githubusercontent.com/circulosmeos/gdown.pl/master/gdown.pl
# NOTE(review): 777 also makes the downloaded script world-writable; only execute permission is needed to run it below.
!chmod 777 gdown.pl

# Saved as bert_config.json and vocab.txt in the data directory; the run_factoid.py
# cells pass these two paths as --bert_config_file and --vocab_file.
!./gdown.pl "https://drive.google.com/file/d/17fX1-oChZ5rxu-e-JuaZl2I96q1dGJO4/view" bert_config.json
!./gdown.pl "https://drive.google.com/file/d/1GQUvBbXvlI_PeUPsZTqh7xQDZMOXh7ko/view" vocab.txt


In [0]:
#download pretrained model
%cd "/content/drive/My Drive/covid19/model"
# A second copy of gdown.pl is fetched because the working directory is now the model folder.
!wget https://raw.githubusercontent.com/circulosmeos/gdown.pl/master/gdown.pl
!chmod 777 gdown.pl
# Download the archive, unpack it into the model directory, then delete the archive.
!./gdown.pl "https://drive.google.com/uc?id=1rXFQRcV69QHAxghQ3NeAlhkg6ykpflVK&export=download" BERT-pubmed-1000000-SQuAD.tar.gz
!tar -xvf BERT-pubmed-1000000-SQuAD.tar.gz
!rm -f BERT-pubmed-1000000-SQuAD.tar.gz

## 5. Train BERT QnA

https://github.com/dmis-lab/bioasq-biobert

In [0]:
%cd "/content/drive/My Drive/covid19/code/bioasq-biobert"
!export BIOBERT_DIR="/content/drive/My Drive/covid19/model"
!export BIOASQ_DIR="/content/drive/My Drive/covid19/data"

!python run_factoid.py \
     --do_train=True \
     --do_predict=True \
     --vocab_file="/content/drive/My Drive/covid19/data/vocab.txt" \
     --bert_config_file="/content/drive/My Drive/covid19/data/bert_config.json" \
     --init_checkpoint="/content/drive/My Drive/covid19/model/model.ckpt-14599" \
     --max_seq_length=384 \
     --train_batch_size=4 \
     --learning_rate=5e-6 \
     --doc_stride=128 \
     --num_train_epochs=5.0 \
     --do_lower_case=False \
     --train_file="/content/drive/My Drive/covid19/data/BioASQ-6b/train/Full-Abstract/BioASQ-train-factoid-6b-full-annotated.json" \
     --predict_file="/content/drive/My Drive/covid19/data/BioASQ-6b/test/Full-Abstract/BioASQ-test-factoid-6b-3.json" \
     --output_dir="/content/drive/My Drive/covid19/output"

## 6. Infer QnA


In [0]:
%cd "/content/drive/My Drive/covid19/code"

template = """
{
    "version": "BioASQ6b", 
    "data": [
      {
        "title": "BioASQ6b", 
        "paragraphs": [
          {
            "context": "[[CONTENT]]", 
            "qas": [
              {
                "question": "[[QUERY]]", 
                "id": "xx01"
              }
            ]
          }
        ]
        }
    ]
}
"""
con = input("Enter Context:")
q = input("Query:")
template = template.replace("[[CONTENT]]",con)
template = template.replace("[[QUERY]]",q)

f = open('temp.json','w')
f.write(template)
f.close()
%cd "/content/drive/My Drive/covid19/code/bioasq-biobert"
!python run_factoid.py \
     --do_train=False \
     --do_predict=True \
     --vocab_file="/content/drive/My Drive/covid19/data/vocab.txt" \
     --bert_config_file="/content/drive/My Drive/covid19/data/bert_config.json" \
     --init_checkpoint="/content/drive/My Drive/covid19/model/model.ckpt-14599" \
     --max_seq_length=384 \
     --train_batch_size=6 \
     --learning_rate=5e-6 \
     --doc_stride=128 \
     --num_train_epochs=5.0 \
     --do_lower_case=False \
     --predict_file="/content/drive/My Drive/covid19/code/temp.json" \
     --output_dir="/content/drive/My Drive/covid19/output"

import json
pred = json.loads(open('/content/drive/My Drive/covid19/output/predictions.json').read())

print("\n\n")
print("Question:",q)
for k in pred:
  print("Answer:",pred[k])



## 7. Doc Ranker + Question Answering

In [0]:
%cd /content/drive/My Drive/covid19/code/DrQA
!pip install -r requirements.txt
!pip install elasticsearch==6.0
!python setup.py develop

import json
from drqa import retriever
import numpy as np 

np_load_old = np.load


#numpy fix
np.load = lambda *a,**k: np_load_old(*a, allow_pickle=True, **k)

ranker = retriever.get_class('tfidf')(tfidf_path="/content/drive/My Drive/covid19/model/drqa_doc_ranker/doc-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz")
np.load = np_load_oldf = 
def load_db(path="/content/drive/My Drive/covid19/data/data_drqa_retriver.json"):
  """Load a JSON-lines document file into a dict mapping id -> text.

  Args:
    path: file with one JSON object per line, each having "id" and "text" keys.

  Returns:
    dict of document id to document text. If an id occurs more than once the
    last occurrence wins. Blank lines are ignored.
  """
  db = {}
  # The context manager closes the file once loading finishes or fails.
  with open(path, encoding="utf-8") as x:
    for line in x:
      if not line.strip():
        # A blank line is not a JSON document and would make json.loads raise.
        continue
      j = json.loads(line)
      db[j['id']]=j['text']
  print()
  return db

In [0]:
# In-memory map of document id -> text; call() looks up each retrieved id in it.
db = load_db()
%cd "/content/drive/My Drive/covid19/code"
import os
import json
import subprocess
import sys

def run(cmd):
    """Execute `cmd` through the shell and wait for it to finish.

    Returns a (returncode, stdout, stderr) tuple; both streams are captured
    in full and returned as bytes.
    """
    completed = subprocess.run(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return completed.returncode, completed.stdout, completed.stderr

def call(query,k=5):
  """Retrieve the k closest documents for `query` and extract an answer from each.

  For every document id returned by the module-level `ranker`, the document
  text is looked up in the module-level `db`, written with the query to
  temp.json as a single-question BioASQ request, and run_factoid.py is run on
  it in predict-only mode through `run`. The first value in the resulting
  predictions.json is printed in a one-row table (Rank, Answer, Doc Id).

  Args:
    query: the question text.
    k: number of documents to retrieve.

  A document whose run_factoid.py run fails is reported (with the captured
  stderr) and skipped instead of aborting the remaining documents.
  """
  import prettytable
  predict_fn = '/content/drive/My Drive/covid19/code/temp.json'
  pred_fn = '/content/drive/My Drive/covid19/output/predictions.json'
  cmd = 'python "/content/drive/My Drive/covid19/code/bioasq-biobert/run_factoid.py" --do_train=False --do_predict=True --vocab_file="/content/drive/My Drive/covid19/data/vocab.txt" --bert_config_file="/content/drive/My Drive/covid19/data/bert_config.json" --init_checkpoint="/content/drive/My Drive/covid19/output/model.ckpt-1000" --max_seq_length=384 --train_batch_size=14 --learning_rate=5e-6 --doc_stride=128 --num_train_epochs=5.0 --do_lower_case=False --predict_file="/content/drive/My Drive/covid19/code/temp.json" --output_dir="/content/drive/My Drive/covid19/output"'
  doc_names, _ = ranker.closest_docs(query, k)
  for rank, d in enumerate(doc_names, start=1):
    # Serialise with json so quotes, backslashes and newlines in the document
    # or the query are escaped. The former text-template substitution needed
    # the quotes stripped from the document and still broke on a quoted query.
    request = {
      "version": "BioASQ6b",
      "data": [
        {
          "title": "BioASQ6b",
          "paragraphs": [
            {
              "context": db[d],
              "qas": [
                {
                  "question": query,
                  "id": "xx01"
                }
              ]
            }
          ]
        }
      ]
    }
    with open(predict_fn,'w') as f:
      json.dump(request, f)
    # Drop any leftover predictions so a failed run cannot be mistaken for a
    # fresh result.
    if os.path.exists(pred_fn):
      os.remove(pred_fn)
    code, out, err = run(cmd)
    if code != 0 or not os.path.exists(pred_fn):
      print(err.decode("utf-8", errors="replace"))
      print("run_factoid.py failed for doc", d, "- skipping")
      continue
    with open(pred_fn) as f:
      pred = json.load(f)
    os.remove(pred_fn)
    # The request holds a single question, so the first value is its answer.
    answer = next(iter(pred.values()), None)
    table = prettytable.PrettyTable(
        ['Rank', 'Answer', 'Doc Id']
    )
    table.add_row([rank,answer,d])
    print(table)

In [0]:
# Example question: retrieve 10 documents and print the extracted answer for each.
query="How much risk a smoker have compared to a non smoker for covid ?"
call(query, k=10)

In [0]:
# NOTE(review): `top` is an interactive program and does not exit on its own, so this
# cell blocks until interrupted; it looks like a leftover resource check - consider removing.
!top