In [24]:
import json
from tqdm import tqdm
import os
import re

In [25]:
# Show the current working directory: the relative paths used in this notebook are resolved against it.
cwd = os.getcwd()
print(cwd)

/home/ubuntu/cvd19-truth-finder/notebooks


# Retriever

In [None]:
# Directory that is walked for the raw paper JSON files to pre-process.
RAW_DOC_PATH = "../cvd19-documents-raw/biorxiv_medrxiv/biorxiv_medrxiv"
# Directory that receives the pre-processed JSON-lines file.
PROCESSED_DOC_PATH = "../data/covid19/pre-database-documents/"

# Create Pre-Processed Files from papers

In [27]:
def iter_files(path):
    """Yield the path of every file found at or below ``path``.

    If ``path`` is a regular file it is yielded as-is. If it is a directory,
    the tree is walked with ``os.walk`` and each file is yielded joined to the
    directory it was found in.

    Raises:
        RuntimeError: when ``path`` is neither a file nor a directory. Because
            this is a generator, the error surfaces on first iteration.
    """
    if os.path.isfile(path):
        yield path
        return
    if not os.path.isdir(path):
        raise RuntimeError('Path %s is invalid' % path)
    for root, _subdirs, names in os.walk(path):
        yield from (os.path.join(root, name) for name in names)
        
def collect_text(text):
    """Combine a list of text segments into one cleaned string.

    Each element of ``text`` is indexed with ``['text']``, passed through
    ``regex_filter``, and the cleaned pieces are joined with single spaces.
    """
    cleaned_segments = []
    for segment in text:
        cleaned_segments.append(regex_filter(segment['text']))
    return " ".join(cleaned_segments)

def regex_filter(text):
    """Remove space-preceded numbers and punctuation from ``text``.

    Two passes are applied:

    1. Every number that follows a space is removed together with that space.
       Decimal or grouped digits ("3.5", "1,000") are treated as one number so
       that no stray digits are left glued to the preceding word.
    2. Every character that is not an ASCII letter, digit, space or hyphen is
       removed.

    Digits that do not follow a space (e.g. "COVID-19", or a number at the
    very start of the string) are kept.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # Consume the whole number, including ".5" / ",000" tails; otherwise the
    # tail survives this pass and ends up attached to the previous word.
    text = re.sub(r" \d+(?:[.,]\d+)*", "", text)
    return re.sub(r"[^A-Za-z0-9 -]+", "", text)

In [34]:
pre_processed_filename = "biorxiv_medrxiv.json"

# get all json papers in directory; anything that is not a .json file is skipped
# so json parsing cannot fail on it, and the list is sorted so the output order is reproducible
jsons = sorted(p for p in iter_files(RAW_DOC_PATH) if p.lower().endswith(".json"))

# make sure the output directory exists before the output file is opened
os.makedirs(PROCESSED_DOC_PATH, exist_ok=True)

# process them so they are {"id": <ID>, "text": <TEXT>}, one JSON object per line,
# as specified here: https://github.com/facebookresearch/DrQA/tree/master/scripts/retriever
with open(os.path.join(PROCESSED_DOC_PATH, pre_processed_filename), "w") as f:
    for j in tqdm(jsons):

        # reading in document; the context manager closes each paper's handle
        # instead of leaving one open file per paper
        with open(j, encoding="utf-8") as paper_file:
            document = json.load(paper_file)

        # extracting id and abstract
        doc_record = {
            "id": document['paper_id'],
            "text": collect_text(document['abstract'])
        }

        # saving to file
        f.write(json.dumps(doc_record) + "\n")
        
        

100%|██████████| 4/4 [00:00<00:00, 1058.97it/s]


# Create an SQLite Database from Pre-Processed Files

In [36]:
# Build the document database from the pre-processed JSON-lines file.
# Paths are relative to the notebook directory, like the script path itself,
# so the cell does not depend on one machine's home directory.
!python ../scripts/retriever/build_db.py \
"../data/covid19/pre-database-documents/biorxiv_medrxiv.json" \
"../data/covid19/doc-db/doc.db" \
--num-workers=1

/home/ubuntu/cvd19-truth-finder/notebooks
/home/ubuntu/cvd19-truth-finder/data/covid19/doc-db/doc.db
['.ipynb_checkpoints']
05/09/2020 02:02:57 PM: [ Reading into database... ]
  0%|                                                     | 0/1 [00:00<?, ?it/s]
1it [00:00, 1082.68it/s]
100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 552.25it/s]
05/09/2020 02:02:57 PM: [ Read 4 docs. ]
05/09/2020 02:02:57 PM: [ Committing... ]


# Create TF-IDF Model from Database

In [37]:
# Build the TF-IDF model from the document database and save it in the ranker directory.
# Paths are relative to the notebook directory, like the script path itself.
!python ../scripts/retriever/build_tfidf.py \
"../data/covid19/doc-db/doc.db" \
"../data/covid19/doc_ranker"

05/09/2020 02:16:47 PM: [ Counting words... ]
05/09/2020 02:16:47 PM: [ Mapping... ]
05/09/2020 02:16:47 PM: [ -------------------------Batch 1/4------------------------- ]
05/09/2020 02:16:47 PM: [ -------------------------Batch 2/4------------------------- ]
05/09/2020 02:16:47 PM: [ -------------------------Batch 3/4------------------------- ]
05/09/2020 02:16:47 PM: [ -------------------------Batch 4/4------------------------- ]
05/09/2020 02:16:47 PM: [ Creating sparse matrix... ]
05/09/2020 02:16:47 PM: [ Making tfidf vectors... ]
05/09/2020 02:16:49 PM: [ Getting word-doc frequencies... ]
05/09/2020 02:16:50 PM: [ Saving to /home/ubuntu/cvd19-truth-finder/data/covid19/doc_ranker/doc-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz ]


---

# Reader

### Pre-Process SQuAD Dataset

#### Train set

In [55]:
# Pre-process the SQuAD v1.1 train split with the spaCy tokenizer.
# The first path is the input directory, the second the output directory;
# both are relative to the notebook directory, like the script path itself.
!python ../scripts/reader/preprocess.py \
"../drQA-training-datasets/datasets" \
"../drQA-training-datasets/datasets" \
--split SQuAD-v1.1-train \
--tokenizer spacy

Loading dataset /home/ubuntu/cvd19-truth-finder/drQA-training-datasets/datasets/SQuAD-v1.1-train.json
Will write to file /home/ubuntu/cvd19-truth-finder/drQA-training-datasets/datasets/SQuAD-v1.1-train-processed-spacy.txt
Total time: 142.2876 (s)


#### dev set

In [57]:
# Pre-process the SQuAD v1.1 dev split with the spaCy tokenizer.
# The first path is the input directory, the second the output directory;
# both are relative to the notebook directory, like the script path itself.
!python ../scripts/reader/preprocess.py \
"../drQA-training-datasets/datasets" \
"../drQA-training-datasets/datasets" \
--split SQuAD-v1.1-dev \
--tokenizer spacy

Loading dataset /home/ubuntu/cvd19-truth-finder/drQA-training-datasets/datasets/SQuAD-v1.1-dev.json
Will write to file /home/ubuntu/cvd19-truth-finder/drQA-training-datasets/datasets/SQuAD-v1.1-dev-processed-spacy.txt
Total time: 21.2823 (s)


## Train Model

In [67]:
# Train the reader for one epoch on the pre-processed SQuAD files.
# Directories are relative to the notebook directory, like the script path itself.
# The train/dev files are given by name, the same way --embedding-file already is;
# NOTE(review): this assumes train.py joins them onto --data-dir as upstream DrQA does — confirm.
!python ../scripts/reader/train.py \
--num-epochs=1 \
--model-dir="../data/covid19/model" \
--data-dir="../drQA-training-datasets/datasets" \
--train-file="SQuAD-v1.1-train-processed-spacy.txt" \
--dev-file="SQuAD-v1.1-dev-processed-spacy.txt" \
--embed-dir="../drQA-training-datasets/datasets/" \
--embedding-file="glove.6B.50d.txt"

05/09/2020 04:41:20 PM: [ COMMAND: ../scripts/reader/train.py --num-epochs=1 --model-dir=/home/ubuntu/cvd19-truth-finder/data/covid19/model --data-dir=/home/ubuntu/cvd19-truth-finder/drQA-training-datasets/datasets --train-file=/home/ubuntu/cvd19-truth-finder/drQA-training-datasets/datasets/SQuAD-v1.1-train-processed-spacy.txt --dev-file=/home/ubuntu/cvd19-truth-finder/drQA-training-datasets/datasets/SQuAD-v1.1-dev-processed-spacy.txt --embed-dir=/home/ubuntu/cvd19-truth-finder/drQA-training-datasets/datasets/ --embedding-file=glove.6B.50d.txt ]
05/09/2020 04:41:20 PM: [ ---------------------------------------------------------------------------------------------------- ]
05/09/2020 04:41:20 PM: [ Load data files ]
05/09/2020 04:41:41 PM: [ Num train examples = 86600 ]
05/09/2020 04:41:43 PM: [ Num dev examples = 10570 ]
05/09/2020 04:41:43 PM: [ ---------------------------------------------------------------------------------------------------- ]
05/09/2020 04:41:43 PM: [ Training mod





05/09/2020 04:42:24 PM: [ train: Epoch = 0 | iter = 25/2707 | loss = 9.29 | elapsed time = 19.81 (s) ]




05/09/2020 04:42:38 PM: [ train: Epoch = 0 | iter = 50/2707 | loss = 8.39 | elapsed time = 34.08 (s) ]






05/09/2020 04:42:58 PM: [ train: Epoch = 0 | iter = 75/2707 | loss = 8.50 | elapsed time = 54.22 (s) ]




05/09/2020 04:43:13 PM: [ train: Epoch = 0 | iter = 100/2707 | loss = 7.62 | elapsed time = 69.15 (s) ]






05/09/2020 04:43:29 PM: [ train: Epoch = 0 | iter = 125/2707 | loss = 7.56 | elapsed time = 85.29 (s) ]




05/09/2020 04:43:56 PM: [ train: Epoch = 0 | iter = 150/2707 | loss = 7.57 | elapsed time = 112.24 (s) ]






05/09/2020 04:44:14 PM: [ train: Epoch = 0 | iter = 175/2707 | loss = 7.08 | elapsed time = 129.48 (s) ]




05/09/2020 04:44:36 PM: [ train: Epoch = 0 | iter = 200/2707 | loss = 7.24 | elapsed time = 151.75 (s) ]






05/09/2020 04:44:54 PM: [ train: Epoch = 0 | iter = 225/2707 | loss = 7.14 | elapsed time = 170.01 (s) ]




05/09/2020 04:45:15 PM: [ train: Epoch = 0 | iter = 250/2707 | loss = 6.93 | elapsed time = 190.44 (s) ]






05/09/2020 04:45:35 PM: [ train: Epoch = 0 | iter = 275/2707 | loss = 7.11 | elapsed time = 210.53 (s) ]




05/09/2020 04:45:55 PM: [ train: Epoch = 0 | iter = 300/2707 | loss = 6.94 | elapsed time = 230.53 (s) ]






05/09/2020 04:46:15 PM: [ train: Epoch = 0 | iter = 325/2707 | loss = 6.74 | elapsed time = 251.02 (s) ]




05/09/2020 04:46:37 PM: [ train: Epoch = 0 | iter = 350/2707 | loss = 6.80 | elapsed time = 273.15 (s) ]






05/09/2020 04:46:56 PM: [ train: Epoch = 0 | iter = 375/2707 | loss = 6.66 | elapsed time = 291.60 (s) ]




05/09/2020 04:47:16 PM: [ train: Epoch = 0 | iter = 400/2707 | loss = 6.62 | elapsed time = 311.87 (s) ]






05/09/2020 04:47:35 PM: [ train: Epoch = 0 | iter = 425/2707 | loss = 6.83 | elapsed time = 330.71 (s) ]




05/09/2020 04:47:55 PM: [ train: Epoch = 0 | iter = 450/2707 | loss = 6.68 | elapsed time = 350.89 (s) ]






05/09/2020 04:48:14 PM: [ train: Epoch = 0 | iter = 475/2707 | loss = 6.51 | elapsed time = 369.51 (s) ]




05/09/2020 04:48:35 PM: [ train: Epoch = 0 | iter = 500/2707 | loss = 6.44 | elapsed time = 391.11 (s) ]






05/09/2020 04:48:56 PM: [ train: Epoch = 0 | iter = 525/2707 | loss = 6.46 | elapsed time = 411.40 (s) ]




05/09/2020 04:49:14 PM: [ train: Epoch = 0 | iter = 550/2707 | loss = 6.22 | elapsed time = 429.76 (s) ]






05/09/2020 04:49:35 PM: [ train: Epoch = 0 | iter = 575/2707 | loss = 6.57 | elapsed time = 451.15 (s) ]




05/09/2020 04:49:55 PM: [ train: Epoch = 0 | iter = 600/2707 | loss = 6.53 | elapsed time = 471.03 (s) ]






05/09/2020 04:50:17 PM: [ train: Epoch = 0 | iter = 625/2707 | loss = 6.57 | elapsed time = 493.16 (s) ]




05/09/2020 04:50:36 PM: [ train: Epoch = 0 | iter = 650/2707 | loss = 6.08 | elapsed time = 512.15 (s) ]






05/09/2020 04:50:55 PM: [ train: Epoch = 0 | iter = 675/2707 | loss = 6.50 | elapsed time = 530.72 (s) ]




05/09/2020 04:51:15 PM: [ train: Epoch = 0 | iter = 700/2707 | loss = 6.20 | elapsed time = 550.60 (s) ]






05/09/2020 04:51:35 PM: [ train: Epoch = 0 | iter = 725/2707 | loss = 6.20 | elapsed time = 570.87 (s) ]




05/09/2020 04:51:55 PM: [ train: Epoch = 0 | iter = 750/2707 | loss = 6.27 | elapsed time = 590.87 (s) ]






05/09/2020 04:52:13 PM: [ train: Epoch = 0 | iter = 775/2707 | loss = 6.17 | elapsed time = 608.61 (s) ]




05/09/2020 04:52:33 PM: [ train: Epoch = 0 | iter = 800/2707 | loss = 6.21 | elapsed time = 628.71 (s) ]






05/09/2020 04:52:51 PM: [ train: Epoch = 0 | iter = 825/2707 | loss = 6.17 | elapsed time = 646.96 (s) ]




05/09/2020 04:53:13 PM: [ train: Epoch = 0 | iter = 850/2707 | loss = 6.24 | elapsed time = 668.55 (s) ]






05/09/2020 04:53:34 PM: [ train: Epoch = 0 | iter = 875/2707 | loss = 6.05 | elapsed time = 689.54 (s) ]




05/09/2020 04:53:53 PM: [ train: Epoch = 0 | iter = 900/2707 | loss = 6.19 | elapsed time = 708.42 (s) ]






05/09/2020 04:54:14 PM: [ train: Epoch = 0 | iter = 925/2707 | loss = 6.10 | elapsed time = 729.83 (s) ]




05/09/2020 04:54:33 PM: [ train: Epoch = 0 | iter = 950/2707 | loss = 6.03 | elapsed time = 749.19 (s) ]






05/09/2020 04:54:53 PM: [ train: Epoch = 0 | iter = 975/2707 | loss = 6.18 | elapsed time = 768.46 (s) ]




05/09/2020 04:55:12 PM: [ train: Epoch = 0 | iter = 1000/2707 | loss = 6.05 | elapsed time = 788.10 (s) ]






05/09/2020 04:55:31 PM: [ train: Epoch = 0 | iter = 1025/2707 | loss = 5.99 | elapsed time = 807.07 (s) ]




05/09/2020 04:55:50 PM: [ train: Epoch = 0 | iter = 1050/2707 | loss = 5.82 | elapsed time = 826.09 (s) ]






05/09/2020 04:56:10 PM: [ train: Epoch = 0 | iter = 1075/2707 | loss = 5.77 | elapsed time = 845.38 (s) ]




05/09/2020 04:56:34 PM: [ train: Epoch = 0 | iter = 1100/2707 | loss = 6.09 | elapsed time = 870.12 (s) ]






05/09/2020 04:56:53 PM: [ train: Epoch = 0 | iter = 1125/2707 | loss = 5.61 | elapsed time = 889.18 (s) ]




05/09/2020 04:57:12 PM: [ train: Epoch = 0 | iter = 1150/2707 | loss = 5.76 | elapsed time = 908.20 (s) ]






05/09/2020 04:57:32 PM: [ train: Epoch = 0 | iter = 1175/2707 | loss = 5.57 | elapsed time = 927.40 (s) ]




05/09/2020 04:57:51 PM: [ train: Epoch = 0 | iter = 1200/2707 | loss = 5.76 | elapsed time = 946.37 (s) ]






05/09/2020 04:58:12 PM: [ train: Epoch = 0 | iter = 1225/2707 | loss = 6.01 | elapsed time = 968.29 (s) ]




05/09/2020 04:58:32 PM: [ train: Epoch = 0 | iter = 1250/2707 | loss = 5.83 | elapsed time = 987.87 (s) ]






05/09/2020 04:58:52 PM: [ train: Epoch = 0 | iter = 1275/2707 | loss = 5.78 | elapsed time = 1007.43 (s) ]




05/09/2020 04:59:09 PM: [ train: Epoch = 0 | iter = 1300/2707 | loss = 5.69 | elapsed time = 1024.78 (s) ]






05/09/2020 04:59:28 PM: [ train: Epoch = 0 | iter = 1325/2707 | loss = 5.78 | elapsed time = 1044.26 (s) ]




05/09/2020 04:59:48 PM: [ train: Epoch = 0 | iter = 1350/2707 | loss = 5.71 | elapsed time = 1063.87 (s) ]






05/09/2020 05:00:08 PM: [ train: Epoch = 0 | iter = 1375/2707 | loss = 5.40 | elapsed time = 1083.90 (s) ]




05/09/2020 05:00:30 PM: [ train: Epoch = 0 | iter = 1400/2707 | loss = 5.91 | elapsed time = 1105.36 (s) ]






05/09/2020 05:00:50 PM: [ train: Epoch = 0 | iter = 1425/2707 | loss = 5.64 | elapsed time = 1126.07 (s) ]




05/09/2020 05:01:09 PM: [ train: Epoch = 0 | iter = 1450/2707 | loss = 5.78 | elapsed time = 1145.14 (s) ]






05/09/2020 05:01:30 PM: [ train: Epoch = 0 | iter = 1475/2707 | loss = 5.60 | elapsed time = 1165.79 (s) ]




05/09/2020 05:01:53 PM: [ train: Epoch = 0 | iter = 1500/2707 | loss = 5.74 | elapsed time = 1188.77 (s) ]






05/09/2020 05:02:10 PM: [ train: Epoch = 0 | iter = 1525/2707 | loss = 5.59 | elapsed time = 1206.15 (s) ]




05/09/2020 05:02:30 PM: [ train: Epoch = 0 | iter = 1550/2707 | loss = 5.78 | elapsed time = 1225.34 (s) ]






05/09/2020 05:02:49 PM: [ train: Epoch = 0 | iter = 1575/2707 | loss = 5.82 | elapsed time = 1244.88 (s) ]




05/09/2020 05:03:10 PM: [ train: Epoch = 0 | iter = 1600/2707 | loss = 5.61 | elapsed time = 1265.78 (s) ]






05/09/2020 05:03:31 PM: [ train: Epoch = 0 | iter = 1625/2707 | loss = 5.42 | elapsed time = 1287.06 (s) ]




05/09/2020 05:03:51 PM: [ train: Epoch = 0 | iter = 1650/2707 | loss = 5.47 | elapsed time = 1306.34 (s) ]






05/09/2020 05:04:10 PM: [ train: Epoch = 0 | iter = 1675/2707 | loss = 5.62 | elapsed time = 1326.05 (s) ]




05/09/2020 05:04:45 PM: [ train: Epoch = 0 | iter = 1700/2707 | loss = 5.66 | elapsed time = 1361.10 (s) ]






05/09/2020 05:05:07 PM: [ train: Epoch = 0 | iter = 1725/2707 | loss = 5.52 | elapsed time = 1383.03 (s) ]




05/09/2020 05:05:27 PM: [ train: Epoch = 0 | iter = 1750/2707 | loss = 5.45 | elapsed time = 1402.31 (s) ]






05/09/2020 05:05:46 PM: [ train: Epoch = 0 | iter = 1775/2707 | loss = 5.40 | elapsed time = 1421.31 (s) ]




05/09/2020 05:06:03 PM: [ train: Epoch = 0 | iter = 1800/2707 | loss = 5.48 | elapsed time = 1439.20 (s) ]






05/09/2020 05:06:22 PM: [ train: Epoch = 0 | iter = 1825/2707 | loss = 5.51 | elapsed time = 1457.76 (s) ]




05/09/2020 05:06:47 PM: [ train: Epoch = 0 | iter = 1850/2707 | loss = 5.52 | elapsed time = 1482.92 (s) ]






05/09/2020 05:07:06 PM: [ train: Epoch = 0 | iter = 1875/2707 | loss = 5.31 | elapsed time = 1501.72 (s) ]




05/09/2020 05:07:31 PM: [ train: Epoch = 0 | iter = 1900/2707 | loss = 5.21 | elapsed time = 1526.46 (s) ]






05/09/2020 05:07:53 PM: [ train: Epoch = 0 | iter = 1925/2707 | loss = 5.46 | elapsed time = 1548.44 (s) ]




05/09/2020 05:08:13 PM: [ train: Epoch = 0 | iter = 1950/2707 | loss = 5.75 | elapsed time = 1568.86 (s) ]






05/09/2020 05:08:36 PM: [ train: Epoch = 0 | iter = 1975/2707 | loss = 5.66 | elapsed time = 1591.76 (s) ]




05/09/2020 05:08:55 PM: [ train: Epoch = 0 | iter = 2000/2707 | loss = 5.19 | elapsed time = 1610.33 (s) ]






05/09/2020 05:09:18 PM: [ train: Epoch = 0 | iter = 2025/2707 | loss = 5.62 | elapsed time = 1633.54 (s) ]




05/09/2020 05:09:51 PM: [ train: Epoch = 0 | iter = 2050/2707 | loss = 5.19 | elapsed time = 1666.75 (s) ]






05/09/2020 05:10:10 PM: [ train: Epoch = 0 | iter = 2075/2707 | loss = 5.50 | elapsed time = 1685.73 (s) ]




05/09/2020 05:10:32 PM: [ train: Epoch = 0 | iter = 2100/2707 | loss = 5.53 | elapsed time = 1708.24 (s) ]






05/09/2020 05:10:52 PM: [ train: Epoch = 0 | iter = 2125/2707 | loss = 5.45 | elapsed time = 1728.21 (s) ]




05/09/2020 05:11:11 PM: [ train: Epoch = 0 | iter = 2150/2707 | loss = 5.39 | elapsed time = 1746.55 (s) ]






05/09/2020 05:11:33 PM: [ train: Epoch = 0 | iter = 2175/2707 | loss = 5.63 | elapsed time = 1769.24 (s) ]




05/09/2020 05:11:54 PM: [ train: Epoch = 0 | iter = 2200/2707 | loss = 5.51 | elapsed time = 1790.12 (s) ]






05/09/2020 05:12:16 PM: [ train: Epoch = 0 | iter = 2225/2707 | loss = 5.47 | elapsed time = 1812.18 (s) ]




05/09/2020 05:12:43 PM: [ train: Epoch = 0 | iter = 2250/2707 | loss = 5.43 | elapsed time = 1838.48 (s) ]






05/09/2020 05:13:02 PM: [ train: Epoch = 0 | iter = 2275/2707 | loss = 5.37 | elapsed time = 1857.38 (s) ]




05/09/2020 05:13:21 PM: [ train: Epoch = 0 | iter = 2300/2707 | loss = 5.58 | elapsed time = 1877.13 (s) ]






05/09/2020 05:13:42 PM: [ train: Epoch = 0 | iter = 2325/2707 | loss = 5.39 | elapsed time = 1898.23 (s) ]




05/09/2020 05:14:02 PM: [ train: Epoch = 0 | iter = 2350/2707 | loss = 5.47 | elapsed time = 1917.61 (s) ]






05/09/2020 05:14:31 PM: [ train: Epoch = 0 | iter = 2375/2707 | loss = 5.43 | elapsed time = 1947.16 (s) ]




05/09/2020 05:14:53 PM: [ train: Epoch = 0 | iter = 2400/2707 | loss = 5.52 | elapsed time = 1968.30 (s) ]






05/09/2020 05:15:13 PM: [ train: Epoch = 0 | iter = 2425/2707 | loss = 5.54 | elapsed time = 1988.31 (s) ]




05/09/2020 05:15:33 PM: [ train: Epoch = 0 | iter = 2450/2707 | loss = 5.40 | elapsed time = 2008.38 (s) ]






05/09/2020 05:15:53 PM: [ train: Epoch = 0 | iter = 2475/2707 | loss = 5.20 | elapsed time = 2029.03 (s) ]




05/09/2020 05:16:14 PM: [ train: Epoch = 0 | iter = 2500/2707 | loss = 5.25 | elapsed time = 2049.63 (s) ]






05/09/2020 05:16:33 PM: [ train: Epoch = 0 | iter = 2525/2707 | loss = 5.35 | elapsed time = 2068.48 (s) ]




05/09/2020 05:16:53 PM: [ train: Epoch = 0 | iter = 2550/2707 | loss = 5.43 | elapsed time = 2089.00 (s) ]






05/09/2020 05:17:14 PM: [ train: Epoch = 0 | iter = 2575/2707 | loss = 5.40 | elapsed time = 2109.39 (s) ]




05/09/2020 05:17:34 PM: [ train: Epoch = 0 | iter = 2600/2707 | loss = 5.39 | elapsed time = 2129.64 (s) ]






05/09/2020 05:17:56 PM: [ train: Epoch = 0 | iter = 2625/2707 | loss = 5.52 | elapsed time = 2151.78 (s) ]




05/09/2020 05:18:14 PM: [ train: Epoch = 0 | iter = 2650/2707 | loss = 5.46 | elapsed time = 2169.44 (s) ]






05/09/2020 05:18:32 PM: [ train: Epoch = 0 | iter = 2675/2707 | loss = 5.08 | elapsed time = 2188.20 (s) ]




05/09/2020 05:18:53 PM: [ train: Epoch = 0 | iter = 2700/2707 | loss = 5.35 | elapsed time = 2208.64 (s) ]


05/09/2020 05:18:57 PM: [ train: Epoch 0 done. Time for epoch = 2212.70 (s) ]






























































05/09/2020 05:20:10 PM: [ train valid unofficial: Epoch = 0 | start = 39.80 | end = 43.75 | exact = 31.87 | examples = 10016 | valid time = 73.21 (s) ]
















05/09/2020 05:21:18 PM: [ dev valid unofficial: Epoch = 0 | start = 46.06 | end = 49.62 | exact = 39.00 | examples = 10570 | valid time = 67.63 (s) ]


















05/09/2020 05:22:27 PM: [ dev valid official: Epoch = 0 | EM = 40.98 | F1 = 51.55 | examples = 10570 | valid time = 69.38 (s) ]
05/09/2020 05:22:27 PM: [ Best valid: f1 = 51.55 (epoch 0, 2707 updates) ]


# Running Full Pipeline

In [16]:
import torch
import argparse
import code
import prettytable

from termcolor import colored
from drqa import pipeline
from drqa.retriever import utils

if __name__ == '__main__':
    # Arguments
    # Paths are relative to the notebook directory (like the script paths used
    # in the shell cells) instead of one machine's absolute home directory.
    args_reader_model="../data/covid19/model/20200509-c9d21e14.mdl"
    args_retriever_model="../data/covid19/doc_ranker/doc-tfidf-ngram=2-hash=16777216-tokenizer=simple.npz"
    args_doc_db="../data/covid19/doc-db/doc.db"
    args_tokenizer="spacy"
    args_candidate_file=None
    args_no_cuda=True
    args_gpu=-1

    # CUDA is used only when it is both requested and available.
    args_cuda = not args_no_cuda and torch.cuda.is_available()
    if args_cuda:
        torch.cuda.set_device(args_gpu)

    # Optional candidate list: one candidate per line, normalized and lower-cased.
    if args_candidate_file:
        candidates = set()
        with open(args_candidate_file) as f:
            for line in f:
                line = utils.normalize(line.strip()).lower()
                candidates.add(line)
    else:
        candidates = None

    DrQA = pipeline.DrQA(
        cuda=args_cuda,
        fixed_candidates=candidates,
        reader_model=args_reader_model,
        ranker_config={'options': {'tfidf_path': args_retriever_model}},
        db_config={'options': {'db_path': args_doc_db}},
        tokenizer=args_tokenizer
    )

    # ------------------------------------------------------------------------------
    # Ask a question and pretty-print the predictions
    # ------------------------------------------------------------------------------


    def process(question, candidates=None, top_n=1, n_docs=5):
        """Run ``question`` through the pipeline and print the predictions.

        The arguments are forwarded positionally to ``DrQA.process`` together
        with ``return_context=True``. The predictions are printed as a table
        (rank, answer span, document id, answer score, document score),
        followed by each prediction's context text with the span between the
        context's ``start`` and ``end`` offsets highlighted in bold green.

        Args:
            question: the question string.
            candidates: forwarded to ``DrQA.process``; defaults to ``None``.
            top_n: forwarded to ``DrQA.process``; defaults to 1.
            n_docs: forwarded to ``DrQA.process``; defaults to 5.
        """
        predictions = DrQA.process(
            question, candidates, top_n, n_docs, return_context=True
        )
        table = prettytable.PrettyTable(
            ['Rank', 'Answer', 'Doc', 'Answer Score', 'Doc Score']
        )
        # Ranks are 1-based in the printed table.
        for i, p in enumerate(predictions, 1):
            table.add_row([i, p['span'], p['doc_id'],
                           '%.5g' % p['span_score'],
                           '%.5g' % p['doc_score']])
        print('Top Predictions:')
        print(table)
        print('\nContexts:')
        for p in predictions:
            text = p['context']['text']
            start = p['context']['start']
            end = p['context']['end']
            # Re-assemble the context with only the answer span coloured.
            output = (text[:start] +
                      colored(text[start: end], 'green', attrs=['bold']) +
                      text[end:])
            print('[ Doc = %s ]' % p['doc_id'])
            print(output + '\n')

    process("Who is infected?")

processing paragraph: 0


Process ForkPoolWorker-16:
Process ForkPoolWorker-19:
Process ForkPoolWorker-18:
Process ForkPoolWorker-23:
Process ForkPoolWorker-21:
Process ForkPoolWorker-20:
Process ForkPoolWorker-22:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/ubuntu/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ubuntu/anaconda3/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home

KeyboardInterrupt: 

  File "/home/ubuntu/anaconda3/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
  File "/home/ubuntu/anaconda3/lib/python3.6/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt
