In [1]:
!pip install cdqa

Collecting cdqa
  Downloading cdqa-1.3.9.tar.gz (45 kB)
[K     |████████████████████████████████| 45 kB 427 kB/s 
Collecting Flask==1.1.1
  Downloading Flask-1.1.1-py2.py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 238 kB/s 
Collecting flask_cors==3.0.8
  Downloading Flask_Cors-3.0.8-py2.py3-none-any.whl (14 kB)
Collecting joblib==0.13.2
  Downloading joblib-0.13.2-py2.py3-none-any.whl (278 kB)
[K     |████████████████████████████████| 278 kB 800 kB/s 
[?25hCollecting markdown==3.1.1
  Downloading Markdown-3.1.1-py2.py3-none-any.whl (87 kB)
[K     |████████████████████████████████| 87 kB 1.0 MB/s 
Collecting pandas==0.25.0
  Downloading pandas-0.25.0-cp37-cp37m-manylinux1_x86_64.whl (10.4 MB)
[K     |████████████████████████████████| 10.4 MB 2.4 MB/s 
Collecting scikit_learn==0.21.2
  Downloading scikit_learn-0.21.2-cp37-cp37m-manylinux1_x86_64.whl (6.7 MB)
[K     |████████████████████████████████| 6.7 MB 2.5 MB/s 
Collecting tika==1

## cdQA: Closed Domain Question Answering!

An End-To-End Closed Domain Question Answering System. Built on top of the HuggingFace transformers library.(though it not maintained now, they have shiften development to [Haystack](https://github.com/deepset-ai/haystack). 

In [2]:
# importing the necessary libraries
import os
import pandas as pd
from ast import literal_eval
import urllib.request

from cdqa.utils.converters import pdf_converter
from cdqa.utils.filters import filter_paragraphs
from cdqa.pipeline import QAPipeline
from cdqa.utils.download import download_model
download_model(model='bert-squad_1.1', dir='./models')   # distilbert-squad model can also be downloaded as per requirement.




Downloading trained model...


### Downloading Tourist Guide files from different URLs

In [3]:
urllib.request.urlretrieve('https://www.traveliteindia.com/uimages/downloads/45.pdf', "1.pdf")
urllib.request.urlretrieve('https://www.approachguides.com/ebook-previews/Preview-ApproachGuides-India-Delhi-Agra-Architecture.pdf', "2.pdf")
urllib.request.urlretrieve('http://ficci.in/Sector/Report/20294/Unexplored-tourism-destinations-of-India.pdf', "3.pdf")
urllib.request.urlretrieve('https://pubs.iied.org/sites/default/files/pdfs/migrate/17646IIED.pdf')


('/tmp/tmpe2zancv3', <http.client.HTTPMessage at 0x7f19a829a590>)

### **pdf_converter** can create a cdqa dataframe from a directory containing .pdf files

In [4]:
df = pdf_converter(directory_path='./')

2021-01-27 14:48:20,668 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar to /tmp/tika-server.jar.
2021-01-27 14:48:24,885 [MainThread  ] [INFO ]  Retrieving http://search.maven.org/remotecontent?filepath=org/apache/tika/tika-server/1.19/tika-server-1.19.jar.md5 to /tmp/tika-server.jar.md5.
2021-01-27 14:48:25,976 [MainThread  ] [WARNI]  Failed to see startup log message; retrying...


### Fitting the pipeline on corpus using the pre-trained reader

In [5]:
cdqa_pipeline = QAPipeline(reader='./models/bert_qa.joblib', max_df=1.0, )
cdqa_pipeline.fit_retriever(df=df)

100%|██████████| 231508/231508 [00:00<00:00, 893024.43B/s]


QAPipeline(reader=BertQA(adam_epsilon=1e-08, bert_model='bert-base-uncased',
                         do_lower_case=True, fp16=False,
                         gradient_accumulation_steps=1, learning_rate=5e-05,
                         local_rank=-1, loss_scale=0, max_answer_length=30,
                         n_best_size=20, no_cuda=False,
                         null_score_diff_threshold=0.0, num_train_epochs=3.0,
                         output_dir=None, predict_batch_size=8, seed=42,
                         server_ip='', server_po..._size=8,
                         verbose_logging=False, version_2_with_negative=False,
                         warmup_proportion=0.1, warmup_steps=0),
           retrieve_by_doc=False,
           retriever=BM25Retriever(b=0.75, floor=None, k1=2.0, lowercase=True,
                                   max_df=1.0, min_df=2, ngram_range=(1, 2),
                                   preprocessor=None, stop_words='english',
                                   t

## Predications/QA!!

In [6]:
query = 'when Jama Masjid was built?'
prediction = cdqa_pipeline.predict(query)
print('query: {}'.format(query))
print('answer: {}'.format(prediction[0]))
print('title: {}'.format(prediction[1]))
print('paragraph: {}'.format(prediction[2]))

query: when Jama Masjid was built?
answer: 1650-1656
title: 2
paragraph: Overview• Built: 1650-1656.• Ruler: Mughal ruler Shah Jahan (ruled 1628-1658).• Location: Chandni Chowk neighborhood in New Delhi, near the Red Fort. See in Google 


In [7]:
query = 'Rajasthan is famous forts?'
prediction = cdqa_pipeline.predict(query)
print('query: {}'.format(query))
print('answer: {}'.format(prediction[0]))
print('title: {}'.format(prediction[1]))
print('paragraph: {}'.format(prediction[2]))

query: Rajasthan is famous forts?
answer: Rajasthan houses the largest number of forts and palaces in the world
title: 1
paragraph: Rajasthan houses the largest number of forts and palaces in the world. Today, some of these architectural marvels have been converted into heritage hotels, allowing you the luxury of a palatial stay in India. 


In [8]:
query = 'Where is Hampi?'
prediction = cdqa_pipeline.predict(query)
print('query: {}'.format(query))
print('answer: {}'.format(prediction[0]))
print('title: {}'.format(prediction[1]))
print('paragraph: {}'.format(prediction[2]))

query: Where is Hampi?
answer: Karnataka
title: 3
paragraph: 1.7 Hampi (Karnataka) ...................................................................19
