In [61]:
import requests

In [62]:
api = "https://datasets-server.huggingface.co/rows?dataset=squad_v2&config=squad_v2&split=train&offset=0&limit=100"
response = requests.get(api)

In [63]:
train_data = response.json()

In [64]:
train_data

{'features': [{'feature_idx': 0,
   'name': 'id',
   'type': {'dtype': 'string', '_type': 'Value'}},
  {'feature_idx': 1,
   'name': 'title',
   'type': {'dtype': 'string', '_type': 'Value'}},
  {'feature_idx': 2,
   'name': 'context',
   'type': {'dtype': 'string', '_type': 'Value'}},
  {'feature_idx': 3,
   'name': 'question',
   'type': {'dtype': 'string', '_type': 'Value'}},
  {'feature_idx': 4,
   'name': 'answers',
   'type': {'feature': {'text': {'dtype': 'string', '_type': 'Value'},
     'answer_start': {'dtype': 'int32', '_type': 'Value'}},
    '_type': 'Sequence'}}],
 'rows': [{'row_idx': 0,
   'row': {'id': '56be85543aeaaa14008c9063',
    'title': 'Beyoncé',
    'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R

In [65]:
from haystack.nodes import TextConverter, PreProcessor, retriever
from haystack.pipelines import ExtractiveQAPipeline, Pipeline
from haystack.nodes import FARMReader
from haystack.nodes import DensePassageRetriever, BM25Retriever, EmbeddingRetriever
from haystack.document_stores.memory import InMemoryDocumentStore


In [66]:
import json
train_data_json = json.dumps(train_data, indent=4)

In [67]:
with open ("train.json", "w") as output:
    output.write(train_data_json)


In [68]:
converter = TextConverter()
doc = converter.convert(file_path='./train.json', meta=None)
# docs = processor.process(doc)

In [69]:
doc

[<Document: {'content': '{\n    "features": [\n        {\n            "feature_idx": 0,\n            "name": "id",\n            "type": {\n                "dtype": "string",\n                "_type": "Value"\n            }\n        },\n        {\n            "feature_idx": 1,\n            "name": "title",\n            "type": {\n                "dtype": "string",\n                "_type": "Value"\n            }\n        },\n        {\n            "feature_idx": 2,\n            "name": "context",\n            "type": {\n                "dtype": "string",\n                "_type": "Value"\n            }\n        },\n        {\n            "feature_idx": 3,\n            "name": "question",\n            "type": {\n                "dtype": "string",\n                "_type": "Value"\n            }\n        },\n        {\n            "feature_idx": 4,\n            "name": "answers",\n            "type": {\n                "feature": {\n                    "text": {\n                        "

In [70]:
document_store = InMemoryDocumentStore(use_bm25=True)
document_store.write_documents(doc)

Updating BM25 representation...: 100%|██████████| 1/1 [00:00<00:00, 77.13 docs/s]


In [71]:
document_store

<haystack.document_stores.memory.InMemoryDocumentStore at 0x24fc3465400>

In [72]:
preprocessor = PreProcessor(clean_empty_lines=True, clean_whitespace=True, clean_header_footer=True, split_by="word")
preprocessor.run(doc)

Preprocessing: 100%|██████████| 1/1 [00:00<00:00,  8.82docs/s]


({'documents': [<Document: {'content': '{\n"features": [\n{\n"feature_idx": 0,\n"name": "id",\n"type": {\n"dtype": "string",\n"_type": "Value"\n}\n},\n{\n"feature_idx": 1,\n"name": "title",\n"type": {\n"dtype": "string",\n"_type": "Value"\n}\n},\n{\n"feature_idx": 2,\n"name": "context",\n"type": {\n"dtype": "string",\n"_type": "Value"\n}\n},\n{\n"feature_idx": 3,\n"name": "question",\n"type": {\n"dtype": "string",\n"_type": "Value"\n}\n},\n{\n"feature_idx": 4,\n"name": "answers",\n"type": {\n"feature": {\n"text": {\n"dtype": "string",\n"_type": "Value"\n},\n"answer_start": {\n"dtype": "int32",\n"_type": "Value"\n}\n},\n"_type": "Sequence"\n}\n}\n],\n"rows": [\n{\n"row_idx": 0,\n"row": {\n"id": "56be85543aeaaa14008c9063",\n"title": "Beyonc\\u00e9",\n"context": "Beyonc\\u00e9 Giselle Knowles-Carter (/bi\\u02d0\\u02c8j\\u0252nse\\u026a/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in 

In [73]:
from haystack.nodes import BM25Retriever

retriever = BM25Retriever(document_store)

In [74]:
model_name_or_path = "deepset/roberta-base-squad2"
reader = FARMReader(model_name_or_path, use_gpu=True)

In [75]:
pipeline = ExtractiveQAPipeline(reader, retriever)

In [81]:
# Perform semantic search
query = "when did beyonce become popular?"

prediction = pipeline.run(query=query, params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 1}})
answers = prediction['answers']



Inferencing Samples: 100%|██████████| 10/10 [3:22:56<00:00, 1217.61s/ Batches] 


In [82]:
answers

[<Answer {'answer': '2003', 'type': 'extractive', 'score': 0.8382994532585144, 'context': "atus saw the release of Beyonc\\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy A", 'offsets_in_document': [{'start': 19855, 'end': 19859}], 'offsets_in_context': [{'start': 73, 'end': 77}], 'document_ids': ['d2258362377ee76229a984945e23f61c'], 'meta': {}}>]