In [None]:
# Install the latest master of Haystack.
# grpcio-tools is pinned to 1.34.1 and installed before Haystack itself.
# NOTE(review): the Haystack install below is not pinned to a tag or commit,
# so the `haystack.*` import paths used later in this notebook may stop
# resolving on newer revisions - consider pinning.
!pip install grpcio-tools==1.34.1
!pip install git+https://github.com/deepset-ai/haystack.git


In [None]:
# Pin urllib3 to version 1.24.3.
# NOTE(review): the reason for this pin is not recorded here - presumably a
# dependency conflict; confirm it is still required.
!pip install urllib3==1.24.3

In [None]:
from haystack.preprocessor.cleaning import clean_wiki_text
from haystack.preprocessor.utils import convert_files_to_dicts, fetch_archive_from_http
from haystack.reader.farm import FARMReader
from haystack.reader.transformers import TransformersReader
from haystack.utils import print_answers

In [None]:
# In Colab / No Docker environments: Start Elasticsearch from source
# Download and unpack the Elasticsearch 7.9.2 Linux tarball, then hand the
# extracted tree over to the `daemon` user, which is the user the server is
# launched as below.
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT
# Launch Elasticsearch as a background child process; stderr is merged into
# stdout, and the child switches to uid 1 before exec.
# NOTE(review): stdout is a PIPE that no visible cell reads; if the server
# logs enough to fill the pipe buffer it could stall - consider redirecting
# the output to a file or DEVNULL.
es_server = Popen(['elasticsearch-7.9.2/bin/elasticsearch'],
                   stdout=PIPE, stderr=STDOUT,
                   preexec_fn=lambda: os.setuid(1)  # as daemon
                  )
# wait until ES has started
# NOTE(review): this is a fixed 30 second delay, not a readiness check; the
# connection cell may still fail on a slow machine.
! sleep 30

In [None]:
# Connect to Elasticsearch
# The store targets the Elasticsearch instance on localhost with empty
# credentials and uses the index named "document".

from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")

In [None]:
# Convert the files in the local `data` directory into document dicts,
# passing clean_wiki_text as the cleaning function and split_paragraphs=True.
# NOTE(review): nothing in this cell downloads data (fetch_archive_from_http
# is imported above but not called here), so `data` must already exist.
doc_dir = "data"

dicts = convert_files_to_dicts(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True)

# Show the first three converted entries as a quick sanity check.
print(dicts[:3])

# Now, let's write the dicts containing documents to our DB.
document_store.write_documents(dicts)

In [None]:
# Build the retriever on top of the Elasticsearch document store created above.
from haystack.retriever.sparse import ElasticsearchRetriever
retriever = ElasticsearchRetriever(document_store=document_store)

In [None]:
# Load the reader from a local model or a QA model referenced by name
# (here "deepset/roberta-base-squad2" - presumably resolved from the
# Hugging Face model hub; confirm).
# NOTE(review): use_gpu=True is hardcoded; confirm the behavior on CPU-only
# runtimes.

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

In [None]:
# Combine the reader and the retriever into a single extractive QA pipeline.
from haystack.pipeline import ExtractiveQAPipeline
pipe = ExtractiveQAPipeline(reader, retriever)

In [None]:
# You can configure how many candidates the reader and retriever shall return.
# The higher top_k_retriever, the better (but also the slower) your answers.
# This run uses top_k_retriever=50 and top_k_reader=4.
prediction = pipe.run(query="What is Rec Room?", top_k_retriever=50, top_k_reader=4)

In [None]:
# Print the answers held in `prediction` at the "minimal" detail level.
print_answers(prediction, details="minimal")

In [None]:
import json

# Show the raw prediction, then persist it as JSON in data2.json so that the
# next cell can reload it.
print(prediction)
jsonString = json.dumps(prediction)
# The context manager guarantees the file is closed even if the write fails.
with open("data2.json", "w") as jsonFile:
    jsonFile.write(jsonString)


In [None]:
import json

# Reload the stored prediction and concatenate the context passage of every
# answer, one passage per line, into `text`.
with open('data2.json') as file:
    data = json.load(file)

context_lines = []
for pred in data['answers']:
    context_lines.append(pred['context'] + '\n')
text = "".join(context_lines)

print(text)

In [None]:
import os
from getpass import getpass

import requests

# TagMe API token. Credentials must not be hardcoded in the notebook: the
# token is taken from the TAGME_TOKEN environment variable and, when that is
# unset or empty, requested interactively without echoing it.
# NOTE(review): any token that was ever committed in this notebook should be
# treated as leaked and revoked.
token = os.environ.get('TAGME_TOKEN') or getpass('TagMe gcube-token: ')
# Endpoint prefix; the token is appended directly after `gcube-token=`.
url_endpoint = 'https://tagme.d4science.org/tagme/tag?lang=en&include_abstract=true&include_categories=true&gcube-token='
headers = {'user-agent': 'Mozilla/5.0', 'accept': 'application/json', 'content-type': 'application/json'}
# URI prefixes used to turn names into DBpedia resource / category URIs.
dbr = 'http://dbpedia.org/resource/'
dbc = 'http://dbpedia.org/resource/Category:'

In [None]:
def getAnnotations(text, timeout=30):
    """Annotate ``text`` with TagMe and return DBpedia category rows.

    The text is sent to the endpoint formed by ``url_endpoint + token``. For
    every returned annotation whose ``rho`` and ``link_probability`` are both
    greater than 0.1, one row is produced per DBpedia category attached to
    that annotation.

    Args:
        text: The text to annotate.
        timeout: Seconds to wait for the service before giving up.

    Returns:
        A list of ``[resource_uri, rho, link_probability, category_uri]``
        lists. It is empty when the response has no annotations or none of
        them passes the thresholds.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    # Pass the text through `params` so requests URL-encodes it; concatenating
    # it raw would corrupt the query whenever the text contains characters
    # such as '&', '#', '+' or '='.
    response = requests.get(url_endpoint + token, params={'text': text},
                            headers=headers, timeout=timeout)
    # Fail loudly on HTTP errors (e.g. a rejected token) instead of crashing
    # later on a missing key.
    response.raise_for_status()
    annotations = response.json().get('annotations', [])

    dbCategories = []

    for annotation in annotations:
        rho = annotation['rho']
        link_probability = annotation['link_probability']
        if rho > 0.1 and link_probability > 0.1:
            # Store as DBpedia resources.
            # NOTE(review): the resource URI is built from the matched surface
            # text ('spot'); confirm this is intended rather than the linked
            # entity's page title.
            resource = dbr + annotation['spot'].replace(' ', '_')
            # Annotations without categories simply contribute no rows.
            for category in annotation.get('dbpedia_categories', []):
                dbCategories.append([resource, rho, link_probability,
                                     dbc + category.replace(' ', '_')])
    return dbCategories

In [None]:
# Annotate the concatenated answer contexts held in `text` and keep the
# resulting rows returned by getAnnotations.
dbCategories = getAnnotations(text)

In [None]:
# Print every collected row, each preceded by a 40-dash separator line.
separator = '-' * 40
for category_row in dbCategories:
    print(separator)
    print(category_row)  # DBpedia categories