<a href="https://colab.research.google.com/github/Aggregate-Intellect/xir/blob/main/haystack_wiki.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installation

In [None]:
!pip install beir
!pip install tensorflow-text
!pip install farm-haystack
!pip install --upgrade pip
!pip install git+https://github.com/deepset-ai/haystack.git

In [2]:
import logging
import pathlib, os
import random

from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers
from haystack.nodes import FARMReader, TransformersReader

# Dataset

dataset: 

https://huggingface.co/vblagoje

https://huggingface.co/datasets/vblagoje/wikipedia_snippets_streamed

In [3]:
from datasets import load_dataset

# Open the 'train' split of the Wikipedia snippets dataset in streaming mode.
wiki_data = load_dataset('vblagoje/wikipedia_snippets_streamed', split='train', streaming=True)

# Pull a single record to inspect the available fields.
next(iter(wiki_data))

Downloading builder script:   0%|          | 0.00/4.58k [00:00<?, ?B/s]



{'article_title': "St John the Baptist's Church, Atherton",
 'end_character': 511,
 'end_paragraph': 6,
 'passage_text': "St John the Baptist's Church, Atherton History There have been three chapels or churches on the site of St John the Baptist parish church. The first chapel at Chowbent was built in 1645 by John Atherton as a chapel of ease of Leigh Parish Church. It was sometimes referred to as the Old Bent Chapel. It was not consecrated and used by the Presbyterians as well as the Vicar of Leigh. In 1721 Lord of the manor Richard Atherton expelled the dissenters who subsequently built Chowbent Chapel. The first chapel was consecrated in 1723 by the Bishop of Sodor and",
 'section_title': 'History',
 'start_character': 0,
 'start_paragraph': 2,
 'wiki_id': 'Q7593707'}

In [4]:
# Keep only passages whose section title begins with 'History'
# (a prefix match, so titles such as 'History of ...' are kept as well).
history = wiki_data.filter(lambda record: record['section_title'].startswith('History'))

# FAISS Document Store

In [5]:
from typing import List
import requests
import pandas as pd
from haystack import Document
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import RAGenerator, DensePassageRetriever
from haystack.utils import fetch_archive_from_http

from haystack.document_stores import FAISSDocumentStore

# FAISS-backed store built from the "Flat" index factory string; documents are
# returned together with their embeddings (return_embedding=True).
document_store = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
    return_embedding=True,
)


INFO - haystack.telemetry -  Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://haystack.deepset.ai/guides/telemetry


In [None]:
# Start from an empty store so re-running this cell does not pile up documents.
document_store.delete_documents()

from haystack import Document
from tqdm.auto import tqdm  # progress bar

total_doc_count = 50000  # upper bound on passages pulled from the stream
batch_size = 10000       # number of documents per write_documents() call

counter = 0
docs = []
for d in tqdm(history, total=total_doc_count):
    # Wrap each passage in a haystack Document carrying its text and metadata.
    doc = Document(
        content=d["passage_text"],
        meta={
            "article_title": d["article_title"],
            'section_title': d['section_title']
        }
    )
    docs.append(doc)
    counter += 1
    if counter % batch_size == 0:
        # Write a full batch and release it before collecting the next one.
        document_store.write_documents(docs)
        docs.clear()
    if counter == total_doc_count:
        break

# Flush the trailing partial batch. Without this, documents are silently lost
# whenever the stream ends before total_doc_count is reached or when
# total_doc_count is not a multiple of batch_size.
if docs:
    document_store.write_documents(docs)
    docs.clear()


In [7]:
# Number of documents currently held in the store after indexing.
# NOTE(review): the recorded output (49995) is below total_doc_count (50000);
# presumably the store collapsed duplicate passages — TODO confirm.
document_store.get_document_count()

49995

# Retriever

*   BaseGraphRetriever(BaseComponent) ！！！
*   BaseRetriever(BaseComponent)
*   BM25Retriever(BaseRetriever)
*   FilterRetriever(BM25Retriever)
*   TfidfRetriever(BaseRetriever)
*   DensePassageRetriever(BaseRetriever)
*   TableTextRetriever(BaseRetriever)
*   EmbeddingRetriever(BaseRetriever)
*   Text2SparqlRetriever(BaseGraphRetriever)！！！

See documentation at: https://github.com/deepset-ai/haystack/blob/master/docs/_src/api/api/retriever.md




model: https://huggingface.co/flax-sentence-embeddings/all_datasets_v3_mpnet-base

In [None]:
# Import from haystack.nodes, the same package the other nodes in this notebook
# (FARMReader, DensePassageRetriever, ...) come from, instead of the legacy
# haystack.retriever.dense module path.
from haystack.nodes import EmbeddingRetriever

# Embedding-based retriever bound to the FAISS store; the model is loaded in
# the sentence-transformers format.
retriever = EmbeddingRetriever(
    document_store=document_store,
    embedding_model="flax-sentence-embeddings/all_datasets_v3_mpnet-base",
    model_format="sentence_transformers",
)

# Embeddings

In [None]:
# Compute embeddings for the stored documents using the retriever's model.
# NOTE(review): batch_size=128 presumably sets how many documents are embedded
# per step — confirm against the document store API.
document_store.update_embeddings(retriever, batch_size=128)

In [10]:
# Number of embeddings held in the store; the recorded output (49995) equals
# the document count reported earlier in the notebook.
document_store.get_embedding_count()

49995

# Generator

https://huggingface.co/vblagoje/bart_lfqa

In [None]:
# Import from haystack.nodes, consistent with the other node imports in this
# notebook, instead of the legacy haystack.generator.transformers module path.
from haystack.nodes import Seq2SeqGenerator

# Sequence-to-sequence generator loading the vblagoje/bart_lfqa checkpoint.
generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")

# Reader

In [None]:
# Extractive reader: a FARMReader loading the deepset/roberta-base-squad2
# checkpoint, with GPU use requested.
# NOTE(review): behaviour on a CPU-only runtime is not shown here — confirm.
reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)

# Pipeline

In [18]:
from haystack.pipelines import ExtractiveQAPipeline, GenerativeQAPipeline

# Generative QA: the retriever feeds the seq2seq generator.
generator_pipe = GenerativeQAPipeline(generator, retriever)

# Extractive QA: the same retriever feeds the FARM reader.
reader_pipe = ExtractiveQAPipeline(reader, retriever)


# Answer of generator

In [20]:
# Ask the generative pipeline: top 10 retrieved documents, one generated answer.
result = generator_pipe.run(
    query="what was the war of currents?",
    params={"Retriever": {"top_k": 10}, "Generator": {"top_k": 1}},
)

print_answers(result, details="minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: what was the war of currents?
Answers:
[   {   'answer': 'The War of Currents was the rivalry between Thomas Edison '
                  "and George Westinghouse's companies over which form of "
                  'transmission (direct or alternating current) was superior.'}]


# Answer of reader

In [25]:
# Ask the extractive pipeline the same question: top 10 retrieved documents,
# one extracted answer.
result = reader_pipe.run(
    query='what was the war of currents?',
    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 1}},
)

print_answers(result, details="minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 11.51 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 26.61 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 25.88 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 30.71 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 31.84 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 34.90 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 38.14 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 42.25 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 37.89 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 38.06 Batches/s]


Query: what was the war of currents?
Answers:
[   {   'answer': 'War of 1812',
        'context': 'other country into military service. The most notable '
                   'example was the War of 1812, triggered by British '
                   'impressment of American seamen who were allege'}]





# Answer of generator

In [27]:
# Generative pipeline on a second question; only the top 3 retrieved documents
# are passed on here, and one answer is generated.
result = generator_pipe.run(
    query="when was the first electric power system built?",
    params={"Retriever": {"top_k": 3}, "Generator": {"top_k": 1}},
)

print_answers(result, details="minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Query: when was the first electric power system built?
Answers:
[   {   'answer': 'The first electric power system was built in 1881 at '
                  'Godalming in England. It was powered by two waterwheels and '
                  'produced an alternating current that in turn supplied seven '
                  'Siemens arc lamps at 250 volts and 34 incandescent lamps at '
                  '40 volts.'}]


# Answer of reader

In [28]:
# Extractive pipeline on the second question: top 10 retrieved documents,
# one extracted answer.
result = reader_pipe.run(
    query='when was the first electric power system built?',
    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 1}},
)

print_answers(result, details="minimum")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 11.43 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 17.57 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 32.53 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 22.01 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 31.89 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 31.35 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 32.90 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 32.59 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 32.06 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00, 32.43 Batches/s]


Query: when was the first electric power system built?
Answers:
[   {   'answer': '1881',
        'context': 'Electric power system History In 1881, two electricians '
                   "built the world's first power system at Godalming in "
                   'England. It was powered by two waterwheel'}]





In [15]:
# Inspect the raw Answer objects behind the printed summary.
# NOTE(review): this cell's execution count (15) predates the reader cells
# above, and its saved output is a generative answer; on a top-to-bottom run
# `result` would instead hold the most recent reader_pipe result.
result["answers"]

[<Answer {'answer': 'The first electric power system was built in 1881 at Godalming in England. It was powered by two waterwheels and produced an alternating current that in turn supplied seven Siemens arc lamps at 250 volts and 34 incandescent lamps at 40 volts.', 'type': 'generative', 'score': None, 'context': None, 'offsets_in_document': None, 'offsets_in_context': None, 'document_id': None, 'meta': {'doc_ids': ['280ac43032e1e343f9ea19766ff23ef3', 'ad9ea2da202b338d6a1647e3ebf04da1', '3a43249b33b1435e94ef9b22f01989b6'], 'doc_scores': [0.501774066841441, 0.5016578460080546, 0.5015464331073533], 'content': ["Electric power system History In 1881, two electricians built the world's first power system at Godalming in England. It was powered by two waterwheels and produced an alternating current that in turn supplied seven Siemens arc lamps at 250 volts and 34 incandescent lamps at 40 volts. However, supply to the lamps was intermittent and in 1882 Thomas Edison and his company, The Ediso

In [None]:
# Show the first entry of the 'documents' list in the latest pipeline result
# (presumably the top-ranked retrieved document — TODO confirm ordering).
result['documents'][0]