In [None]:
# Print the NVIDIA driver/GPU status for this runtime (shell escape).
!nvidia-smi

In [None]:
import os
from getpass import getpass

# Never paste the key into the notebook: it would be saved with the file.
# Prefer the OPENAI_API_KEY environment variable; otherwise prompt without echo.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
if not OPENAI_KEY:
    raise ValueError("An OpenAI API key is required (set OPENAI_API_KEY or enter it at the prompt).")

In [None]:
%%bash
# Download the sample corpus, install Haystack and unpack a local Elasticsearch.
# Stop at the first failing command so a broken download/install is not silently ignored.
set -euo pipefail

# -p: do not fail when the cell is re-run and the directory already exists.
mkdir -p data
data_url=https://gist.githubusercontent.com/yujong-lee/18c53f033e80df8b56321b9a4764b332/raw/3cb7be84e1d6d18354183931902781041aebb50e/transformer.txt
# -f: fail on HTTP errors instead of saving the error page as the corpus; -L: follow redirects.
curl -fsSL "${data_url}" -o data/transformer.txt

pip install --upgrade pip
# Quoted so the shell never treats "[colab]" as a glob pattern.
# NOTE(review): this installs the repository's default branch unpinned; pin a tag
# compatible with the `farm-haystack` API used below to keep the notebook reproducible.
pip install "git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab]"

# Fetch and unpack Elasticsearch 7.9.2, then hand the tree to the `daemon` user
# that the server is later started as.
wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz
tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
chown -R daemon:daemon elasticsearch-7.9.2

In [3]:
%%bash --bg

# Start the unpacked Elasticsearch server as the `daemon` user (the install tree
# was chown'ed to it during setup). `--bg` runs this script in the background so
# the notebook can continue while the server stays up.
sudo -u daemon -- elasticsearch-7.9.2/bin/elasticsearch

In [4]:
import time
import urllib.request

# Elasticsearch is started in the background and needs a variable amount of time
# to come up. Poll its HTTP port instead of sleeping a fixed 30 s, which is too
# short on slow runtimes and wasted time on fast ones.
ES_URL = "http://localhost:9200"
ES_STARTUP_TIMEOUT = 120  # seconds to wait before giving up
ES_POLL_INTERVAL = 2      # seconds between connection attempts

deadline = time.monotonic() + ES_STARTUP_TIMEOUT
while True:
    try:
        with urllib.request.urlopen(ES_URL, timeout=5):
            break  # the server answered: it is ready
    except OSError:
        # Connection refused / HTTP error / timeout: the node is not ready yet.
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"Elasticsearch did not respond at {ES_URL} within {ES_STARTUP_TIMEOUT} s"
            ) from None
        time.sleep(ES_POLL_INTERVAL)

In [None]:
# Sanity check: request the root endpoint of the local Elasticsearch node and
# print the pretty-formatted response.
!curl -X GET "localhost:9200/?pretty" # Check Elasticsearch

In [6]:
import os
from haystack.document_stores import ElasticsearchDocumentStore

# Elasticsearch host: ELASTICSEARCH_HOST when set, otherwise the local node.
host = os.getenv("ELASTICSEARCH_HOST", "localhost")

# Connection and index settings gathered in one place.
store_settings = dict(
    host=host,
    username="",
    password="",
    recreate_index=True,          # start from a fresh index on every run
    index="document",
    embedding_field="emb",
    # presumably the output size of the retriever's embedding model - TODO confirm
    embedding_dim=768,
    excluded_meta_data=["emb"],
    similarity="dot_product",
)

document_store = ElasticsearchDocumentStore(**store_settings)

In [None]:
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http
from haystack.nodes import PreProcessor

# Convert the files under data/ into documents, cleaning them with clean_wiki_text.
doc = convert_files_to_docs(
    dir_path="data",
    clean_func=clean_wiki_text,
    split_paragraphs=True,
)

# Splitting configuration: 200-word passages overlapping by 50 words.
splitter_options = {
    "clean_empty_lines": True,
    "clean_whitespace": True,
    "clean_header_footer": False,
    "split_by": "word",
    "split_length": 200,
    "split_overlap": 50,
    "split_respect_sentence_boundary": True,
    "language": "en",
}
preprocessor = PreProcessor(**splitter_options)
docs = preprocessor.process(doc)

# Report how many documents went in and how many passages came out.
print(f"{len(doc)} {len(docs)}")
document_store.write_documents(docs)

In [None]:
from haystack.nodes import EmbeddingRetriever

# Sentence-transformers model used to embed documents and queries.
EMBEDDING_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"

retriever = EmbeddingRetriever(
    model_format="sentence_transformers",
    embedding_model=EMBEDDING_MODEL,
    document_store=document_store,
)

In [None]:
# Have the document store (re)compute embeddings for its documents using the retriever.
document_store.update_embeddings(retriever)

In [None]:
from haystack.nodes import OpenAIAnswerGenerator

# Few-shot [question, answer] pairs handed to the generator as examples.
# The answers were written about the T5 paper.
examples = [
    [
        "What did authors try to accomplish? Describe with rich examples.",
        "The authors of this paper sought to explore the potential of transfer learning in natural language processing (NLP) by introducing a unified framework that converts all text-based language problems into a text-to-text format. They compared pre-training objectives, architectures, unlabeled data sets, transfer approaches, and other factors on dozens of language understanding tasks and achieved state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more.",
    ],
    [
        "What were the key elements of the approach? Describe the mathematics behind the key elements as well with rich examples.",
        "The key elements of the approach included the use of pre-training objectives, architectures, unlabeled datasets, and transfer approaches. The mathematics behind these elements involved the use of supervised and unsupervised learning methods to create models that could learn from labeled and unlabeled data. Additionally, the authors employed transfer learning techniques, such as fine-tuning pre-trained models, to further improve the accuracy of the models.",
    ],
    [
        "In what ways the approach was limited by? Describe with rich examples.",
        "The approach was limited by the amount of data available for pre-training and fine-tuning, as well as the difficulty of finding datasets with sufficient diversity to accurately model language understanding tasks. Additionally, the authors noted that transfer learning is not always effective for tasks that are more complex or require more nuanced understanding, such as sentiment analysis, text generation, and natural language inference.",
    ],
    [
        "How could you use it for computer-assisted language learning? Describe with rich examples.",
        "The approach could be used for computer-assisted language learning by pre-training a model on a language corpus or dataset and then fine-tuning the model on a language-specific task. This would allow the model to learn the language’s grammar and syntax and provide it with a better understanding of the language. Additionally, the model could be used to identify errors in student-generated text, providing feedback to the students to help them improve their language skills.",
    ],
    [
        "What other references should you follow?",
        "Other references that should be followed include the original paper by Devlin et al. (2018) on BERT, the paper by Radford et al. (2019) on GPT-2, and the paper by Conneau et al. (2019) on XLM. Additionally, the papers by Howard and Ruder (2018) and Peters et al. (2019) on ELMO and ULMFiT, respectively, can provide further insight into the use of transfer learning for NLP.",
    ],
]

# Sampling and length settings for the OpenAI completion call.
generator_settings = {
    "api_key": OPENAI_KEY,
    "model": "text-davinci-003",
    "max_tokens": 300,
    "top_k": 1,
    "temperature": 0.7,
    "frequency_penalty": 0.3,
    "examples": examples,
}
generator = OpenAIAnswerGenerator(**generator_settings)

In [11]:
from haystack.pipelines import GenerativeQAPipeline

# Combine the embedding retriever and the OpenAI generator into one QA pipeline.
qa = GenerativeQAPipeline(generator=generator, retriever=retriever)

In [None]:
from haystack.utils import print_answers

# Questions to ask about the indexed paper.
questions = [
    "What did authors try to accomplish? Describe with rich examples.",
    "What were the key elements of the approach? Describe the mathematics behind the key elements as well with rich examples.",
    "In what ways the approach was limited by? Describe with rich examples.",
    "How could you use it for computer-assisted language learning? Describe with rich examples.",
    "What other references should you follow? ",
]

# Run each question through the pipeline: the retriever is asked for 3 results,
# the generator for 1, then the question and its answer are printed.
for question in questions:
    res = qa.run(
        query=question,
        params={"Retriever": {"top_k": 3}, "Generator": {"top_k": 1}},
    )
    print(question)
    print_answers(res, details="minimum")
