In [None]:
# Mount Google Drive (forcing a fresh mount) so the Drive paths used below resolve.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive/


## Setting Environment

In [None]:
%%bash

# Upgrade pip first so the pyproject-based build of Haystack is handled by a recent installer.
pip install --upgrade pip

# Install Haystack from source.
# - The "#egg=farm-haystack[colab]" fragment must be part of the URL: on its own
#   line bash treats it as a comment and the [colab] extra is never requested.
# - The URL is quoted so the shell cannot glob-expand "[colab]".
# - Pinned to the commit this notebook was last run against, so a fresh run
#   resolves the same code instead of whatever the default branch currently is.
pip install "git+https://github.com/deepset-ai/haystack.git@2b803a265b6e66a46ec7e6688a2fd2b7b3114d77#egg=farm-haystack[colab]"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pip
  Downloading pip-22.2.2-py3-none-any.whl (2.0 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 21.1.3
    Uninstalling pip-21.1.3:
      Successfully uninstalled pip-21.1.3
Successfully installed pip-22.2.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/deepset-ai/haystack.git
  Cloning https://github.com/deepset-ai/haystack.git to /tmp/pip-req-build-iufx5uqh
  Resolved https://github.com/deepset-ai/haystack.git to commit 2b803a265b6e66a46ec7e6688a2fd2b7b3114d77
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing me

  Running command git clone --filter=blob:none --quiet https://github.com/deepset-ai/haystack.git /tmp/pip-req-build-iufx5uqh


In [None]:
# Imports needed to run this notebook

from pprint import pprint
from tqdm import tqdm
from haystack.nodes import QuestionGenerator, BM25Retriever, FARMReader
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.pipelines import (
    QuestionGenerationPipeline,
    RetrieverQuestionGenerationPipeline,
    QuestionAnswerGenerationPipeline,
)
from haystack.utils import launch_es, print_questions

In [None]:
# Option 2: In Colab / No Docker environments: Start Elasticsearch from source
! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q
! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz
! chown -R daemon:daemon elasticsearch-7.9.2

import os
from subprocess import Popen, PIPE, STDOUT

es_server = Popen(
    ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1)  # as daemon
)
# wait until ES has started
! sleep 30

In [None]:
import logging

# Root logger stays at WARNING; only the "haystack" logger is raised to INFO.
LOG_FORMAT = "%(levelname)s - %(name)s -  %(message)s"
logging.basicConfig(level=logging.WARNING, format=LOG_FORMAT)

haystack_logger = logging.getLogger("haystack")
haystack_logger.setLevel(logging.INFO)

## Reading the file

In [None]:
path = '/content/drive/MyDrive/nlp/data_clean.txt'

# Read the whole corpus into one string. The context manager closes the file
# handle deterministically (the previous bare open(...).read() left it to the GC).
# Undecodable bytes are dropped because of errors='ignore'.
with open(path, 'r', errors='ignore') as text_file:
    data = text_file.read()

# Show only a preview; echoing the full corpus bloats the notebook output.
data[:1000]

'There are three major types of rockigneous, sedimentary, and metamorphic. The rock cycle is an important concept in geology which illustrates the relationships between these three types of rock, and magma. When a rock crystallizes from melt magma andor lava, it is an igneous rock. This rock can be weathered and eroded, and then redeposited and lithified into a sedimentary rock, or be turned into a metamorphic rock due to heat and pressure that change the mineral content of the rock which gives it a characteristic fabric. The sedimentary rock can then be subsequently turned into a metamorphic rock due to heat and pressure and is then weathered, eroded, deposited, and lithified, ultimately becoming a sedimentary rock. Sedimentary rock may also be reeroded and redeposited, and metamorphic rock may also undergo additional metamorphism. All three types of rocks may be remelted when this happens, a new magma is formed, from which an igneous rock may once again crystallize. In the s, a serie

### Loading the data to the model

In [None]:
# The whole corpus is wrapped as a single document dict keyed by "content".
docs = [dict(content=data)]

In [None]:
# Create the Elasticsearch-backed store with its default settings,
# then index the prepared documents into it.
document_store = ElasticsearchDocumentStore()
document_store.write_documents(documents=docs)

INFO:haystack.telemetry:Haystack sends anonymous usage data to understand the actual usage and steer dev efforts towards features that are most meaningful to users. You can opt-out at anytime by calling disable_telemetry() or by manually setting the environment variable HAYSTACK_TELEMETRY_ENABLED as described for different operating systems on the documentation page. More information at https://haystack.deepset.ai/guides/telemetry


In [None]:
import pandas as pd 
import numpy as np

#### Questions from the previous day's Question Generator

In [None]:
# Load the question/answer pairs from the CSV on Drive.
QG_CSV_PATH = '/content/drive/MyDrive/nlp/QG.csv'
qs = pd.read_csv(QG_CSV_PATH)
qs

Unnamed: 0.1,Unnamed: 0,question,answer
0,0,What is the principle of inclusions and components?,"The principle of inclusions and components states that, with sedimentary roc..."
1,1,What is the relationship between the movement of the plates on the surface a...,There is an intimate coupling between the movement of the plates on the surf...
2,2,What is the meaning of convergent boundaries?,"Arcs of volcanoes and earthquakes were explained as convergent boundaries, w..."
3,3,What are the long linear regions of geologic features?,Long linear regions of geologic features could be explained as plate boundar...
4,4,What is the principle of faunal succession?,The principle of faunal succession is based on the appearance of fossils in ...
5,5,What is the difference between xenoliths and sedimentary rocks?,"As a result, xenoliths are older than the rock which contains them."
6,6,What is the role of plate tectonics in the evolution of the Earth?,Plate tectonics also provided a mechanism for Alfred Wegeners theory of cont...
7,7,What is the most important reason for expanding the fourth scale?,The Holocene the latest epoch is too small to be shown clearly on the third ...
8,8,What is the power of the theory of plate tectonics?,The power of the theory of plate tectonics lies in its ability to combine al...
9,9,What is the meaning of succession?,Based on principles laid out by William Smith almost a hundred years before ...


## Retriever and reader

In [None]:
# TF-IDF retriever over the documents held in `document_store`.
from haystack.nodes import TfidfRetriever

retriever = TfidfRetriever(
    document_store=document_store,
)

INFO:haystack.nodes.retriever.sparse:Found 1 candidate paragraphs from 1 docs in DB


In [None]:
from haystack.nodes import FARMReader

# `model_name_or_path` accepts a local model directory or any QA model id
# from the Hugging Face model hub (https://huggingface.co/models).
READER_MODEL = "deepset/roberta-base-squad2"
reader = FARMReader(
    model_name_or_path=READER_MODEL,
    use_gpu=True,
)

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1


Downloading config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

INFO:haystack.modeling.model.language_model: * LOADING MODEL: 'deepset/roberta-base-squad2' (Roberta)


Downloading pytorch_model.bin:   0%|          | 0.00/473M [00:00<?, ?B/s]

INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.modeling.model.language_model:Loaded 'deepset/roberta-base-squad2' (Roberta model) from model hub.


Downloading tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
INFO:haystack.modeling.infer:Got ya 2 parallel workers to do inference ...
INFO:haystack.modeling.infer: 0     0  
INFO:haystack.modeling.infer:/w\   /w\ 
INFO:haystack.modeling.infer:/'\   / \ 


## Pipeline

In [None]:
from haystack.pipelines import ExtractiveQAPipeline

# Combine the retriever and the reader into one extractive-QA pipeline.
pipe = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [None]:
# Questions as a NumPy array, in the same order as the rows of `qs`.
ques = qs.loc[:, 'question'].to_numpy()

In [None]:
# Smoke test on the first question: keep one retrieved document and one answer.
top1_params = {"Retriever": {"top_k": 1}, "Reader": {"top_k": 1}}
prediction = pipe.run(query=ques[0], params=top1_params)

first_answer = prediction['answers'][0]
first_answer.context

Inferencing Samples: 100%|██████████| 1/1 [00:03<00:00,  3.06s/ Batches]


' or clasts are found in a formation, then the inclusions must be older than the formation that contains them. For example, in sedimentary rocks, it is'

In [None]:
# ...or use a util to simplify the output
from haystack.utils import print_answers

# Run every generated question through the pipeline and collect one record per
# question. The DataFrame is built once at the end: DataFrame.append inside a
# loop copies the whole frame on every iteration, is deprecated since
# pandas 1.4 and was removed in pandas 2.0.
records = []
for quer in ques:
  prediction = pipe.run(query=quer, params={"Retriever": {"top_k": 1}, "Reader": {"top_k": 1}})
  answers = prediction['answers']
  # Guard against an empty answer list so one unanswerable question
  # does not abort the whole loop with an IndexError.
  top_answer = answers[0] if answers else None
  records.append({
    'Query': quer,
    # The context of the prediction
    'Context': top_answer.context if top_answer is not None else None,
    # The answer of the prediction
    'Haystack Answer': top_answer.answer if top_answer is not None else None,
  })

# Explicit columns keep the schema stable even when `ques` is empty.
preds = pd.DataFrame(records, columns=['Query', 'Context', 'Haystack Answer'])

preds

Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  5.28 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  5.97 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.20 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.29 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.10 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.05 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.31 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.24 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.18 Batches/s]
Inferencing Samples: 100%|██████████| 1/1 [00:00<00:00,  7.43 Batches/s]


Unnamed: 0,Query,Context,Haystack Answer
0,What is the principle of inclusions and components?,"or clasts are found in a formation, then the inclusions must be older than ...",the inclusions must be older than the formation that contains them
1,What is the relationship between the movement of the plates on the surface a...,"olid, upper mantle, which is called the asthenosphere. There is an intimate ...",intimate coupling
2,What is the meaning of convergent boundaries?,"earthquakes were explained as convergent boundaries, where one plate subduct...",one plate subducts under another
3,What are the long linear regions of geologic features?,th. Long linear regions of geologic features could be explained as plate bou...,plate boundaries
4,What is the principle of faunal succession?,ins them. The principle of faunal succession is based on the appearance of f...,based on the appearance of fossils in sedimentary rocks
5,What is the difference between xenoliths and sedimentary rocks?,"later to cool in the matrix. As a result, xenoliths are older than the rock ...",older than the rock which contains them
6,What is the role of plate tectonics in the evolution of the Earth?,f the convecting mantle. This coupling between rigid plates moving on the su...,coupling between rigid plates moving on the surface of the Earth and the con...
7,What is the most important reason for expanding the fourth scale?,ecent era is expanded in the third scale. Since the Quaternary is a very sho...,the Quaternary is a very short period with short epochs
8,What is the power of the theory of plate tectonics?,nics lies in its ability to combine all of these observations into a single ...,its ability to combine all of these observations into a single theory of how...
9,What is the meaning of succession?,iths are older than the rock which contains them. The principle of faunal su...,faunal succession


In [None]:
# Put the question generator's answers next to Haystack's
# (aligned on the row index shared by `preds` and `qs`).
preds = preds.assign(**{'QG Answers': qs['answer']})
preds

Unnamed: 0,Query,Context,Haystack Answer,QG Answers
0,What is the principle of inclusions and components?,"or clasts are found in a formation, then the inclusions must be older than ...",the inclusions must be older than the formation that contains them,"The principle of inclusions and components states that, with sedimentary roc..."
1,What is the relationship between the movement of the plates on the surface a...,"olid, upper mantle, which is called the asthenosphere. There is an intimate ...",intimate coupling,There is an intimate coupling between the movement of the plates on the surf...
2,What is the meaning of convergent boundaries?,"earthquakes were explained as convergent boundaries, where one plate subduct...",one plate subducts under another,"Arcs of volcanoes and earthquakes were explained as convergent boundaries, w..."
3,What are the long linear regions of geologic features?,th. Long linear regions of geologic features could be explained as plate bou...,plate boundaries,Long linear regions of geologic features could be explained as plate boundar...
4,What is the principle of faunal succession?,ins them. The principle of faunal succession is based on the appearance of f...,based on the appearance of fossils in sedimentary rocks,The principle of faunal succession is based on the appearance of fossils in ...
5,What is the difference between xenoliths and sedimentary rocks?,"later to cool in the matrix. As a result, xenoliths are older than the rock ...",older than the rock which contains them,"As a result, xenoliths are older than the rock which contains them."
6,What is the role of plate tectonics in the evolution of the Earth?,f the convecting mantle. This coupling between rigid plates moving on the su...,coupling between rigid plates moving on the surface of the Earth and the con...,Plate tectonics also provided a mechanism for Alfred Wegeners theory of cont...
7,What is the most important reason for expanding the fourth scale?,ecent era is expanded in the third scale. Since the Quaternary is a very sho...,the Quaternary is a very short period with short epochs,The Holocene the latest epoch is too small to be shown clearly on the third ...
8,What is the power of the theory of plate tectonics?,nics lies in its ability to combine all of these observations into a single ...,its ability to combine all of these observations into a single theory of how...,The power of the theory of plate tectonics lies in its ability to combine al...
9,What is the meaning of succession?,iths are older than the rock which contains them. The principle of faunal su...,faunal succession,Based on principles laid out by William Smith almost a hundred years before ...


In [None]:
# Persist the comparison table. index=False keeps the positional index out of
# the file; otherwise every write/read round trip adds another "Unnamed: 0"
# column (as already visible in QG.csv).
preds.to_csv('/content/drive/MyDrive/nlp/predictions.csv', index=False)

#### Now we have a dataframe containing the predicted (Haystack) and generated (QG) answers together

In [None]:
# Install h5py, typing-extensions and wheel (all unpinned, so the resolved
# versions depend on when the cell is run).
# NOTE(review): presumably prerequisites for the allennlp install further down - confirm.
!pip install h5py
!pip install typing-extensions
!pip install wheel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mtime: 10.9 s (started: 2022-09-22 15:35:13 +00:00)


In [None]:
# Install AllenNLP and its model package, both pinned to 2.1.0.
# NOTE(review): the same command appears again in the larger install cell below.
!pip install allennlp==2.1.0 allennlp-models==2.1.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[0mtime: 4.31 s (started: 2022-09-22 15:35:23 +00:00)


In [None]:
# Bulk install of the NLP tooling. Order matters: each command can upgrade or
# downgrade packages installed by an earlier one.
# NOTE(review): --force-reinstall re-installs the package and its dependencies
# even when already present; packages that were imported earlier in this
# session may need a runtime restart before the new versions are used - confirm.
!pip install bert-extractive-summarizer --upgrade --force-reinstall
!pip install spacy==2.1.3 --upgrade --force-reinstall
# transformers is pinned to a specific upstream commit.
!pip install --quiet git+https://github.com/huggingface/transformers.git@5c00918681d6b4027701eb46cea8f795da0d4064
# pke is installed from the default branch (unpinned).
!pip install git+https://github.com/boudinfl/pke.git
!pip install --quiet sentencepiece==0.1.95
!pip install -U wn==0.0.22
!pip install flashtext
!pip install yake 
!pip install --quiet ipython-autotime
#!pip install monkeylearn
# Download the small English spaCy model.
!python3 -m spacy download en_core_web_sm
!pip install flask-ngrok
!pip install trafilatura
!pip install pandas
# NOTE(review): duplicates the pinned allennlp install from the earlier cell.
!pip install allennlp==2.1.0 allennlp-models==2.1.0
#!pip install pymongo
!pip install -U pywsd
# Enable the ipython-autotime extension installed above; it prints the
# "time: ..." line after every cell.
%load_ext autotime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bert-extractive-summarizer
  Using cached bert_extractive_summarizer-0.10.1-py3-none-any.whl (25 kB)
Collecting spacy
  Using cached spacy-3.4.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.3 MB)
Collecting scikit-learn
  Using cached scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.8 MB)
Collecting transformers
  Using cached transformers-4.22.1-py3-none-any.whl (4.9 MB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting scipy>=1.1.0
  Using cached scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)
Collecting numpy>=1.14.6
  Using cached numpy-1.21.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
Collecting joblib>=0.11
  Using cached joblib-1.2.0-py3-none-any.whl (297 kB)
Collecting spacy-legacy<3.1.0,>=3.0.9
  Using cached spac

In [None]:
# Pin MarkupSafe to 2.0.1 (this replaces the 2.1.1 that was already installed).
# NOTE(review): presumably needed by a dependency that breaks on MarkupSafe 2.1 - confirm which one.
!pip install markupsafe==2.0.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting markupsafe==2.0.1
  Using cached MarkupSafe-2.0.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (31 kB)
Installing collected packages: markupsafe
  Attempting uninstall: markupsafe
    Found existing installation: MarkupSafe 2.1.1
    Uninstalling MarkupSafe-2.1.1:
      Successfully uninstalled MarkupSafe-2.1.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
altair 4.2.0 requires jsonschema>=3.0, but you have jsonschema 2.6.0 which is incompatible.[0m[31m
[0mSuccessfully installed markupsafe-2.0.1
[0mtime: 4.92 s (started: 2022-09-22 15:38:48 +00:00)


In [None]:
# Install awscli and six; --ignore-installed makes pip install them without
# first uninstalling any copies that are already present.
# NOTE(review): no visible cell uses awscli - confirm it is still required.
!sudo pip install awscli --ignore-installed six

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting awscli
  Using cached awscli-1.25.79-py3-none-any.whl (3.9 MB)
Collecting six
  Using cached six-1.16.0-py2.py3-none-any.whl (11 kB)
Collecting docutils<0.17,>=0.10
  Using cached docutils-0.16-py2.py3-none-any.whl (548 kB)
Collecting PyYAML<5.5,>=3.10
  Using cached PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
Collecting botocore==1.27.78
  Using cached botocore-1.27.78-py3-none-any.whl (9.1 MB)
Collecting s3transfer<0.7.0,>=0.6.0
  Using cached s3transfer-0.6.0-py3-none-any.whl (79 kB)
Collecting rsa<4.8,>=3.1.2
  Using cached rsa-4.7.2-py3-none-any.whl (34 kB)
Collecting colorama<0.4.5,>=0.2.5
  Using cached colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting python-dateutil<3.0.0,>=2.1
  Using cached python_dateutil-2.8.2-py2.py3-none-any.whl (247 kB)
Collecting jmespath<2.0.0,>=0.7.1
  Using cached jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting urllib3<1.2

In [None]:
# Load a pretrained AllenNLP predictor from the published BiDAF-ELMo archive.
# NOTE(review): this cell raised ImportError in the recorded run; the cause is
# not visible here - possibly the package changes made by the install cells
# above require a runtime restart. Confirm before relying on `predictor`.
from allennlp.predictors.predictor import Predictor
# `allennlp_models.rc` is not referenced by name below; presumably it is
# imported for its side effects (model/predictor registration) - TODO confirm.
import allennlp_models.rc

predictor = Predictor.from_path("https://storage.googleapis.com/allennlp-public-models/bidaf-elmo.2021-02-11.tar.gz")
# Example usage (kept disabled):
# predictor.predict(
#     passage="The Matrix is a 1999 science fiction action film written and directed by The Wachowskis, starring Keanu Reeves, Laurence Fishburne, Carrie-Anne Moss, Hugo Weaving, and Joe Pantoliano.",
#     question="Who stars in The Matrix?"
# )

ImportError: ignored

time: 106 ms (started: 2022-09-22 15:39:10 +00:00)


In [None]:
# Resolve PROTOCOL_TLS with a fallback chain, then alias PROTOCOL_SSLv23 to it:
#   1. ssl.PROTOCOL_TLS when the ssl module provides it,
#   2. otherwise ssl.PROTOCOL_SSLv23,
#   3. otherwise the literal value 2.
try:  # Platform-specific: Python 3.6
    from ssl import PROTOCOL_TLS
except ImportError:
    try:
        from ssl import PROTOCOL_SSLv23 as PROTOCOL_TLS
    except ImportError:
        PROTOCOL_TLS = 2

# Both names always refer to the same value, whichever branch supplied it.
PROTOCOL_SSLv23 = PROTOCOL_TLS

time: 1.19 ms (started: 2022-09-22 15:20:27 +00:00)
