In [1]:
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U



In [2]:
from datasets import get_dataset_config_names
domains = get_dataset_config_names("subjqa")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
from datasets import load_dataset
subjqa = load_dataset("subjqa", name="electronics")


In [4]:
print(subjqa["train"]["answers"][1])

{'text': ['Bass is weak as expected', 'Bass is weak as expected, even with EQ adjusted up'], 'answer_start': [1302, 1302], 'answer_subj_level': [1, 1], 'ans_subj_score': [0.5083333253860474, 0.5083333253860474], 'is_ans_subjective': [True, True]}


In [5]:
import pandas as pd
dfs = {split: dset.to_pandas() for split, dset in subjqa.flatten().items()}
for split, df in dfs.items():
	print(f"Number of questions in {split}: {df['id'].nunique()}")

Number of questions in train: 1295
Number of questions in test: 358
Number of questions in validation: 255


In [6]:
qa_cols = ["title", "question", "answers.text", "answers.answer_start", "context"]
sample_df = dfs["train"][qa_cols].sample(2, random_state=7)
sample_df


Unnamed: 0,title,question,answers.text,answers.answer_start,context
791,B005DKZTMG,Does the keyboard lightweight?,[this keyboard is compact],[215],I really like this keyboard. I give it 4 star...
1159,B00AAIPT76,How is the battery?,[],[],I bought this after the first spare gopro batt...


In [7]:
start_idx = sample_df["answers.answer_start"].iloc[0][0]
end_idx = start_idx + len(sample_df["answers.text"].iloc[0][0])
sample_df["context"].iloc[0][start_idx:end_idx]

'this keyboard is compact'

In [8]:
import matplotlib.pyplot as plt
counts = {}
question_types = ["What", "How", "Is", "Does", "Do", "Was", "Where", "Why"]
for q in question_types:
	counts[q] = dfs["train"]["question"].str.startswith(q).value_counts()

pd.Series(counts)
# .sort_values().plot.barh()
# plt.title("Frequency of Question Types")
# plt.show()


What     False    1059
True      236
Name: question, dt...
How      True     780
False    515
Name: question, dtyp...
Is       False    1195
True      100
Name: question, dt...
Does     False    1250
True       45
Name: question, dt...
Do       False    1212
True       83
Name: question, dt...
Was      False    1283
True       12
Name: question, dt...
Where    False    1267
True       28
Name: question, dt...
Why      False    1274
True       21
Name: question, dt...
dtype: object

In [9]:
for question_type in ["How", "What", "Is"]:
	for question in ( dfs["train"][dfs["train"].question.str.startswith(question_type)] .sample(n=3, random_state=42)['question']):
		print(question)

How is the camera?
How do you like the control?
How fast is the charger?
What is direction?
What is the quality of the construction of the bag?
What is your impression of the product?
Is this how zoom works?
Is sound clear?
Is it a wireless keyboard?


In [10]:
from transformers import AutoTokenizer
model_ckpt = "deepset/minilm-uncased-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [11]:
uestion = "How much music can this hold?"
context = """An MP3 is about 1 MB/minute, so about 6000 hours depending on \ file size."""
inputs = tokenizer(question, context, return_tensors="pt")


In [12]:
print(tokenizer.decode(inputs["input_ids"][0]))

[CLS] is it a wireless keyboard? [SEP] an mp3 is about 1 mb / minute, so about 6000 hours depending on \ file size. [SEP]


In [13]:
import torch
from transformers import AutoModelForQuestionAnswering
model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)
with torch.no_grad():
	outputs = model(**inputs)
	print(outputs)

Some weights of the model checkpoint at deepset/minilm-uncased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[ 2.8510, -5.0135, -5.2084, -4.0294, -2.8482, -3.7303, -6.0579,  2.8510,
         -2.4657, -0.8893, -4.6217, -2.2798, -1.9392, -4.8937, -5.5844, -4.9004,
         -5.9556, -3.8715, -2.7315, -2.5275, -4.7680, -4.3791, -5.6296, -3.9556,
         -4.7228, -5.0145, -5.3759,  2.8510]]), end_logits=tensor([[ 3.1789, -6.0457, -5.2371, -5.2178, -4.9497, -0.8735, -3.3717,  3.1788,
         -5.2578, -0.6590, -5.3717, -5.7263, -4.6543, -4.8667, -5.8881, -2.6058,
         -2.7213, -4.7760, -6.1632, -4.1363, -2.3603, -4.7872, -5.8269, -5.1322,
         -5.5225, -1.9867, -2.1770,  3.1788]]), hidden_states=None, attentions=None)


In [14]:
start_logits = outputs.start_logits
end_logits = outputs.end_logits

In [15]:
print(f"Input IDs shape: {inputs.input_ids.size()}")
print(f"Start logits shape: {start_logits.size()}")
print(f"End logits shape: {end_logits.size()}")

Input IDs shape: torch.Size([1, 28])
Start logits shape: torch.Size([1, 28])
End logits shape: torch.Size([1, 28])


In [16]:
import torch
start_idx = torch.argmax(start_logits)
end_idx = torch.argmax(end_logits) + 1
answer_span = inputs["input_ids"][0][start_idx:end_idx]
answer = tokenizer.decode(answer_span)
print(f"Question: {question}")
print(f"Answer: {answer}")

Question: Is it a wireless keyboard?
Answer: [CLS]


In [17]:
from transformers import pipeline
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)
pipe(question=question, context=context, topk=3)



[{'score': 0.0004626244481187314, 'start': 3, 'end': 6, 'answer': 'MP3'},
 {'score': 9.562810737406835e-05, 'start': 0, 'end': 6, 'answer': 'An MP3'},
 {'score': 8.440070814685896e-05,
  'start': 3,
  'end': 48,
  'answer': 'MP3 is about 1 MB/minute, so about 6000 hours'}]

In [18]:
pipe(question="Why is there no data?", context=context, handle_impossible_answer=True)

{'score': 0.9193894863128662, 'start': 0, 'end': 0, 'answer': ''}

In [19]:
example = dfs["train"].iloc[0][["question", "context"]]
tokenized_example = tokenizer(example["question"], example["context"], return_overflowing_tokens=True, max_length=100, stride=25)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [20]:
for idx, window in enumerate(tokenized_example["input_ids"]):
  print(f"Window #{idx} has {len(window)} tokens")

Window #0 has 100 tokens
Window #1 has 88 tokens


In [21]:
for window in tokenized_example["input_ids"]:
	print(f"{tokenizer.decode(window)} \n")

[CLS] how is the bass? [SEP] i have had koss headphones in the past, pro 4aa and qz - 99. the koss portapro is portable and has great bass response. the work great with my android phone and can be " rolled up " to be carried in my motorcycle jacket or computer bag without getting crunched. they are very light and do not feel heavy or bear down on your ears even after listening to music with them on all day. the sound is [SEP] 

[CLS] how is the bass? [SEP] and do not feel heavy or bear down on your ears even after listening to music with them on all day. the sound is night and day better than any ear - bud could be and are almost as good as the pro 4aa. they are " open air " headphones so you cannot match the bass to the sealed types, but it comes close. for $ 32, you cannot go wrong. [SEP] 



In [22]:
!pip install haystack



In [23]:
url = """https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz"""
!wget -nc -q {url}
!tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz

In [24]:
import os
from subprocess import Popen, PIPE, STDOUT
# Run Elasticsearch as a background process
!chown -R daemon:daemon elasticsearch-7.9.2
es_server = Popen(args=['elasticsearch-7.9.2/bin/elasticsearch'], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1))
# Wait until Elasticsearch has started
!sleep 30

In [25]:
!curl -X GET "localhost:9200/?pretty"

curl: (7) Failed to connect to localhost port 9200 after 0 ms: Connection refused


In [26]:
!pip install farm-haystack[inference]



In [27]:
!pip install pydantic
from pydantic import Field



In [28]:
!pip install farm-haystack[elasticsearch]



In [29]:
from haystack.document_stores import ElasticsearchDocumentStore
# Return the document embedding for later use with dense retriever
document_store = ElasticsearchDocumentStore(return_embedding=True)

In [35]:
for split, df in dfs.items():
	# Exclude duplicate reviews
	docs = [{"text": row["context"], "meta":{"item_id": row["title"], "question_id": row["id"], "split": split}}
	for _,row in df.drop_duplicates(subset="context").iterrows()]

	# Extract the text content from each row
	for doc in docs:
		doc["content"] = doc["text"]

	document_store.write_documents(docs, index="document")
print(f"Loaded {document_store.get_document_count()} documents")

Loaded 1615 documents


In [40]:
from haystack import ElasticsearchRetriever
es_retriever = ElasticsearchRetriever(document_store=document_store)
item_id = "B0074BW614"
query = "Is it good for reading?"
retrieved_docs = es_retriever.retrieve( query=query, top_k=3, filters={"item_id":[item_id], "split":["train"]})
print(retrieved_docs[0])

ImportError: cannot import name 'ElasticsearchRetriever' from 'haystack' (/usr/local/lib/python3.10/dist-packages/haystack/__init__.py)

In [41]:
from haystack.reader.farm import FARMReader
model_ckpt = "deepset/minilm-uncased-squad2"
max_seq_length, doc_stride = 384, 128
reader = FARMReader(model_name_or_path=model_ckpt, progress_bar=False, max_seq_len=max_seq_length, doc_stride=doc_stride, return_no_answer=True)
print(reader.predict_on_texts(question=question, texts=[context], top_k=1))


ModuleNotFoundError: No module named 'haystack.reader'

In [None]:
from haystack.pipeline import ExtractiveQAPipeline
pipe = ExtractiveQAPipeline(reader, es_retriever)
n_answers = 3
preds = pipe.run(query=query, top_k_retriever=3, top_k_reader=n_answers, filters={"item_id": [item_id], "split":["train"]})
print(f"Question: {preds['query']} \n")
for idx in range(n_answers):
	print(f"Answer {idx+1}: {preds['answers'][idx]['answer']}")
	print(f"Review snippet: ...{preds['answers'][idx]['context']}...")
	print("\n\n")

In [None]:
from haystack.pipeline import Pipeline
from haystack.eval import EvalDocuments
class EvalRetrieverPipeline:
	def __init__(self, retriever):
		self.retriever = retriever
		self.eval_retriever = EvalDocuments()
		pipe = Pipeline() pipe.add_node(component=self.retriever, name="ESRetriever", inputs=["Query"])
		pipe.add_node(component=self.eval_retriever, name="EvalRetriever", inputs=["ESRetriever"])
		self.pipeline = pipe

pipe = EvalRetrieverPipeline(es_retriever)


In [None]:
from haystack import Label
labels = []
for i, row in dfs["test"].iterrows():

  meta = {"item_id": row["title"], "question_id": row["id"]}
  if len(row["answers.text"]):
    for answer in row["answers.text"]:
      label = Label( question=row["question"], answer=answer, id=i, origin=row["id"], meta=meta, is_correct_answer=True, is_correct_document=True, no_answer=False)
      labels.append(label)

  else:
    label = Label( question=row["question"], answer="", id=i, origin=row["id"], meta=meta, is_correct_answer=True, is_correct_document=True, no_answer=True)
    labels.append(label)
print(labels[0])


In [None]:
document_store.write_labels(labels, index="label")
print(f"""Loaded {document_store.get_label_count(index="label")} \ question-answer pairs""")

In [None]:
labels_agg = document_store.get_all_labels_aggregated( index="label", open_domain=True, aggregate_by_meta=["item_id"] )

print(len(labels_agg))
print(labels_agg[109])


In [None]:
document_store.write_labels(labels, index="label")
print(f"""Loaded {document_store.get_label_count(index="label")} \ question-answer pairs""")

In [None]:
labels_agg = document_store.get_all_labels_aggregated( index="label", open_domain=True, aggregate_by_meta=["item_id"] )

print(len(labels_agg))

In [None]:
def run_pipeline(pipeline, top_k_retriever=10, top_k_reader=4):
	for l in labels_agg:
    	_ = pipeline.pipeline.run( query=l.question, top_k_retriever=top_k_retriever, top_k_reader=top_k_reader, top_k_eval_documents=top_k_retriever, labels=l, filters={"item_id": [l.meta["item_id"]], "split": ["test"]})

run_pipeline(pipe, top_k_retriever=3)
print(f"Recall@3: {pipe.eval_retriever.recall:.2f}")


In [None]:
def evaluate_retriever(retriever, topk_values = [1,3,5,10,20]):
	topk_results = {}
	for topk in topk_values:
		# Create Pipeline
		p = EvalRetrieverPipeline(retriever)
		# Loop over each question-answers pair in test set
		run_pipeline(p, top_k_retriever=topk)
		# Get metrics
		topk_results[topk] = {"recall": p.eval_retriever.recall}
	return pd.DataFrame.from_dict(topk_results, orient="index")
es_topk_df = evaluate_retriever(es_retriever)


In [None]:
def plot_retriever_eval(dfs, retriever_names):
	fig, ax = plt.subplots()
	for df, retriever_name in zip(dfs, retriever_names):
		df.plot(y="recall", ax=ax, label=retriever_name)
	plt.xticks(df.index)
	plt.ylabel("Top-k Recall")
	plt.xlabel("k")
    plt.show()
plot_retriever_eval([es_topk_df], ["BM25"])


In [None]:
from haystack.retriever.dense import DensePassageRetriever
dpr_retriever = DensePassageRetriever(document_store=document_store, query_embedding_model="facebook/dpr-question_encoder-single-nq-base", passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base", embed_title=False)


In [None]:
document_store.update_embeddings(retriever=dpr_retriever)


In [None]:
dpr_topk_df = evaluate_retriever(dpr_retriever)
plot_retriever_eval([es_topk_df, dpr_topk_df], ["BM25", "DPR"])


In [None]:
from farm.evaluation.squad_evaluation import compute_f1, compute_exact
pred = "about 6000 hours"
label = "6000 hours"
print(f"EM: {compute_exact(label, pred)}")
print(f"F1: {compute_f1(label, pred)}")

In [None]:
pred = "about 6000 dollars"
print(f"EM: {compute_exact(label, pred)}")
print(f"F1: {compute_f1(label, pred)}")

In [None]:
from haystack.eval import EvalAnswers
def evaluate_reader(reader):
	score_keys = ['top_1_em', 'top_1_f1']
	eval_reader = EvalAnswers(skip_incorrect_retrieval=False)
	pipe = Pipeline()
	pipe.add_node(component=reader, name="QAReader", inputs=["Query"])
	pipe.add_node(component=eval_reader, name="EvalReader", inputs= ["QAReader"])
	for l in labels_agg:
		doc = document_store.query(l.question, filters={"question_id":[l.origin]}) _ = pipe.run(query=l.question, documents=doc, labels=l)
	return {k:v for k,v in eval_reader.__dict__.items() if k in score_keys}
reader_eval = {}
reader_eval["Fine-tune on SQuAD"] = evaluate_reader(reader)


In [None]:
def plot_reader_eval(reader_eval):
	fig, ax = plt.subplots()
	df = pd.DataFrame.from_dict(reader_eval)
	df.plot(kind="bar", ylabel="Score", rot=0, ax=ax) ax.set_xticklabels(["EM", "F1"])
	plt.legend(loc='upper left')
	plt.show()

plot_reader_eval(reader_eval)


In [None]:
def create_paragraphs(df):
	paragraphs = []
	id2context = dict(zip(df["review_id"], df["context"]))
	for review_id, review in id2context.items():
		qas = []
		# Filter for all question-answer pairs about a specific context
		review_df = df.query(f"review_id == '{review_id}'")
		id2question = dict(zip(review_df["id"], review_df["question"]))
		# Build up the qas array
		for qid, question in id2question.items():
        	# Filter for a single question ID
        	question_df = df.query(f"id == '{qid}'").to_dict(orient="list")
        	ans_start_idxs = question_df["answers.answer_start"][0].tolist()
            ans_text = question_df["answers.text"][0].tolist()
            # Fill answerable questions
            if len(ans_start_idxs):
            	answers = [ {"text": text, "answer_start": answer_start} for text, answer_start in zip(ans_text, ans_start_idxs)]
            	is_impossible = False
            else:
            	answers = []
            	is_impossible = True
            	# Add question-answer pairs to
            qas qas.append({"question": question, "id": qid, "is_impossible": is_impossible, "answers": answers})
         # Add context and question-answer pairs to paragraphs
       paragraphs.append({"qas": qas, "context": review})
   return paragraphs


In [None]:
product = dfs["train"].query("title == 'B00001P4ZH'")
create_paragraphs(product)

In [None]:
import json
def convert_to_squad(dfs):
	for split, df in dfs.items():
		subjqa_data = {}
		# Create `paragraphs` for each product ID
		groups = (df.groupby("title").apply(create_paragraphs) .to_frame(name="paragraphs").reset_index())
		subjqa_data["data"] = groups.to_dict(orient="records")
		# Save the result to disk
		with open(f"electronics-{split}.json", "w+", encoding="utf-8") as f:
			json.dump(subjqa_data, f)

convert_to_squad(dfs)


In [None]:
train_filename = "electronics-train.json"
dev_filename = "electronics-validation.json"
reader.train(data_dir=".", use_gpu=True, n_epochs=1, batch_size=16, train_filename=train_filename, dev_filename=dev_filename)


In [None]:
minilm_ckpt = "microsoft/MiniLM-L12-H384-uncased"
minilm_reader = FARMReader(model_name_or_path=minilm_ckpt, progress_bar=False, max_seq_len=max_seq_length, doc_stride=doc_stride, return_no_answer=True)
minilm_reader.train(data_dir=".", use_gpu=True, n_epochs=1, batch_size=16, train_filename=train_filename, dev_filename=dev_filename)
reader_eval["Fine-tune on SubjQA"] = evaluate_reader(minilm_reader)

plot_reader_eval(reader_eval)


In [None]:
# Initialize retriever pipeline
pipe = EvalRetrieverPipeline(es_retriever)
# Add nodes for reader
eval_reader = EvalAnswers()
pipe.pipeline.add_node(component=reader, name="QAReader", inputs=["EvalRetriever"])
pipe.pipeline.add_node(component=eval_reader, name="EvalReader", inputs=["QAReader"])
# Evaluate!
run_pipeline(pipe)
# Extract metrics from reader
reader_eval["QA Pipeline (top-1)"] = { k:v for k,v in eval_reader.__dict__.items() if k in ["top_1_em", "top_1_f1"]}


In [None]:
from haystack.generator.transformers import RAGenerator
generator = RAGenerator(model_name_or_path="facebook/rag-token-nq", embed_title=False, num_beams=5)
from haystack.pipeline import GenerativeQAPipeline
pipe = GenerativeQAPipeline(generator=generator, retriever=dpr_retriever)


In [None]:
def generate_answers(query, top_k_generator=3):
	preds = pipe.run(query=query, top_k_generator=top_k_generator, top_k_retriever=5, filters={"item_id":["B0074BW614"]})
	print(f"Question: {preds['query']} \n")
	for idx in range(top_k_generator):
		print(f"Answer {idx+1}: {preds['answers'][idx]['answer']}")
generate_answers(query)
generate_answers("What is the main drawback?")
