<a href="https://colab.research.google.com/github/AlekhSaxena/Perceptron/blob/main/Question_answering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the companion repository; the next cells change into it and import its `install` module.
!git clone https://github.com/nlp-with-transformers/notebooks.git

In [None]:
cd notebooks

In [None]:
from install import *

In [None]:
# NOTE(review): looks like leftover debug output; no other visible cell depends on it.
print('Alu')

In [None]:
# Install the dependencies for this chapter. `install_requirements` presumably
# comes from the `install` star-import above — confirm.
install_requirements(is_chapter7_v2=True)

In [None]:
# Set TOKENIZERS_PARALLELISM=false in the kernel's environment
# (presumably to silence the tokenizers fork/parallelism warning — TODO confirm).
%env TOKENIZERS_PARALLELISM=false

In [None]:
import logging

# Loggers whose output is restricted to ERROR and above for the rest of the
# session.
# NOTE(review): several names follow a `haystack.reader` / `haystack.document_store`
# layout, while later cells import from `haystack.nodes` / `haystack.document_stores`
# — confirm these names still match the installed package.
NOISY_LOGGERS = (
    "farm.utils",
    "farm.infer",
    "haystack.reader.farm.FARMReader",
    "farm.modeling.prediction_head",
    "elasticsearch",
    "haystack.eval",
    "haystack.document_store.base",
    "haystack.retriever.base",
    "farm.data_handler.dataset",
)

for module in NOISY_LOGGERS:
    module_logger = logging.getLogger(module)
    module_logger.setLevel(logging.ERROR)

In [None]:
from datasets import get_dataset_config_names

# Names of the configurations available for the "subjqa" dataset (shown as the cell output).
domains = get_dataset_config_names("subjqa")
domains

In [None]:
from datasets import load_dataset

# Load the "electronics" configuration of the "subjqa" dataset.
subjqa = load_dataset("subjqa", name="electronics")

In [None]:
# Index the row first, then the column: this reads only example 1 instead of
# materialising the whole "answers" column just to pick one element.
print(subjqa["train"][1]["answers"])

In [None]:
import pandas as pd

# Flatten nested columns (later cells use e.g. "answers.text") and convert
# every split to a pandas DataFrame, keyed by split name.
dfs = {}
for split, dset in subjqa.flatten().items():
    dfs[split] = dset.to_pandas()

# Report the number of distinct question ids per split.
for split, df in dfs.items():
    print(f"Number of questions in {split}: {df['id'].nunique()}")

In [None]:

# Columns needed to inspect a question/answer pair.
qa_cols = [
    "title",
    "question",
    "answers.text",
    "answers.answer_start",
    "context",
]
# Fixed seed so the same two training rows are shown on every run.
sample_df = dfs["train"][qa_cols].sample(n=2, random_state=7)
sample_df

In [None]:
# Recover the first answer of the first sampled row by slicing its review text
# from the stored start offset over the length of the answer text.
first_row = sample_df.iloc[0]
start_idx = first_row["answers.answer_start"][0]
end_idx = start_idx + len(first_row["answers.text"][0])
first_row["context"][start_idx:end_idx]

In [None]:
# Imported here because this cell runs before the notebook's later
# numpy/matplotlib import cell; without it a fresh-kernel Run All raises NameError.
import matplotlib.pyplot as plt

counts = {}
question_types = ["What", "How", "Is", "Does", "Do", "Was", "Where", "Why"]

for q in question_types:
    # Summing the boolean mask gives 0 when no question matches, whereas
    # `value_counts()[True]` raises KeyError in that case.
    # Note: this is a prefix match, so "Do" also counts questions starting with "Does".
    counts[q] = dfs["train"]["question"].str.startswith(q).sum()

pd.Series(counts).sort_values().plot.barh()
plt.title("Frequency of Question Types")
plt.show()

In [None]:
# Print three sampled training questions (fixed seed) for each listed prefix.
train_df = dfs["train"]
for question_type in ["How", "What", "Is"]:
    has_prefix = train_df.question.str.startswith(question_type)
    sampled = train_df[has_prefix].sample(n=3, random_state=42)
    for question in sampled['question']:
        print(question)
     

In [None]:
from transformers import AutoTokenizer

# Checkpoint name used for the tokenizer here and for the QA model loaded further down.
model_ckpt = "deepset/minilm-uncased-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

In [None]:
# Toy question/context pair. The backslash inside the triple-quoted string joins
# the two source lines, so `context` contains no newline.
question = "How much music can this hold?"
context = """An MP3 is about 1 MB/minute, so about 6000 hours depending on \
file size."""
# Encode question and context together as one pair, returning PyTorch tensors.
inputs = tokenizer(question, context, return_tensors="pt")

In [None]:
# Tokenize again without `return_tensors` and lay the returned fields out as
# DataFrame rows (orient="index") for display.
input_df = pd.DataFrame.from_dict(tokenizer(question, context), orient="index")
input_df

In [None]:
# Decode the ids of the first sequence in the batch back to text to inspect the encoded input.
print(tokenizer.decode(inputs["input_ids"][0]))

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering

In [None]:
# Load the question-answering model from the same checkpoint as the tokenizer.
model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)

In [None]:
# Forward pass with gradient tracking disabled (inference only).
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

In [None]:
# Keep the two logit tensors from the model output for the cells below.
start_logits = outputs.start_logits
end_logits = outputs.end_logits

In [None]:
# Compare the shape of the input ids with the shapes of the two logit tensors.
print(f"Input IDs shape: {inputs.input_ids.size()}")
print(f"Start logits shape: {start_logits.size()}")
print(f"End logits shape: {end_logits.size()}")

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Flatten the logits to 1-D arrays and pair them with the input tokens.
s_scores = start_logits.detach().numpy().flatten()
e_scores = end_logits.detach().numpy().flatten()
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
token_ids = range(len(tokens))

# Compute each maximum once instead of once per token inside the comprehensions.
s_max = np.max(s_scores)
e_max = np.max(e_scores)

# Two stacked bar charts sharing the token axis; the highest-scoring bar(s)
# are drawn in "C1", all others in "C0".
fig, (ax1, ax2) = plt.subplots(nrows=2, sharex=True)
colors = ["C1" if s == s_max else "C0" for s in s_scores]
ax1.bar(x=token_ids, height=s_scores, color=colors)
ax1.set_ylabel("Start Scores")
colors = ["C1" if s == e_max else "C0" for s in e_scores]
ax2.bar(x=token_ids, height=e_scores, color=colors)
ax2.set_ylabel("End Scores")
plt.xticks(token_ids, tokens, rotation="vertical")
plt.show()

In [None]:
import torch

# Highest-scoring start position, and one past the highest-scoring end
# position so that the slice below includes the end token itself.
start_idx = start_logits.argmax()
end_idx = end_logits.argmax() + 1
answer_span = inputs["input_ids"][0][start_idx:end_idx]
# Turn the selected token ids back into text.
answer = tokenizer.decode(answer_span)
print(f"Question: {question}")
print(f"Answer: {answer}")

In [None]:
from transformers import pipeline

In [None]:
# Wrap the already-loaded model and tokenizer in a question-answering pipeline.
pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)
# NOTE(review): `topk` is presumably the older spelling of `top_k` for this
# pipeline — confirm against the installed transformers version.
pipe(question=question, context=context, topk=3)

In [None]:
# Ask something the context does not cover; `handle_impossible_answer=True`
# presumably lets the pipeline return an empty answer — TODO confirm.
pipe(question="Why is there no data?", context=context, 
     handle_impossible_answer=True)

In [None]:
question="what are the benefits of 401(k) plan?"

In [None]:
context="""
Keep your 401(k) with your former employer
Most companies—but not all—allow you to keep your retirement savings in their plans after you leave.
Some benefits:
Your money has the chance to continue to grow tax-deferred.
You can take penalty-free withdrawals if you leave your job at age 55 or older.
Many offer institutionally priced (i.e., lower-cost) or unique investment options.
Federal law provides broad protection against creditors."""

In [None]:
pipe(question=question, context=context, 
     handle_impossible_answer=True)

In [None]:
context="""Changing or leaving a job can be an emotional time. You're probably excited about a new opportunity—and nervous too. And if you're retiring, the same can be said. As you say goodbye to your workplace, don’t forget about your 401(k) or 403(b) with that employer. You have several options and it’s an important decision.

Because your 401(k) may be a big chunk of your retirement savings, it's important to weigh the pros and cons of your options and find the one that makes sense for you.

Here are 4 choices to consider.

1. Keep your 401(k) with your former employer
Most companies—but not all—allow you to keep your retirement savings in their plans after you leave.

Some benefits:
1. Your money has the chance to continue to grow tax-deferred.
2. You can take penalty-free withdrawals if you leave your job at age 55 or older.
3. Many offer institutionally priced (i.e., lower-cost) or unique investment options.
4. Federal law provides broad protection against creditors.
But:
If you have less than $5,000 in the plan, the money may be automatically sent to you (or sent to an IRA for you).
If you choose to keep the money in your former employer's plan, you won't be able to add any more money to the account, or, in most cases, take a 401(k) loan.
Withdrawal options may be limited. For instance, you may not be able to take a partial withdrawal; you may have to take the entire balance.
After you reach age 72, you'll have to take annual required minimum distributions (RMDs).
If you hold appreciated company stock in your workplace savings account, consider the potential impact of net unrealized appreciation (NUA) before choosing between staying in the plan, taking the stock in kind, or rolling over the stock to an IRA. Rolling over the stock to an IRA will eliminate any NUA.

2. Roll over the money into an IRA
A Rollover IRA is a retirement account that allows you to move money from your former employer-sponsored retirement plan into an IRA.

You can open the IRA with a financial institution. Make sure to research fees and expenses when choosing an IRA provider, though, as they can really vary.

Some benefits:
Your money has the chance to continue to grow tax-deferred.
If you're under age 59½, you can withdraw money penalty-free for a qualifying first-time home purchase or higher education expenses.1
You may be able to get a broader range of investment choices than is available in an employer's plan.
Rolling over assets can be done by source type. This means you can roll over Roth assets independently to a Roth IRA. You will not need to take RMDs from those Roth assets like you would have if they remained in plan.
But:
After you reach age 72, you’ll have to take annual required minimum distributions (RMDs) from a traditional IRA every year, even if you're still working.
Federal law offers more protection for money in 401(k) plans than in IRAs. However, some states offer certain creditor protection for IRAs too.
3. Roll over your 401(k) into a new employer's plan
Not all employers will accept a rollover from a previous employer’s plan, so check with your new employer before making any decisions.

Some benefits:
Your money has the chance to continue to grow tax-deferred.
Having only one 401(k) can make it easier to manage your retirement savings.
Many plans offer lower-cost or plan-specific investment options.
Federal law provides broad protection against creditors. You can defer RMDs even if you're still working after age 72.2
But:
Make sure to understand your new plan rules. 
Consider the range of investment options available in the new plan.
4. Cash out
Taking the money out of retirement accounts altogether should be avoided unless the immediate need for cash is critical and you have no other options. The consequences vary depending on your age and tax situation. If you withdraw from your 401(k) before age 59½, the money will generally be subject to both ordinary income taxes and a potential 10% early withdrawal penalty. (An early withdrawal penalty doesn't apply if you stopped working for your former employer in or after the year you reached age 55, but are not yet age 59½. This exception doesn’t apply to assets rolled over to an IRA.)"""


In [None]:
question="what are the benefits of 401(k) plan?"

In [None]:
pipe(question=question, context=context, 
     handle_impossible_answer=True)

In [None]:
### We need to fine-tune the model

In [None]:
context="""The traditional IRA and the Roth IRA offer ways to save for retirement, although each offers different benefits and advantages. This article explores the important decision variables when choosing between the 2, as well as the impact of each on your current vs. future tax liabilities.

The traditional IRA allows an individual with earned income to take a tax deduction for dollars contributed (if income falls below a certain threshold), and the growth in the account is tax deferred. When distributions are taken from a traditional IRA, they are taxed as ordinary income. If one chooses not to take distributions from an IRA after reaching 59½, the IRS will force distributions to be taken at age 72. These are known as required minimum distributions (RMDs) and are based on the presumable retiree's life expectancy.

In order to take the deduction in 2022, an employee who is covered by a workplace retirement plan (such as a 401(k) or a similar plan) must make less than $68,000 to $78,000 as an individual or $109,000 to $129,000 as a married couple. If one of two spouses is covered by an employer-sponsored plan, the income limits for the household are increased to $204,000 to $214,000, and if no one in a household is covered by a plan, there is no income limitation in order to deduct contributions to a Traditional IRA.

For the ranges specified above, traditional IRA contributions are subject to an income phase-out rule, meaning that if your income falls within these ranges, your ability to take a tax deduction is phased out. So, for instance, as an individual in 2022, if you make less than $68,000, you will receive a full deduction; if you make between $68,000 and $78,000, you will receive a phased out deduction; and if you make more than $78,000, you will receive no deduction.

The 2022 contribution limit for a traditional IRA is $6,000 with an extra $1,000 catch-up contribution for those 50 and over.

The other option is a Roth IRA. The Roth IRA was established as an account into which after-tax dollars are invested. While the Roth gives no tax deduction on the front end, the growth—and eventual distribution—is federal tax-free. The Roth IRA allows one to take out 100% of contributions at any time for any reason with no taxes or penalties. It is only the growth on which one must wait until the age of 59½ to draw penalty-free. There is also a 5-year aging period, which means that a payment made from a Roth IRA account is considered a qualified distribution if it is made after a 5-year period, beginning with the first taxable year after which a contribution to the Roth IRA occurs. There are exceptions for death or disability, and there is a one-time $10,000 qualified distribution for first-time home buyers.

As of 2022, if you make less than $129,000 for a single individual or $204,000 for a married couple, you can contribute $6,000 per person ($7,000 for individuals age 50 or older).

For 2022, between $129,000 and $144,000 for an individual or $204,000 and $214,000 for a married couple, your allowable Roth contribution is phased out, and if you make over those top thresholds, you're not able to contribute to a Roth IRA.

So now the stage is set for the epic battle: traditional IRA versus Roth IRA. The contrast is argued by many sides for various reasons. Most have focused on the difference between the 2 regarding taxation. It is commonly suggested for folks who would anticipate a higher rate of tax in the future, the Roth is the best option. For those, however, currently in their peak income earning years and expecting a lower tax rate in retirement, the traditional IRA has always been considered best.

One decision variable to keep in mind too, however, is income tax levels in the future. Most of the income generated by those in retirement is taxable. Even though at its inception Social Security income was promised not to be taxed, now up to 85% of one's Social Security retirement benefit could be taxed, depending on income. Pension income is taxed, although some states do not tax recipients of some pension income to draw retirees to their state. And, of course, tax-deductible contributions to 401(k)s and IRAs are going to be taxed in the year in which you take a distribution.

There are pros and cons to each retirement account, but ultimately the decision should be based on your own situation with special attention paid to your age and where you are in your career (peak income years versus retirement years)."""


# New Section

In [None]:
question="what is the age for Required minimum distribution?"

In [None]:
pipe(question=question, context=context, 
     handle_impossible_answer=True)

In [None]:
question="what is the limitation on deduction?"

In [None]:
pipe(question=question, context=context, 
     handle_impossible_answer=True)

In [None]:
question="whether Roth IRA is better than traditional IRA?"

In [None]:
pipe(question=question, context=context, 
     handle_impossible_answer=True)

In [None]:
question="whether traditional IRA is better than roth IRA?"

In [None]:
pipe(question=question, context=context, 
     handle_impossible_answer=True)

In [None]:

#hide_input
#id subjqa-dist
#caption Distribution of tokens for each question-context pair in the SubjQA training set
def compute_input_length(row):
    """Return the number of input ids produced for one question/context pair.

    `row` must support lookup by the keys "question" and "context"; both values
    are tokenized together with the notebook-level `tokenizer`.
    """
    return len(tokenizer(row["question"], row["context"])["input_ids"])

# Token count of every question/context pair, stored as a new "n_tokens"
# column on the training frame (this mutates dfs["train"] in place).
n_tokens = dfs["train"].apply(compute_input_length, axis=1)
dfs["train"]["n_tokens"] = n_tokens

fig, ax = plt.subplots()
dfs["train"]["n_tokens"].hist(bins=100, grid=False, ec="C0", ax=ax)
# Dashed vertical marker at 512 tokens.
ax.axvline(x=512, ymin=0, ymax=1, linestyle="--", color="C1",
           label="Maximum sequence length")
ax.set_xlabel("Number of tokens in question-context pair")
ax.set_ylabel("Count")
ax.legend()
plt.show()

In [None]:

# Tokenize the first training row with max_length=100 and
# return_overflowing_tokens=True; the following cells iterate over
# tokenized_example["input_ids"] as a list of windows.
# stride=25 presumably sets the overlap between consecutive windows — TODO confirm.
example = dfs["train"].iloc[0][["question", "context"]]
tokenized_example = tokenizer(example["question"], example["context"], 
                              return_overflowing_tokens=True, max_length=100, 
                              stride=25)

In [None]:

# Report how many token ids each window contains.
for idx, window in enumerate(tokenized_example["input_ids"]):
    print(f"Window #{idx} has {len(window)} tokens")

In [None]:

# Decode each window back to text to see where the windows start and end.
for window in tokenized_example["input_ids"]:
    print(f"{tokenizer.decode(window)} \n")
     

In [None]:
# Elasticsearch 7.9.2 Linux x86_64 tarball (the backslash joins the two source
# lines into a single URL string).
url = """https://artifacts.elastic.co/downloads/elasticsearch/\
elasticsearch-7.9.2-linux-x86_64.tar.gz"""
# -nc: skip the download if the file already exists; -q: quiet output.
!wget -nc -q {url}
# Unpack the archive; later cells refer to the ./elasticsearch-7.9.2 directory.
!tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz

In [None]:
import os
from subprocess import Popen, PIPE, STDOUT

# Run Elasticsearch as a background process
# Hand the unpacked tree to the `daemon` user/group before launching.
!chown -R daemon:daemon elasticsearch-7.9.2
# preexec_fn switches the child process to uid 1 before it executes the server
# (presumably the `daemon` account, and presumably because Elasticsearch
# should not run as root — confirm both).
# NOTE(review): stdout/stderr are sent to a pipe that no visible cell reads —
# confirm this cannot fill up and stall the server on long sessions.
es_server = Popen(args=['elasticsearch-7.9.2/bin/elasticsearch'],
                  stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1))
# Wait until Elasticsearch has started
# (fixed 30-second pause; readiness itself is not checked here).
!sleep 30

In [None]:

# Alternative if Docker is installed
# NOTE(review): on a full Run All this executes in addition to the Popen launch
# in the previous cell — confirm that starting both is intended.
from haystack.utils import launch_es

launch_es()

In [None]:

# Query the server root on localhost:9200 (pretty-printed) to check that it responds.
!curl -X GET "localhost:9200/?pretty"

In [None]:
from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore

# Return the document embedding for later use with dense retriever.
# No connection arguments are passed, so the store's own defaults apply.
document_store = ElasticsearchDocumentStore(return_embedding=True)

In [None]:
# It's a good idea to flush Elasticsearch with each notebook restart
store_has_documents = len(document_store.get_all_documents()) > 0
# The label lookup only happens when no documents were found (short-circuit `or`).
if store_has_documents or len(document_store.get_all_labels()) > 0:
    document_store.delete_documents(index="document")
    document_store.delete_documents(index="label")

In [None]:

def _review_to_doc(row, split):
    """Map one review row to the dict layout written to the document store."""
    return {
        "content": row["context"],
        "id": row["review_id"],
        "meta": {
            "item_id": row["title"],
            "question_id": row["id"],
            "split": split,
        },
    }


for split, df in dfs.items():
    # Exclude duplicate reviews (only the first row per distinct context is kept)
    unique_reviews = df.drop_duplicates(subset="context")
    docs = [_review_to_doc(row, split) for _, row in unique_reviews.iterrows()]
    document_store.write_documents(documents=docs, index="document")

print(f"Loaded {document_store.get_document_count()} documents")

In [None]:
from haystack.nodes.retriever import BM25Retriever

# BM25 retriever that searches the document store created above.
bm25_retriever = BM25Retriever(document_store=document_store)

In [None]:
item_id = "B0074BW614"
query = "Is it good for reading?"
# Restrict the search through the `item_id` and `split` metadata written with
# each document, and keep the top 3 hits.
retrieved_docs = bm25_retriever.retrieve(
    query=query, top_k=3, filters={"item_id":[item_id], "split":["train"]})

In [None]:

# Show the first retrieved document.
print(retrieved_docs[0])

In [None]:

from haystack.nodes import FARMReader

# Same checkpoint name as the one assigned to `model_ckpt` earlier in the notebook.
model_ckpt = "deepset/minilm-uncased-squad2" #alternative larger models: deepset/roberta-base-squad2-distilled or deepset/xlm-roberta-large-squad2 or the tiny distilled model: deepset/tinyroberta-squad2
# Passed to FARMReader as max_seq_len / doc_stride; presumably the window size
# and sliding-window stride in tokens — TODO confirm.
max_seq_length, doc_stride = 384, 128
# return_no_answer=True presumably allows "no answer" predictions — TODO confirm.
reader = FARMReader(model_name_or_path=model_ckpt, progress_bar=False,
                    max_seq_len=max_seq_length, doc_stride=doc_stride, 
                    return_no_answer=True)

In [None]:

# NOTE: `question` and `context` hold whatever the earlier cells last assigned
# to them; this cell does not set them itself.
print(reader.predict_on_texts(question=question, texts=[context], top_k=1))

In [None]:
from haystack.pipelines import ExtractiveQAPipeline

# NOTE: rebinds `pipe`, which earlier cells bound to the transformers
# question-answering pipeline; those earlier cells cannot be re-run after this one.
pipe = ExtractiveQAPipeline(reader=reader, retriever=bm25_retriever)

In [None]:

n_answers = 3
# Retrieve the top 3 training-split reviews for this item, then ask the reader
# for up to `n_answers` answers.
preds = pipe.run(
    query=query,
    params={
        "Retriever": {"top_k": 3,
                      "filters": {"item_id": [item_id], "split": ["train"]}},
        "Reader": {"top_k": n_answers},
    },
)

print(f"Question: {preds['query']} \n")

# Iterate over the answers actually returned (capped at n_answers) instead of
# indexing with range(n_answers), which raises IndexError when fewer come back.
for idx, pred_answer in enumerate(preds["answers"][:n_answers], start=1):
    print(f"Answer {idx}: {pred_answer.answer}")
    print(f"Review snippet: ...{pred_answer.context}...")
    print("\n\n")
     

In [None]:
from haystack.pipelines import DocumentSearchPipeline

# NOTE: rebinds `pipe` once more; from here on it is a retriever-only pipeline.
pipe = DocumentSearchPipeline(retriever=bm25_retriever)

In [None]:
from haystack import Label, Answer, Document


def _gold_label(row, meta, answer_text, no_answer):
    """Build one gold Label for a test row with the given answer text."""
    return Label(
        query=row["question"],
        answer=Answer(answer=answer_text),
        origin="gold-label",
        document=Document(content=row["context"], id=row["review_id"]),
        meta=meta,
        is_correct_answer=True,
        is_correct_document=True,
        no_answer=no_answer,
        filters={"item_id": [row["title"]], "split": ["test"]},
    )


labels = []
for _, row in dfs["test"].iterrows():
    # Metadata used for filtering in the Retriever
    meta = {"item_id": row["title"], "question_id": row["id"]}
    answer_texts = row["answers.text"]
    if len(answer_texts):
        # Populate labels for questions with answers: one label per answer text.
        labels.extend(
            _gold_label(row, meta, text, no_answer=False) for text in answer_texts
        )
    else:
        # Populate labels for questions without answers: a single empty-answer label.
        labels.append(_gold_label(row, meta, "", no_answer=True))

In [None]:
# Store the gold labels in the "label" index, then report that index's label count.
document_store.write_labels(labels, index="label")

print(f"""Loaded {document_store.get_label_count(index="label")} \
question-answer pairs""")

In [None]:
# Fetch the stored labels in aggregated form, additionally grouped by the
# `item_id` metadata field (presumably one aggregate per distinct question
# and item — TODO confirm), and print how many aggregates there are.
labels_agg = document_store.get_all_labels_aggregated(
    index="label",
    open_domain=True,
    aggregate_by_meta=["item_id"]
)
print(len(labels_agg))

In [None]:
# Evaluate `pipe` on the aggregated labels with the retriever returning 3
# documents per query. When cells run in order, `pipe` is the
# DocumentSearchPipeline at this point.
eval_result = pipe.eval(
    labels=labels_agg,
    params={"Retriever": {"top_k": 3}},
)
# NOTE(review): `metrics` is assigned but not displayed by this cell.
metrics = eval_result.calculate_metrics()