<a href="https://colab.research.google.com/github/AmitNikhade/AmitNikhade/blob/main/jar_beta_7.6.22.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip uninstall pipelines -y

Found existing installation: pipelines 0.0.1
Uninstalling pipelines-0.0.1:
  Successfully uninstalled pipelines-0.0.1


In [None]:
import zipfile

with zipfile.ZipFile("/content/a.zip") as zf:
    zf.extractall()

In [8]:
!pip uninstall transformers -y
!pip uninstall sentencepiece
!pip install -U transformers==3.0.0
!pip install -U sentencepiece==0.1.91
!python -m nltk.downloader punkt



Found existing installation: transformers 4.19.2
Uninstalling transformers-4.19.2:
  Successfully uninstalled transformers-4.19.2
Found existing installation: sentencepiece 0.1.91
Uninstalling sentencepiece-0.1.91:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/_sentencepiece.cpython-37m-x86_64-linux-gnu.so
    /usr/local/lib/python3.7/dist-packages/sentencepiece-0.1.91.dist-info/*
    /usr/local/lib/python3.7/dist-packages/sentencepiece.py
Proceed (y/n)? y
  Successfully uninstalled sentencepiece-0.1.91
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
y
Collecting transformers==3.0.0
  Using cached transformers-3.0.0-py3-none-any.whl (754 kB)
Collecting sentencepiece
  Using cached sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
Collecting tokenizers==0.8.0-rc4
  Using cached tokenizers-0.8.0rc4-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
Installing collected packages: tokenizers, sente

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentencepiece==0.1.91
  Using cached sentencepiece-0.1.91-cp37-cp37m-manylinux1_x86_64.whl (1.1 MB)
Installing collected packages: sentencepiece
  Attempting uninstall: sentencepiece
    Found existing installation: sentencepiece 0.1.96
    Uninstalling sentencepiece-0.1.96:
      Successfully uninstalled sentencepiece-0.1.96
Successfully installed sentencepiece-0.1.91


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
import itertools
import logging
from typing import Optional, Dict, Union

from nltk import sent_tokenize

import torch
from transformers import(
    AutoModelForSeq2SeqLM, 
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
)

logger = logging.getLogger(__name__)

class QGPipeline:
    """Poor man's QG pipeline"""
    def __init__(
        self,
        model: PreTrainedModel,
        tokenizer: PreTrainedTokenizer,
        ans_model: PreTrainedModel,
        ans_tokenizer: PreTrainedTokenizer,
        qg_format: str,
        use_cuda: bool
    ):
        self.model = model
        self.tokenizer = tokenizer

        self.ans_model = ans_model
        self.ans_tokenizer = ans_tokenizer

        self.qg_format = qg_format

        self.device = "cuda" if torch.cuda.is_available() and use_cuda else "cpu"
        self.model.to(self.device)

        if self.ans_model is not self.model:
            self.ans_model.to(self.device)

        assert self.model.__class__.__name__ in ["T5ForConditionalGeneration", "BartForConditionalGeneration"]
        
        if "T5ForConditionalGeneration" in self.model.__class__.__name__:
            self.model_type = "t5"
        else:
            self.model_type = "bart"

    def __call__(self, inputs: str):
        inputs = " ".join(inputs.split())
        sents, answers = self._extract_answers(inputs)
        flat_answers = list(itertools.chain(*answers))
        
        if len(flat_answers) == 0:
          return []

        if self.qg_format == "prepend":
            qg_examples = self._prepare_inputs_for_qg_from_answers_prepend(inputs, answers)
        else:
            qg_examples = self._prepare_inputs_for_qg_from_answers_hl(sents, answers)
        
        qg_inputs = [example['source_text'] for example in qg_examples]
        questions = self._generate_questions(qg_inputs)
        output = [{'answer': example['answer'], 'question': que} for example, que in zip(qg_examples, questions)]
        return output
    
    def _generate_questions(self, inputs):
        inputs = self._tokenize(inputs, padding=True, truncation=True)
        
        outs = self.model.generate(
            input_ids=inputs['input_ids'].to(self.device), 
            attention_mask=inputs['attention_mask'].to(self.device), 
            max_length=32,
            num_beams=4,
        )
        
        questions = [self.tokenizer.decode(ids, skip_special_tokens=True) for ids in outs]
        return questions
    
    def _extract_answers(self, context):
        sents, inputs = self._prepare_inputs_for_ans_extraction(context)
        inputs = self._tokenize(inputs, padding=True, truncation=True)

        outs = self.ans_model.generate(
            input_ids=inputs['input_ids'].to(self.device), 
            attention_mask=inputs['attention_mask'].to(self.device), 
            max_length=32,
        )
        
        dec = [self.ans_tokenizer.decode(ids, skip_special_tokens=False) for ids in outs]
        answers = [item.split('<sep>') for item in dec]
        answers = [i[:-1] for i in answers]
        
        return sents, answers
    
    def _tokenize(self,
        inputs,
        padding=True,
        truncation=True,
        add_special_tokens=True,
        max_length=512
    ):
        inputs = self.tokenizer.batch_encode_plus(
            inputs, 
            max_length=max_length,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            padding="max_length" if padding else False,
            pad_to_max_length=padding,
            return_tensors="pt"
        )
        return inputs
    
    def _prepare_inputs_for_ans_extraction(self, text):
        sents = sent_tokenize(text)

        inputs = []
        for i in range(len(sents)):
            source_text = "extract answers:"
            for j, sent in enumerate(sents):
                if i == j:
                    sent = "<hl> %s <hl>" % sent
                source_text = "%s %s" % (source_text, sent)
                source_text = source_text.strip()
            
            if self.model_type == "t5":
                source_text = source_text + " </s>"
            inputs.append(source_text)

        return sents, inputs
    
    def _prepare_inputs_for_qg_from_answers_hl(self, sents, answers):
        inputs = []
        for i, answer in enumerate(answers):
            if len(answer) == 0: continue
            for answer_text in answer:
                sent = sents[i]
                sents_copy = sents[:]
                
                answer_text = answer_text.strip()
                
                ans_start_idx = sent.index(answer_text)
                
                sent = f"{sent[:ans_start_idx]} <hl> {answer_text} <hl> {sent[ans_start_idx + len(answer_text): ]}"
                sents_copy[i] = sent
                
                source_text = " ".join(sents_copy)
                source_text = f"generate question: {source_text}" 
                if self.model_type == "t5":
                    source_text = source_text + " </s>"
                
                inputs.append({"answer": answer_text, "source_text": source_text})
        
        return inputs
    
    def _prepare_inputs_for_qg_from_answers_prepend(self, context, answers):
        flat_answers = list(itertools.chain(*answers))
        examples = []
        for answer in flat_answers:
            source_text = f"answer: {answer} context: {context}"
            if self.model_type == "t5":
                source_text = source_text + " </s>"
            
            examples.append({"answer": answer, "source_text": source_text})
        return examples

    
class MultiTaskQAQGPipeline(QGPipeline):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
    
    def __call__(self, inputs: Union[Dict, str]):
        if type(inputs) is str:
            # do qg
            return super().__call__(inputs)
        else:
            # do qa
            return self._extract_answer(inputs["question"], inputs["context"])
    
    def _prepare_inputs_for_qa(self, question, context):
        source_text = f"question: {question}  context: {context}"
        if self.model_type == "t5":
            source_text = source_text + " </s>"
        return  source_text
    
    def _extract_answer(self, question, context):
        source_text = self._prepare_inputs_for_qa(question, context)
        inputs = self._tokenize([source_text], padding=False)
    
        outs = self.model.generate(
            input_ids=inputs['input_ids'].to(self.device), 
            attention_mask=inputs['attention_mask'].to(self.device), 
            max_length=16,
        )

        answer = self.tokenizer.decode(outs[0], skip_special_tokens=True)
        return answer


class E2EQGPipeline:
    def __init__(
        self,
        model: PreTrainedModel,
        tokenizer: PreTrainedTokenizer,
        use_cuda: bool
    ) :

        self.model = model
        self.tokenizer = tokenizer

        self.device = "cuda" if torch.cuda.is_available() and use_cuda else "cpu"
        self.model.to(self.device)

        assert self.model.__class__.__name__ in ["T5ForConditionalGeneration", "BartForConditionalGeneration"]
        
        if "T5ForConditionalGeneration" in self.model.__class__.__name__:
            self.model_type = "t5"
        else:
            self.model_type = "bart"
        
        self.default_generate_kwargs = {
            "max_length": 256,
            "num_beams": 4,
            "length_penalty": 1.5,
            "no_repeat_ngram_size": 3,
            "early_stopping": True,
        }
    
    def __call__(self, context: str, **generate_kwargs):
        inputs = self._prepare_inputs_for_e2e_qg(context)

        # TODO: when overrding default_generate_kwargs all other arguments need to be passsed
        # find a better way to do this
        if not generate_kwargs:
            generate_kwargs = self.default_generate_kwargs
        
        input_length = inputs["input_ids"].shape[-1]
        
        # max_length = generate_kwargs.get("max_length", 256)
        # if input_length < max_length:
        #     logger.warning(
        #         "Your max_length is set to {}, but you input_length is only {}. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=50)".format(
        #             max_length, input_length
        #         )
        #     )

        outs = self.model.generate(
            input_ids=inputs['input_ids'].to(self.device), 
            attention_mask=inputs['attention_mask'].to(self.device),
            **generate_kwargs
        )

        prediction = self.tokenizer.decode(outs[0], skip_special_tokens=True)
        questions = prediction.split("<sep>")
        questions = [question.strip() for question in questions[:-1]]
        return questions
    
    def _prepare_inputs_for_e2e_qg(self, context):
        source_text = f"generate questions: {context}"
        if self.model_type == "t5":
            source_text = source_text + " </s>"
        
        inputs = self._tokenize([source_text], padding=False)
        return inputs
    
    def _tokenize(
        self,
        inputs,
        padding=True,
        truncation=True,
        add_special_tokens=True,
        max_length=512
    ):
        inputs = self.tokenizer.batch_encode_plus(
            inputs, 
            max_length=max_length,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            padding="max_length" if padding else False,
            pad_to_max_length=padding,
            return_tensors="pt"
        )
        return inputs


SUPPORTED_TASKS = {
    "question-generation": {
        "impl": QGPipeline,
        "default": {
            "model": "valhalla/t5-small-qg-hl",
            "ans_model": "valhalla/t5-small-qa-qg-hl",
        }
    },
    "multitask-qa-qg": {
        "impl": MultiTaskQAQGPipeline,
        "default": {
            "model": "valhalla/t5-small-qa-qg-hl",
        }
    },
    "e2e-qg": {
        "impl": E2EQGPipeline,
        "default": {
            "model": "valhalla/t5-small-e2e-qg",
        }
    }
}

def pipeline(
    task: str,
    model: Optional = None,
    tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
    qg_format: Optional[str] = "highlight",
    ans_model: Optional = None,
    ans_tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None,
    use_cuda: Optional[bool] = True,
    **kwargs,
):
    # Retrieve the task
    if task not in SUPPORTED_TASKS:
        raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys())))

    targeted_task = SUPPORTED_TASKS[task]
    task_class = targeted_task["impl"]

    # Use default model/config/tokenizer for the task if no model is provided
    if model is None:
        model = targeted_task["default"]["model"]
    
    # Try to infer tokenizer from model or config name (if provided as str)
    if tokenizer is None:
        if isinstance(model, str):
            tokenizer = model
        else:
            # Impossible to guest what is the right tokenizer here
            raise Exception(
                "Impossible to guess which tokenizer to use. "
                "Please provided a PretrainedTokenizer class or a path/identifier to a pretrained tokenizer."
            )
    
    # Instantiate tokenizer if needed
    if isinstance(tokenizer, (str, tuple)):
        if isinstance(tokenizer, tuple):
            # For tuple we have (tokenizer name, {kwargs})
            tokenizer = AutoTokenizer.from_pretrained(tokenizer[0], **tokenizer[1])
        else:
            tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    
    # Instantiate model if needed
    if isinstance(model, str):
        model = AutoModelForSeq2SeqLM.from_pretrained(model)
    
    if task == "question-generation":
        if ans_model is None:
            # load default ans model
            ans_model = targeted_task["default"]["ans_model"]
            ans_tokenizer = AutoTokenizer.from_pretrained(ans_model)
            ans_model = AutoModelForSeq2SeqLM.from_pretrained(ans_model)
        else:
            # Try to infer tokenizer from model or config name (if provided as str)
            if ans_tokenizer is None:
                if isinstance(ans_model, str):
                    ans_tokenizer = ans_model
                else:
                    # Impossible to guest what is the right tokenizer here
                    raise Exception(
                        "Impossible to guess which tokenizer to use. "
                        "Please provided a PretrainedTokenizer class or a path/identifier to a pretrained tokenizer."
                    )
            
            # Instantiate tokenizer if needed
            if isinstance(ans_tokenizer, (str, tuple)):
                if isinstance(ans_tokenizer, tuple):
                    # For tuple we have (tokenizer name, {kwargs})
                    ans_tokenizer = AutoTokenizer.from_pretrained(ans_tokenizer[0], **ans_tokenizer[1])
                else:
                    ans_tokenizer = AutoTokenizer.from_pretrained(ans_tokenizer)

            if isinstance(ans_model, str):
                ans_model = AutoModelForSeq2SeqLM.from_pretrained(ans_model)
    
    if task == "e2e-qg":
        return task_class(model=model, tokenizer=tokenizer, use_cuda=use_cuda)
    elif task == "question-generation":
        return task_class(model=model, tokenizer=tokenizer, ans_model=ans_model, ans_tokenizer=ans_tokenizer, qg_format=qg_format, use_cuda=use_cuda)
    else:
        return task_class(model=model, tokenizer=tokenizer, ans_model=model, ans_tokenizer=tokenizer, qg_format=qg_format, use_cuda=use_cuda)

In [8]:
nlp =pipeline("question-generation", model="valhalla/t5-base-qg-hl")

In [10]:
f = open("/content/sh.txt")
f = f.read().strip().replace("\n","").replace("  ","")
text= f

In [11]:
# print(nlp("Day one: finding your breath.“Take a moment to listen to your breathing,” the recording of Dr. McCourt said, through the car’s speakers. “Don’t try to control it, just feel the air as it enters and exits your nose.”Garry slammed his brakes and his horn. Just like the driver ahead. Just like the driver behind.“Stop fucking honking at me asswipe!” he screamed over his shoulder. He could see the driver behind him, screaming himself red. Look at this guy! What an idiot.Somewhere a light turned green."))

In [12]:

from tqdm.notebook import tqdm_notebook
import uuid
# data = []
text_ = text.split('.')
tm = 0
for n,s in enumerate(tqdm_notebook(text_)):
  print(nlp(s))
  # if n == 2000:
  #   break
  # else:
    # print(n)
  # try:
  #   qa = nlp(s)
  #   print(qa)
  #   for i in qa:
  #     if str(i['answer']) in str(s):
  #       print([str(uuid.uuid4()).replace("-",""),s,i['question'],i['answer'],str(text.index(i['answer']))])
  #       # data.append([str(uuid.uuid4()).replace("-",""),s,i['question'],i['answer'],str(text.index(i['answer']))])
  #         # tm = tm+1
      
          
  # except:
  #   pass 

  
     
  # # break

  0%|          | 0/7165 [00:00<?, ?it/s]

  beam_id = beam_token_id // vocab_size


[{'answer': '1894', 'question': 'In what year was Ronald Adair murdered?'}, {'answer': 'Ronald Adair', 'question': 'Who was murdered in the spring of 1894?'}]
[{'answer': 'the prosecution was so overwhelmingly strong that it was not necessary to bring forward all the facts', 'question': 'Why was the case suppressed?'}]
[{'answer': 'nearly ten years', 'question': 'How long have I been able to supply the missing links that make up the whole remarkable chain?'}]
[{'answer': 'the inconceivable sequel', 'question': 'What was the greatest shock and surprise of any event in my adventurous life?'}, {'answer': 'the greatest shock and surprise of any event in my adventurous life', 'question': 'What did the sequel give me?'}]
[{'answer': 'joy, amazement, and incredulity', 'question': 'What kind of emotions do I feel after this long interval?'}]
[{'answer': 'withdrawn', 'question': 'What happened to the positive prohibition from his lips?'}]
[{'answer': 'indifferent success', 'question': "How did 

ValueError: ignored

In [13]:
data

{'answer_start': [],
 'answer_text': [],
 'answers': [],
 'context': [],
 'id': [],
 'question': []}

In [6]:
import pandas as pd
 
# intialise data of lists.
data = {'id':[i[0] for i in data],
        'context':[i[1] for i in data],
        'question':[i[2] for i in data],
        'answer_text':[i[3] for i in data],
        'answer_start':[i[4] for i in data],
        'answers':[{'answer_start':[int(i[4])],'text':[i[3]]} for i in data]
    
        }
 
# Create DataFrame
train = pd.DataFrame(data)
 
# Print the output.
# print(df)

In [12]:
train

Unnamed: 0.1,Unnamed: 0,id,context,question,answer_text,answer_start,answers


In [7]:
train.to_csv('file1.csv')

In [8]:
!cp /content/file1.csv /content/drive/MyDrive/

In [9]:
import pandas as pd
pd.read_csv('/content/file1.csv')

Unnamed: 0.1,Unnamed: 0,id,context,question,answer_text,answer_start,answers


In [None]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.2.2-py3-none-any.whl (346 kB)
[K     |████████████████████████████████| 346 kB 4.9 MB/s 
[?25hCollecting dill<0.3.5
  Downloading dill-0.3.4-py2.py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 7.7 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 76.4 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 7.3 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 73.9 MB/s 
Collecting

In [None]:
# !cd /content/drive/MyDrive/
!wget "https://storage.googleapis.com/kaggle-data-sets/1525850/2702665/compressed/deepset.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20220525%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20220525T101350Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=5667ed568b3a917617d205774480e6bfcc20cc61aea8533b5ed5bdaaa76fd86e36a3cc395bc2092d3b13539d57e31da7995dd00b0dc333ef6ec18c2cbb1c22245d89fcfcf9b6e8b13c35a8d82983d8321dd6625823fdb2582274e651926f93d580b12f0fd58608e9886aeded57a018ea47b510774627c04468630880e4dbbe4b5f29c25da95443df9f2cfa16d7a6ea9f46e884570afb7f3652583e7bca82c18dbd0805e25ad616916b3eec9edbde957d6b305e4c9b44bd556706184fac8541fc12160f15aef2f00d5de422a3a4ba029da4d7f4fce470afb17089a7bd3b49f7b5687861c3ff2ebe0e50afe42b25c6309927c0562f0b3af4830e730492d6c19314"

The name is too long, 767 chars total.
Trying to shorten...
New name is deepset.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com@kaggle-161607.iam.gserviceaccount.com%2F20220525%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20220525T101350Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=hos.
--2022-05-26 09:04:48--  https://storage.googleapis.com/kaggle-data-sets/1525850/2702665/compressed/deepset.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20220525%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20220525T101350Z&X-Goog-Expires=259199&X-Goog-SignedHeaders=host&X-Goog-Signature=5667ed568b3a917617d205774480e6bfcc20cc61aea8533b5ed5bdaaa76fd86e36a3cc395bc2092d3b13539d57e31da7995dd00b0dc333ef6ec18c2cbb1c22245d89fcfcf9b6e8b13c35a8d82983d8321dd6625823fdb2582274e651926f93d580b12f0fd58608e9886aeded57a018ea47b510774627c04468630880e4dbbe4b5f29c25da95443df9f2cfa16d7a6ea9f46e884570afb7f3652583e7bca82c18dbd0805e25ad61

In [10]:
# !pip uninstall transformers -y
# !pip install transformers
import pandas as pd
train = pd.read_csv('/content/file1.csv')

In [11]:
train

Unnamed: 0.1,Unnamed: 0,id,context,question,answer_text,answer_start,answers


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# from transformers import *
from transformers import default_data_collator, Trainer
from transformers import AutoTokenizer, TrainingArguments,AutoModelForQuestionAnswering
import tensorflow as tf
# import collection
from datasets import Dataset
import os
import sys
# sys.setdefaultencoding('utf-8')
# sys.setprofile('utf-8')
tokenizer = AutoTokenizer.from_pretrained('/content/deepset/xlm-roberta-large-squad2')

batch_size = 1
max_length = 10 
doc_stride = 12
pad_on_right = tokenizer.padding_side == "right"

def prepare_train_features(examples):
    # Some of the questions have lots of whitespace on the left, which is not useful and will make the
    # truncation of the context fail (the tokenized question will take a lots of space). So we remove that
    # left whitespace
    examples["question"] = [q.lstrip() for q in examples["question"]]

    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples


def convert_answers(r):
    start = r[0]
    text = r[1]
    return {
        'answer_start': [start],
        'text': [text]
    }

train = train.sample(frac=1, random_state=42)
train['answers'] = train[['answer_start', 'answer_text']].apply(convert_answers, axis=1)

# df_train = train[:-64].reset_index(drop=True)
# df_valid = train[-64:].reset_index(drop=True)

# train_dataset = Dataset.from_pandas(train)
# valid_dataset = Dataset.from_pandas(df_valid)

ImportError: ignored

In [None]:
import zipfile
with zipfile.ZipFile('/content/deepset.zip', 'r') as zip_ref:
    zip_ref.extractall()
# train

In [None]:
!pip uninstall datasets -y
# !pip install huggingface-hub==0.2.1

Found existing installation: datasets 1.7.0
Uninstalling datasets-1.7.0:
  Successfully uninstalled datasets-1.7.0


In [None]:
# !pip uninstall fsspec -qq -y
!pip install --no-index --find-links /content/wheels datasets -qq

In [None]:
train_dataset = Dataset.from_pandas(train)

NameError: ignored

In [None]:
train_dataset

Dataset({
    features: ['Unnamed: 0', 'id', 'context', 'question', 'answer_text', 'answer_start', 'answers', '__index_level_0__'],
    num_rows: 28
})

In [None]:
tokenized_train_ds = train_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)
tokenized_valid_ds = valid_dataset.map(prepare_train_features, batched=True, remove_columns=train_dataset.column_names)

AttributeError: ignored