In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



In [4]:
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# training CSV stored under MyDrive can be read by path in the next cell.
# force_remount=True is passed so the mount is (presumably) refreshed even when
# a previous mount already exists -- confirm against the Colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [8]:
# Load the raw question-answering examples from the CSV file on the mounted drive.
data = pd.read_csv("/content/drive/MyDrive/DevRev/train_data.csv")

# Lower-casing is currently disabled; the raw text is used as-is.
# data["question"] = data["Question"].str.lower()
# data["paragraph"] = data["Paragraph"].str.lower()

# Bundle each row's answer offset and answer text into one dict per example.
# The two columns are walked in lockstep instead of doing a label lookup on both
# columns for every row, so this is linear and does not rely on the frame having
# a default 0..n-1 index.
data['answers'] = [
    {'answer_start': answer_start, 'text': answer_text}
    for answer_start, answer_text in zip(data['Answer_start'], data['Answer_text'])
]

# No train/test split happens here: every row is used for training.
train_contexts, train_questions, train_answers = data['Paragraph'], data['Question'], data['answers']

  exec(code_obj, self.user_global_ns, self.user_ns)


In [9]:
def add_end_idx(answers, contexts):
    """Normalise each answer's character span in place.

    ``answers`` and ``contexts`` are iterated in lockstep.  Every answer dict is
    expected to hold its raw values as strings in list-literal form:

    * ``answer['answer_start']`` -- a bracketed offset such as ``"[42]"``;
      ``"[]"`` (or anything non-numeric between the brackets) means that no
      usable offset is available.
    * ``answer['text']`` -- the answer wrapped as ``"['some answer']"``.

    After the call every dict has:

    * ``answer['answer_start']`` -- the integer offset at which the answer was
      found in its context; the parsed offset when it was not found; or the
      untouched raw value when no offset could be parsed.
    * ``answer['answer_end']`` -- the exclusive end offset of the answer, or
      ``None`` when the answer could not be located.

    The stated offset is tried first, then one and two characters earlier,
    because the recorded offsets are sometimes off by a character or two.

    Args:
        answers: iterable of answer dicts (mutated in place).
        contexts: iterable of context strings, parallel to ``answers``.

    Returns:
        None.
    """
    # Number of characters that ``text`` adds around the real answer: [' and '].
    wrapper_len = len("['']")

    for answer, context in zip(answers, contexts):
        gold_text = answer['text']
        raw_start = answer['answer_start']

        # Strip the surrounding brackets.  isdecimal() is used instead of
        # isnumeric() so that only digit strings int() can parse get through
        # (isnumeric() also accepts characters such as a vulgar fraction, which
        # would make int() raise ValueError).
        offset_digits = str(raw_start)[1:-1]
        if not offset_digits.isdecimal() or not isinstance(gold_text, str):
            # No usable offset or no answer text: flag the example as not found
            # and leave the raw start value untouched.
            answer['answer_start'] = raw_start
            answer['answer_end'] = None
            continue

        start_idx = int(offset_digits)
        span_len = len(gold_text) - wrapper_len

        # Default outcome: answer not found at (or just before) the stated offset.
        answer['answer_start'] = start_idx
        answer['answer_end'] = None

        for shift in (0, 1, 2):
            begin = start_idx - shift
            if begin < 0:
                # A negative slice start would wrap around to the end of the
                # context and could "match" there, yielding a negative offset.
                break
            if f"['{context[begin:begin + span_len]}']" == gold_text:
                answer['answer_start'] = begin
                answer['answer_end'] = begin + span_len
                break

# No validation answers/contexts are defined in the visible cells, so the
# validation call stays disabled.
#add_end_idx(val_answers, val_contexts)
# Annotate every training answer dict in place with its normalised
# 'answer_start' and its 'answer_end' (None when the answer was not located).
add_end_idx(train_answers, train_contexts)

In [10]:
# Row labels of the training examples whose answer span could not be located
# (their 'answer_end' is None).
ind = [
    row
    for row in range(len(train_answers))
    if train_answers[row]['answer_end'] is None
]

In [11]:
# Remove the rows listed in `ind` from all three parallel series (in place) so
# that answers, questions and contexts stay aligned with one another.
for parallel_series in (train_answers, train_questions, train_contexts):
    parallel_series.drop(ind, inplace=True)


In [12]:
# Renumber the three parallel series from 0 (in place), discarding the old
# labels, so that positional and label-based lookups agree again.
for parallel_series in (train_answers, train_questions, train_contexts):
    parallel_series.reset_index(drop=True, inplace=True)

In [13]:
# First 5000 examples of each series.
# NOTE(review): the tokenisation cell below encodes the full series, not these
# subsets -- confirm whether the subsets are still needed.
train_answers_1, train_questions_1, train_contexts_1 = (
    full_series[:5000]
    for full_series in (train_answers, train_questions, train_contexts)
)

In [15]:
! pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 29.2 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 81.6 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 68.3 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [16]:
from transformers import DistilBertTokenizerFast
# Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint.
# The encodings it produces are queried with char_to_token further down to turn
# character offsets into token positions.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [17]:
# Encode every context/question pair in one batched call, with the context as
# the first sequence and the question as the second; truncation and padding are
# both enabled.
# NOTE(review): add_token_positions calls char_to_token without a sequence
# index, so it presumably relies on character offsets referring to the first
# sequence (the context) -- confirm against the tokenizer documentation.
train_encodings = tokenizer(train_contexts.to_list(), train_questions.to_list(), truncation=True, padding=True)


In [18]:
def add_token_positions(encodings, answers, truncated_position=None):
    """Attach token-level answer labels to ``encodings`` in place.

    For every answer that has a non-``None`` ``'answer_end'``, the character
    offsets ``answer['answer_start']`` and ``answer['answer_end'] - 1``
    (``answer_end`` is treated as exclusive) are converted to token indices
    with ``encodings.char_to_token``.  The resulting lists are stored on
    ``encodings`` under ``'start_positions'`` and ``'end_positions'``.

    Args:
        encodings: object providing ``char_to_token(batch_index, char_index)``
            and ``update(mapping)``; example ``i`` must correspond to the
            ``i``-th item of ``answers``.
        answers: sequence of answer dicts with ``'answer_start'`` and
            ``'answer_end'`` keys.
        truncated_position: label to use when ``char_to_token`` returns
            ``None`` (assumed to mean the answer was truncated away).  Defaults
            to the module-level ``tokenizer.model_max_length``, which is only
            looked up when it is actually needed.

    Returns:
        None.

    Note:
        Answers whose ``'answer_end'`` is ``None`` are skipped and get no
        label, so the label lists only line up with ``encodings`` when such
        rows have been removed beforehand.
    """
    start_positions = []
    end_positions = []

    # Walk the answers by position: the i-th answer must pair with the i-th
    # encoding, which a label lookup (answers[i]) only guarantees for a
    # 0..n-1 index.
    for i, answer in enumerate(answers):
        if answer['answer_end'] is None:
            continue

        start_token = encodings.char_to_token(i, answer['answer_start'])
        end_token = encodings.char_to_token(i, answer['answer_end'] - 1)

        if start_token is None or end_token is None:
            fallback = (
                tokenizer.model_max_length
                if truncated_position is None
                else truncated_position
            )
            if start_token is None:
                start_token = fallback
            if end_token is None:
                end_token = fallback

        start_positions.append(start_token)
        end_positions.append(end_token)

    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# Adds 'start_positions' and 'end_positions' to train_encodings in place.
add_token_positions(train_encodings, train_answers)
# No validation encodings/answers are defined in the visible cells, so the
# validation call stays disabled.
#add_token_positions(val_encodings, val_answers)

In [19]:
# NOTE(review): this loads the TensorFlow question-answering model, but `model`
# is rebound to the PyTorch DistilBertForQuestionAnswering in a later cell and
# the training loop is written against torch, so this instance looks unused in
# the visible code -- confirm and consider dropping this cell.
from transformers import TFDistilBertForQuestionAnswering
model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForQuestionAnswering: ['vocab_projector', 'activation_13', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs', 'dropout_19']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    """Map-style torch dataset backed by a tokenizer encoding.

    ``encodings`` must expose ``items()`` yielding ``(name, per-example values)``
    pairs and an ``input_ids`` attribute whose length is the number of examples.
    """

    def __init__(self, encodings):
        # Kept by reference; nothing is copied or converted up front.
        self.encodings = encodings

    def __len__(self):
        # One entry of input_ids per example.
        input_ids = self.encodings.input_ids
        return len(input_ids)

    def __getitem__(self, idx):
        # Build one example: every field of the encoding, indexed at `idx`
        # and converted to a tensor.
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        return sample

# Wrap the training encodings (including the answer position labels added to
# them above) so a DataLoader can draw examples from them.
train_dataset = SquadDataset(train_encodings)
# No validation encodings are defined in the visible cells, so this stays disabled.
#val_dataset = SquadDataset(val_encodings)

In [21]:
from transformers import DistilBertForQuestionAnswering
# Load the PyTorch DistilBERT question-answering model from the
# 'distilbert-base-uncased' checkpoint. This rebinds `model`, replacing the
# TensorFlow instance created in the earlier cell. Per the load log below, the
# qa_outputs weights are newly initialised, so the model has to be trained
# before it is useful for prediction.
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this mode

In [None]:
from torch.utils.data import DataLoader
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
from torch.optim import AdamW

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# eps and weight_decay are set explicitly to the former transformers.AdamW
# defaults (1e-6 and 0.0) to stay close to the previous optimiser behaviour;
# torch.optim.AdamW would otherwise default to eps=1e-8 and weight_decay=1e-2.
optim = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

s = 0  # optimisation steps taken so far, counted across all epochs
t = 0  # epochs completed so far
for epoch in range(3):
    print('t', t)
    for batch in train_loader:
        # Progress marker every 50 steps.
        if s % 50 == 0:
            print(s)
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
        # The first output is used as the training loss (the model is called
        # with the start/end position labels).
        loss = outputs[0]
        loss.backward()
        optim.step()
        s += 1
    t += 1

# Switch back to evaluation mode once training is done.
model.eval()



t 0
0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
