In [1]:
import pandas as pd
import numpy as np

# Display the full output in this notebook
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
!pip install torch
!pip install transformers
import ast

[1m
         .:::.     .::.       
        ....yy:    .yy.       
        :.  .yy.    y.        
             :y:   .:         
             .yy  .:          
              yy..:           
              :y:.            
              .y.             
             .:.              
        ....:.                
        :::.                  
[0;33m
• Project files and data should be stored in /project. This is shared among everyone
  in the project.
• Personal files and configuration should be stored in /home/faculty.
• Files outside /project and /home/faculty will be lost when this server is terminated.
• Create custom environments to setup your servers reproducibly.
[0m
Collecting torch
  Using cached torch-1.11.0-cp38-cp38-manylinux1_x86_64.whl (750.6 MB)
Installing collected packages: torch
Successfully installed torch-1.11.0
[1m
         .:::.     .::.       
        ....yy:    .yy.       
        :.  .yy.    y.        
             :y:   .:         
             .yy  .:     

In [3]:
df = pd.read_csv("/project/question-answers-processed/fin-ba-processed-combined.csv")

In [4]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,question,answers,context_file,answer_start,answer_end,context
0,0,What is the fee for Business Analytics?,"UK - £19,400, and Overseas - £35,100",corpus/business-analytics/1.txt,63.0,99.0,Start date: September 2022 Duration: 12 months...
1,1,What is the fee for MSc of Business Analytics?,"UK - £19,400, and Overseas - £35,100",corpus/business-analytics/1.txt,63.0,99.0,Start date: September 2022 Duration: 12 months...
2,2,What's the masters fee for business analytics?,"UK - £19,400, and Overseas - £35,100",corpus/business-analytics/1.txt,63.0,99.0,Start date: September 2022 Duration: 12 months...


In [5]:
from sklearn.model_selection import train_test_split

df_train, df_dev = train_test_split(df, test_size=0.2, random_state=42)

# Split df_dev into a validation and test set
df_dev, df_test = df_dev[:205], df_dev[205:]

In [6]:
len(df_train)
len(df_dev)
len(df_test)

1633

205

204

In [7]:
from transformers import BertForQuestionAnswering, AutoTokenizer

model_name = 'deepset/bert-base-cased-squad2'


model = BertForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
train = tokenizer(list(df_train["context"]), list(df_train["question"]),
                  truncation=True, padding='max_length',
                  max_length=512, return_tensors='pt')

In [9]:
from tqdm.auto import tqdm  # for showing progress bar

In [10]:
tokenizer.decode(train['input_ids'][0])

'[CLS] MSc Finance students are expected to have basic knowledge in financial mathematics and econometrics, and should be motivated to take their knowledge to the next level. To get to that next level, we expect a great deal from our students, so if you choose to study with us, you can expect to be working hard, challenging yourself as we challenge you, and regularly finding yourself out of your comfort zone. [SEP] Are relevant work experiences required to be eligible for admission for the master program in Finance? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD

In [11]:
df_train = df_train.astype({"answer_start": int, "answer_end": int})

In [12]:
train_start_pos = df_train['answer_start'].tolist()
train_end_pos = df_train['answer_end'].tolist()

In [13]:
# Import the start_position & end_position to the dict
train.update({'start_positions': train_start_pos, 'end_positions': train_end_pos})

In [14]:
# Check the keys in the dict
train.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'])

In [15]:
# Train the model using Pytorch
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# build datasets for both our training data
train_dataset = SquadDataset(train)

In [16]:
# Feed our train dataset
loader = torch.utils.data.DataLoader(train_dataset,
                                     batch_size=32,
                                     shuffle=True)

In [17]:
from transformers import AdamW

# device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device = torch.device('cpu')
model.to(device)
model.train()
optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [18]:
for epoch in range(4):
    loop = tqdm(loader)
    
    
    for batch in loop:
        optim.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
#         print('inputid', input_ids)
#         print('inputid', input_ids.shape)
#         print('attnm', attention_mask)
#         print('startpos', start_positions)
#         print('startpos', end_positions)

        outputs = model(input_ids, attention_mask=attention_mask, 
                        start_positions=start_positions,
                        end_positions=end_positions)
        
        loss = outputs[0]
        loss.sum().backward()
        optim.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.sum().item())

  0%|          | 0/52 [00:00<?, ?it/s]

  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


  0%|          | 0/52 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

  0%|          | 0/52 [00:00<?, ?it/s]

## Testing on the dev model

In [19]:
from transformers import pipeline
nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)

In [20]:
df_dev['question'].values[0]

'Do I need basic finance knowledge for the Finance master?'

In [21]:
df_dev.reset_index(drop=True, inplace=True)

In [22]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

nlp('how are u', 'I am good today')

{'score': 0.12977248430252075,
 'start': 2,
 'end': 15,
 'answer': 'am good today'}

In [23]:
nlp(df_dev.loc[1]['question'], df_dev.loc[1]['context'])['score']

6.295947241596878e-05

In [24]:
df_dev

Unnamed: 0.1,Unnamed: 0,question,answers,context_file,answer_start,answer_end,context
0,1316,Do I need basic finance knowledge for the Fina...,MSc Finance students are expected to have basi...,corpus/finance/26.txt,0.0,167.0,MSc Finance students are expected to have basi...
1,1000,"For MSc Finance, what is the structure of the ...",All participants study four core modules in Te...,corpus/finance/3.txt,0.0,644.0,All participants study four core modules in Te...
2,1068,What is the requirement for GMAR or GRE exam f...,We do not require GMAT/GRE scores,corpus/finance/52.txt,378.0,411.0,Qualifications which are requested to support ...
3,1969,"For the master program in Finance, who should ...",the Programme Administrator via email at: mgmt...,corpus/finance/54.txt,76.0,140.0,For further information regarding the MSc Fina...
4,529,What information do I need to provide for my M...,You will need to include a degree transcript i...,corpus/business-analytics/105.txt,0.0,232.0,You will need to include a degree transcript i...
...,...,...,...,...,...,...,...
200,367,Is there a contact email for potential candita...,the Programme Administrator via: mgmt-ba@ucl.a...,corpus/business-analytics/73.txt,76.0,126.0,For further information regarding the Business...
201,757,Does my bachelor degree has to be highly numer...,we stress to our applicants that this programm...,corpus/finance/28.txt,84.0,227.0,While we do welcome students from a wide varie...
202,1985,What is the structure of the program for the m...,All participants study four core modules in Te...,corpus/finance/3.txt,0.0,644.0,All participants study four core modules in Te...
203,1582,"For the master program in Finance, what is the...","UK/EU/Overseas - £38,000",corpus/finance/1.txt,63.0,87.0,Start date: September 2022 Duration: 12 months...


In [25]:
result_df = pd.DataFrame(columns=['question', 'answer', 'model_answer', 'confidence_score', 'start_index', 'end_index'])

for i in range(df_dev.shape[0]):
    output = nlp(df_dev['question'].values[i], df_dev['context'].values[i])
    output_df = pd.DataFrame({'question': df_dev['question'].values[i], 'answer': df_dev['answers'].values[i],'model_answer': output['answer'], 'confidence_score': output['score'], 'start_index':output['start'], 'end_index':output['end']}, index=[i])
    
    result_df = pd.concat([result_df, output_df])

In [26]:
result_df.to_csv('dev-set-verification/bert-base-fine-tune-1.csv')

result_df

Unnamed: 0,question,answer,model_answer,confidence_score,start_index,end_index
0,Do I need basic finance knowledge for the Fina...,MSc Finance students are expected to have basi...,basic knowledge in financial mathematics and e...,0.000015,42,100
1,"For MSc Finance, what is the structure of the ...",All participants study four core modules in Te...,"elective portfolio is offered, with four elective",0.000018,220,269
2,What is the requirement for GMAR or GRE exam f...,We do not require GMAT/GRE scores,GMAT/GRE,0.030415,396,404
3,"For the master program in Finance, who should ...",the Programme Administrator via email at: mgmt...,Programme Administrator via email at,0.000041,80,116
4,What information do I need to provide for my M...,You will need to include a degree transcript i...,evidence of their English language ability,0.000285,163,205
...,...,...,...,...,...,...
200,Is there a contact email for potential candita...,the Programme Administrator via: mgmt-ba@ucl.a...,the UCL Postgraduate Admissions Webpage,0.000651,181,220
201,Does my bachelor degree has to be highly numer...,we stress to our applicants that this programm...,prepare for.,0.024388,442,454
202,What is the structure of the program for the m...,All participants study four core modules in Te...,"An elective portfolio is offered, with four el...",0.000007,217,269
203,"For the master program in Finance, what is the...","UK/EU/Overseas - £38,000",Minimum of 2:1 or equivalent in a relevant dis...,0.000951,254,307


In [27]:
result_df['comparison'] = np.where(result_df['answer'] == result_df['model_answer'] , 'True', 'False')

result_df

Unnamed: 0,question,answer,model_answer,confidence_score,start_index,end_index,comparison
0,Do I need basic finance knowledge for the Fina...,MSc Finance students are expected to have basi...,basic knowledge in financial mathematics and e...,0.000015,42,100,False
1,"For MSc Finance, what is the structure of the ...",All participants study four core modules in Te...,"elective portfolio is offered, with four elective",0.000018,220,269,False
2,What is the requirement for GMAR or GRE exam f...,We do not require GMAT/GRE scores,GMAT/GRE,0.030415,396,404,False
3,"For the master program in Finance, who should ...",the Programme Administrator via email at: mgmt...,Programme Administrator via email at,0.000041,80,116,False
4,What information do I need to provide for my M...,You will need to include a degree transcript i...,evidence of their English language ability,0.000285,163,205,False
...,...,...,...,...,...,...,...
200,Is there a contact email for potential candita...,the Programme Administrator via: mgmt-ba@ucl.a...,the UCL Postgraduate Admissions Webpage,0.000651,181,220,False
201,Does my bachelor degree has to be highly numer...,we stress to our applicants that this programm...,prepare for.,0.024388,442,454,False
202,What is the structure of the program for the m...,All participants study four core modules in Te...,"An elective portfolio is offered, with four el...",0.000007,217,269,False
203,"For the master program in Finance, what is the...","UK/EU/Overseas - £38,000",Minimum of 2:1 or equivalent in a relevant dis...,0.000951,254,307,False


In [28]:
result_df.comparison.value_counts()

False    204
True       1
Name: comparison, dtype: int64

In [29]:
import collections

# get tokens from text; just by splitting by spces
def get_simple_tokens(text):
    tokens = [token.strip() for token in text.split()]
    
    return tokens

# calculate f1 score for a single prediction-answer pair
def get_f1(pred, answer):
    pred_tokens = get_simple_tokens(pred)
    ans_tokens = get_simple_tokens(answer)
    
    common_tokens = collections.Counter(pred_tokens) & collections.Counter(ans_tokens)
    common_tokens_n = sum(common_tokens.values())
    
    if common_tokens_n == 0:
        return 0
    
    precision = 1.0 * common_tokens_n/len(pred_tokens)
    recall = 1.0 * common_tokens_n/len(ans_tokens)
    
    f1 = 2*((precision*recall)/(precision+recall))
    
    return f1

In [30]:
# loop through all answers, for df use for i in range(result_df.shape[0])

f1_scores = []
for i in range(result_df.shape[0]):
    f1_scores.append(get_f1(result_df['model_answer'].values[i], result_df['answer'].values[i]))
    
result_df['f1_score'] = f1_scores

In [31]:
result_df['f1_score'].mean()

0.22963281697125187

In [32]:
# Plot the training and validation losses
fig = plt.figure(figsize=(18, 8))
x = list(range(1, 1+len(train_losses)))

plt.plot(x, train_losses, 'b', linewidth=2)
plt.plot(x, valid_losses, 'r', linewidth=2)
plt.xlabel('Epoch #')
plt.ylabel('Binary Cross-Entropy Loss')
plt.legend(('Training Loss', 'Validation Loss'))
plt.show()

NameError: name 'plt' is not defined