<a href="https://colab.research.google.com/github/meti-94/OpenQA/blob/main/OpanQA_v2/Table%203.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing necessary libraries

In [1]:
!pip3 install 'transformers[torch]' -q
!pip install --upgrade openpyxl -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m45.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
# Fetch the OpenQA repository; the data-loading cell below reads the Excel
# files from its data/freebase directory under /content/OpenQA.
!git clone https://github.com/meti-94/OpenQA.git

Cloning into 'OpenQA'...
remote: Enumerating objects: 262, done.[K
remote: Counting objects: 100% (25/25), done.[K
remote: Compressing objects: 100% (20/20), done.[K
remote: Total 262 (delta 9), reused 14 (delta 3), pack-reused 237[K
Receiving objects: 100% (262/262), 102.62 MiB | 30.16 MiB/s, done.
Resolving deltas: 100% (112/112), done.


### Importing libraries

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoTokenizer, AutoModel, BertForSequenceClassification
from sklearn.metrics import classification_report
from sklearn import preprocessing

### Creating datasets

In [4]:
## Load the train / validation / test splits.
## X --> question texts (`Question` column, tokenized further down)
## y --> relation types (`relation_type` column), integer-encoded into `label`
df_train, df_valid, df_test = map(pd.read_excel, (
    '/content/OpenQA/data/freebase/train_useful_records.xlsx',
    '/content/OpenQA/data/freebase/valid_useful_records.xlsx',
    '/content/OpenQA/data/freebase/test_useful_records.xlsx',
))

# Fit a single encoder on the relation types of all three splits so that every
# split shares the same relation -> integer id mapping.
le = preprocessing.LabelEncoder()
le.fit([
    relation
    for frame in (df_train, df_valid, df_test)
    for relation in frame.relation_type.to_list()
])

# Attach the encoded label column to each split.
df_train['label'], df_valid['label'], df_test['label'] = (
    le.transform(frame.relation_type.to_list())
    for frame in (df_train, df_valid, df_test)
)

### Loading the BERT model and tokenizer

Loading the standard 12-layer BERT model (`bert-base-uncased`)

In [6]:

# Tokenizer and config are loaded from the same checkpoint as the classifier.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
config = AutoConfig.from_pretrained("bert-base-uncased")

# The classification head gets one output per class known to the label encoder.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(le.classes_),
)


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Creating the dataset objects for X-y

Converting the raw data into a conventional PyTorch `Dataset` class.

In [7]:
# Tokenize the questions of the training and validation splits with
# truncation and padding enabled (max_length=200).
train_encodings = tokenizer(
    df_train.Question.to_list(),
    max_length=200,
    padding=True,
    truncation=True,
)
val_encodings = tokenizer(
    df_valid.Question.to_list(),
    max_length=200,
    padding=True,
    truncation=True,
)

In [8]:
# convert raw text file to proper dataset object (based on task)
import torch

class ClassificationDataset(torch.utils.data.Dataset):
    """Dataset pairing per-example encodings with one label per example.

    Args:
        encodings: mapping from feature name to a sequence indexable by
            example position (the notebook passes the tokenizer output).
        labels: sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a ``'labels'`` entry."""
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings together with its integer labels.
train_dataset = ClassificationDataset(
    encodings=train_encodings,
    labels=df_train.label.to_list(),
)
valid_dataset = ClassificationDataset(
    encodings=val_encodings,
    labels=df_valid.label.to_list(),
)

Fine-tuning for 5 epochs, evaluating on the validation set after each epoch

In [10]:
# transformers API for train :)
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

import inspect

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases. The install cell does not pin a version, so use whichever keyword
# the installed TrainingArguments accepts instead of failing with an
# unexpected-keyword TypeError on a fresh runtime.
eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

# Warmup steps, weight decay and logging frequency are left at library defaults.
training_args = TrainingArguments(
    output_dir='./results',           # output directory
    num_train_epochs=5,               # total number of training epochs
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,   # batch size for evaluation
    learning_rate=1e-4,
    adam_epsilon=1e-8,
    logging_dir='./logs',             # directory for storing logs
    do_eval=True,
    **{eval_strategy_kwarg: 'epoch'},  # evaluate on eval_dataset once per epoch
)

trainer = Trainer(
    model=model,                  # the instantiated 🤗 Transformers model to be trained
    args=training_args,           # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=valid_dataset,   # evaluation dataset
)

# Kept as the last expression so the cell displays the returned TrainOutput.
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.5779,1.21549
2,1.0835,0.914546
3,0.7747,0.814943
4,0.6128,0.792034
5,0.4914,0.794632


TrainOutput(global_step=2960, training_loss=1.0004456185005806, metrics={'train_runtime': 2224.7275, 'train_samples_per_second': 170.106, 'train_steps_per_second': 1.331, 'total_flos': 7709391761076720.0, 'train_loss': 1.0004456185005806, 'epoch': 5.0})

# Evaluating the Model

In [11]:
# Tokenize the test questions with the same settings as the other splits and
# wrap them, together with their labels, in the dataset class.
test_encodings = tokenizer(
    df_test.Question.to_list(),
    max_length=200,
    padding=True,
    truncation=True,
)
test_dataset = ClassificationDataset(
    encodings=test_encodings,
    labels=df_test.label.to_list(),
)

Predicting on the test set

In [12]:
# Run the trained model over the test set; the metrics cell below reads
# `pred.predictions` and `pred.label_ids` from the result.
pred = trainer.predict(test_dataset)

Precision, recall and F1 (micro and macro averaged) and accuracy on the test set

In [17]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, f1_score

# Take the arg-max over the last axis of the raw predictions once and reuse
# the resulting class ids for every metric.
test_labels = pred.label_ids
test_predictions = pred.predictions.argmax(-1)

# Micro-averaged scores.
print('Micro Precision', precision_score(test_labels, test_predictions, average='micro'))
print('Micro Recall', recall_score(test_labels, test_predictions, average='micro'))
print('Micro F1 score', f1_score(test_labels, test_predictions, average='micro'))

# Macro-averaged scores; zero_division=0 is passed explicitly for these.
print('Macro Precision', precision_score(test_labels, test_predictions, average='macro', zero_division=0))
print('Macro Recall', recall_score(test_labels, test_predictions, average='macro', zero_division=0))
print('Macro F1 score', f1_score(test_labels, test_predictions, average='macro', zero_division=0))

print('Accuracy', accuracy_score(test_labels, test_predictions))


Micro Precision 0.8235811092094917
Micro Recall 0.8235811092094917
Micro F1 score 0.8235811092094917
Macro Precision 0.35897562221020357
Macro Recall 0.38914097638350853
Macro F1 score 0.3596161824347952
Accuracy 0.8235811092094917
