<a href="https://colab.research.google.com/github/meti-94/OpenQA/blob/main/Freebase_Relation_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Installing necessary libraries 

In [None]:
!pip3 install 'transformers[torch]' -q
!pip install --upgrade openpyxl -q 

In [1]:
!git clone https://github.com/meti-94/OpenQA.git

Cloning into 'OpenQA'...
remote: Enumerating objects: 231, done.[K
remote: Counting objects: 100% (231/231), done.[K
remote: Compressing objects: 100% (195/195), done.[K
remote: Total 231 (delta 95), reused 130 (delta 29), pack-reused 0[K
Receiving objects: 100% (231/231), 102.61 MiB | 33.97 MiB/s, done.
Resolving deltas: 100% (95/95), done.


### Importing libraries

In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoTokenizer, AutoModel, BertForSequenceClassification
from sklearn.metrics import classification_report
from sklearn import preprocessing

### Creating datasets

In [4]:
## loading data
## X --> Texts
## y --> Labels
le = preprocessing.LabelEncoder()
df_train = pd.read_excel('/content/OpenQA/data/freebase/train_useful_records.xlsx')
df_valid = pd.read_excel('/content/OpenQA/data/freebase/valid_useful_records.xlsx')
df_test = pd.read_excel('/content/OpenQA/data/freebase/test_useful_records.xlsx')
le.fit(df_train.relation_type.to_list()+df_valid.relation_type.to_list()+df_test.relation_type.to_list())
df_train['label'] = le.transform(df_train.relation_type.to_list())
df_valid['label'] = le.transform(df_valid.relation_type.to_list())
df_test['label'] = le.transform(df_test.relation_type.to_list())

### Loading BERT model and tokenizer model

Loading the Standard 12L Bert model 

In [10]:

config = AutoConfig.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("/content/drive/MyDrive/data_freebase/classifier", num_labels=len(le.classes_))


# Creating dataset and dataloader for X-y

converting raw data into conventional PyTorch Dataset class

In [11]:
train_encodings = tokenizer(df_train.Question.to_list(), truncation=True, padding=True, max_length=200)
val_encodings = tokenizer(df_valid.Question.to_list(), truncation=True, padding=True, max_length=200)

In [12]:
# convert raw text file to proper dataset object (based on task)
import torch

class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        # initialization
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # slicing method X[index]
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ClassificationDataset(train_encodings, df_train.label.to_list())
valid_dataset = ClassificationDataset(val_encodings, df_valid.label.to_list())

Performing training for 5 epochs 

In [13]:
# transformers API for train :)
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=5,              # total number of training epochs
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,   # batch size for evaluation
    # warmup_steps=500,                # number of warmup steps for learning rate scheduler
    # weight_decay=0.01,               # strength of weight decay
    learning_rate= 1e-4,
    adam_epsilon = 1e-8, 
    logging_dir='./logs',            # directory for storing logs
    # logging_steps=10,
    do_eval=True,
    evaluation_strategy = 'epoch'
    
    
)
# training_args.evaluation_strategy = EvaluationStrategy.EPOCH
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset

)

# trainer.train()

# Evaluating the Model 

In [14]:
# reading Test Data
test_encodings = tokenizer(df_test.Question.to_list(), truncation=True, padding=True, max_length=200)
test_dataset = ClassificationDataset(test_encodings, df_test.label.to_list())

Predicting on test set

In [28]:
pred = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 21619
  Batch size = 128


Accuracy

In [29]:
pred.predictions.argmax(-1)[:10]
pred.label_ids

array([1318, 1320, 1449, ...,   58,  971, 1447])

In [30]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, f1_score

print('Micro Precision', precision_score(pred.label_ids, pred.predictions.argmax(-1), average='micro'))
print('Micro Recall', recall_score(pred.label_ids, pred.predictions.argmax(-1), average='micro'))
print('Micro F1 score', f1_score(pred.label_ids, pred.predictions.argmax(-1), average='micro'))
print('Macro Precision', precision_score(pred.label_ids, pred.predictions.argmax(-1), average='macro'))
print('Macro Recall', recall_score(pred.label_ids, pred.predictions.argmax(-1), average='macro'))
print('Macro F1 score', f1_score(pred.label_ids, pred.predictions.argmax(-1), average='macro'))
print('Accuracy', accuracy_score(pred.label_ids, pred.predictions.argmax(-1)))


Micro Precision 0.824922521855775
Micro Recall 0.824922521855775
Micro F1 score 0.824922521855775
Macro Precision 0.37099581877790233
Macro Recall 0.39488840388214586
Macro F1 score 0.3681853770777389
Accuracy 0.824922521855775


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
