<a href="https://colab.research.google.com/github/meti-94/OpenQA/blob/main/relation_classification_with_bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install necessary libraries 

In [1]:
# Install main transformers library
!pip install transformers -q
!rm -rf OpenQA/
!git clone https://github.com/meti-94/OpenQA.git

Cloning into 'OpenQA'...
remote: Enumerating objects: 132, done.[K
remote: Counting objects: 100% (132/132), done.[K
remote: Compressing objects: 100% (104/104), done.[K
remote: Total 132 (delta 38), reused 104 (delta 21), pack-reused 0[K
Receiving objects: 100% (132/132), 88.93 MiB | 32.30 MiB/s, done.
Resolving deltas: 100% (38/38), done.


# Import libraries

In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoTokenizer, AutoModel, BertForSequenceClassification
from sklearn.metrics import classification_report
from sklearn import preprocessing

# load up data to classifiy

In [3]:
## loading data
## X --> Texts
## y --> Labels
le = preprocessing.LabelEncoder()
df_train = pd.read_excel('/content/OpenQA/data/freebase/train_useful_records.xlsx')
df_valid = pd.read_excel('/content/OpenQA/data/freebase/valid_useful_records.xlsx')
df_test = pd.read_excel('/content/OpenQA/data/freebase/test_useful_records.xlsx')
le.fit(df_train.relation_type.to_list()+df_valid.relation_type.to_list()+df_test.relation_type.to_list())
df_train['label'] = le.transform(df_train.relation_type.to_list())
df_valid['label'] = le.transform(df_valid.relation_type.to_list())
df_test['label'] = le.transform(df_test.relation_type.to_list())

In [4]:
df_train.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,token_matrix,Question,tokenized_question,Answer,first_entity_ids,second_entity_ids,relation_type,entity,answer_mid,relation_span,label
0,0,0,"[101, 2054, 2003, 1996, 2338, 1041, 2055, 102]",what is the book e about,"['[CLS]', 'e', '[SEP]']",fb:m.04whkz5,"[101, 1041, 102]","[101, 102]",fb:book.written_work.subjects,e,fb:m.04whkz5,"[0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",642
1,1,1,"[101, 2000, 2054, 2713, 2515, 1996, 2713, 2650...",to what release does the release track cardiac...,"['[CLS]', 'cardiac', 'arrest', '[SEP]']",fb:m.0tp2p24,"[101, 15050, 6545, 102]","[101, 102]",fb:music.release_track.release,cardiac arrest,fb:m.0tp2p24,"[0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, ...",1391
2,2,2,"[101, 2054, 2406, 2001, 1996, 2143, 1996, 7016...",what country was the film the debt from,"['[CLS]', 'the', 'debt', '[SEP]']",fb:m.04j0t75,"[101, 1996, 7016, 102]","[101, 102]",fb:film.film.country,the debt,fb:m.04j0t75,"[0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...",957
3,3,3,"[101, 2054, 2774, 2031, 2053, 8569, 2080, 1057...",what songs have nobuo uematsu produced ?,"['[CLS]', 'no', '##bu', '##o', 'u', '##ema', '...",fb:m.0ftqr,"[101, 2053, 8569, 2080, 1057, 14545, 10422, 102]","[101, 102]",fb:music.producer.tracks_produced,nobuo uematsu,fb:m.0ftqr,"[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...",1371
4,4,4,"[101, 2040, 2550, 6574, 1011, 19330, 13700, 10...",who produced eve-olution ?,"['[CLS]', 'eve', '-', 'ol', '##ution', '[SEP]']",fb:m.036p007,"[101, 6574, 1011, 19330, 13700, 102]","[101, 102]",fb:music.release.producers,eve-olution,fb:m.036p007,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",1386


# Loading BERT model and tokenizer model

In [5]:

# v1.0
# config = AutoConfig.from_pretrained("prajjwal1/bert-tiny")
# tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
# model = BertForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=len(le.classes_))
config = AutoConfig.from_pretrained("bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(le.classes_))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# Creating dataset and dataloader for X-y

In [6]:
train_encodings = tokenizer(df_train.Question.to_list(), truncation=True, padding=True, max_length=200)
val_encodings = tokenizer(df_valid.Question.to_list(), truncation=True, padding=True, max_length=200)

In [7]:
# convert raw text file to proper dataset object (based on task)
import torch

class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        # initialization
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # slicing method X[index]
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ClassificationDataset(train_encodings, df_train.label.to_list())
valid_dataset = ClassificationDataset(val_encodings, df_valid.label.to_list())

In [8]:
# transformers API for train :)
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=15,              # total number of training epochs
    per_device_train_batch_size=128,  # batch size per device during training
    per_device_eval_batch_size=128,   # batch size for evaluation
    # warmup_steps=500,                # number of warmup steps for learning rate scheduler
    # weight_decay=0.01,               # strength of weight decay
    learning_rate= 1e-4,
    adam_epsilon = 1e-8, 
    logging_dir='./logs',            # directory for storing logs
    # logging_steps=10,
    do_eval=True,
    evaluation_strategy = 'epoch'
    
    
)
# training_args.evaluation_strategy = EvaluationStrategy.EPOCH
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset

)

trainer.train()

***** Running training *****
  Num examples = 75688
  Num Epochs = 15
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 128
  Gradient Accumulation steps = 1
  Total optimization steps = 8880


Epoch,Training Loss,Validation Loss
1,2.6115,1.202373


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10809
  Batch size = 128


Epoch,Training Loss,Validation Loss
1,2.6115,1.202373
2,1.0577,0.873197
3,0.762,0.78514
4,0.5911,0.778568


Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10809
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-1500
Configuration saved in ./results/checkpoint-1500/config.json
Model weights saved in ./results/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10809
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-2000
Configuration saved in ./results/checkpoint-2000/config.json
Model weights saved in ./results/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 10809
  Batch size = 128
Saving model checkpoint to ./results/checkpoint-2500
Configuration saved in ./results/checkpoint-2500/config.json
Model weights saved in ./results/checkpoint-2500/pytorch_model.bin


KeyboardInterrupt: ignored

In [9]:
# reading Test Data
test_encodings = tokenizer(df_test.Question.to_list(), truncation=True, padding=True, max_length=200)
test_dataset = ClassificationDataset(test_encodings, df_test.label.to_list())

# Predicting on test set

In [10]:
pred = trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 21619
  Batch size = 128


Epoch,Training Loss,Validation Loss
1,2.6115,1.202373
2,1.0577,0.873197
3,0.762,0.78514
4,0.5911,0.778568


# Evaluation

In [11]:
pred.predictions.argmax(-1)[:10]
pred.label_ids

array([1318, 1320, 1449, ...,   58,  971, 1447])

In [12]:
from sklearn.metrics import f1_score, accuracy_score

print(accuracy_score(pred.label_ids, pred.predictions.argmax(-1)))

0.8190480595772237


In [None]:
trainer.save_model("./classifier")

In [None]:
cp -r classifier/ drive/MyDrive/

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
# reading Test Data
ourtest_df = pd.read_excel('/content/drive/MyDrive/data_freebase/sbs.xlsx')
newtest_texts = ourtest_df.Question.to_list()
# ourtest_df.head()

newtest_encodings = tokenizer(newtest_texts, truncation=True, padding=True, max_length=200)
newtest_dataset = ClassificationDataset(newtest_encodings, [1 for _ in range(len(newtest_texts))])
new_pred = trainer.predict(newtest_dataset)

***** Running Prediction *****
  Num examples = 5003
  Batch size = 128


Epoch,Training Loss,Validation Loss
1,2.6115,1.202373
2,1.0577,0.873197
3,0.762,0.78514
4,0.5911,0.778568


In [24]:
import numpy as np

def softmax(x):
    
    f_x = np.exp(x) / np.sum(np.exp(x))
    return f_x
import numpy as np

first, second, third, sort = [], [], [], []
for idx, item in enumerate(new_pred.predictions):
  temp = softmax(item)
  indices = temp.argsort()
  values = np.sort(temp) 
  first.append((le.inverse_transform([indices[-1]])[0], values[-1]))
  second.append((le.inverse_transform([indices[-2]])[0], values[-2]))
  third.append((le.inverse_transform([indices[-3]])[0], values[-3]))
  sort.append(values[-1])

  # print(first, second, third)
  # break
debug = pd.DataFrame({
                      'Question':newtest_texts,
                      'First':first,
                      'Second':second,
                      'Third':third,
                      'Sort':sort
                     })
debug.to_excel('ours.xlsx', index=False)

In [30]:
debug[(debug.First.apply(lambda item:item[-1]>0.8))&debug.Question.apply(lambda item:item.find('is')!=-1)]

Unnamed: 0,Question,First,Second,Third,Sort
27,What is Driftwood,"(fb:common.topic.notable_types, 0.98721325)","(fb:type.object.type, 0.005978638)","(fb:people.person.profession, 0.0018229936)",0.987213
28,who is Yumiko,"(fb:common.topic.notable_types, 0.9764201)","(fb:people.person.profession, 0.0066310694)","(fb:type.object.type, 0.0042967545)",0.976420
30,Where is Nak,"(fb:location.location.containedby, 0.9529905)","(fb:location.location.partially_containedby, 0...","(fb:location.hud_county_place.place, 0.004513087)",0.952990
35,Who left his hometown of Chicago,"(fb:location.location.people_born_here, 0.9823...",(fb:fictional_universe.fictional_setting.ficti...,"(fb:people.person.place_of_birth, 0.0014106713)",0.982312
36,what is the ninth album by Tanya Tucker,"(fb:music.album_release_type.albums, 0.9149029)","(fb:music.artist.album, 0.061613135)","(fb:music.album_content_type.albums, 0.01381076)",0.914903
...,...,...,...,...,...
4971,where is KUAC,"(fb:location.location.containedby, 0.87494266)","(fb:broadcast.broadcast.area_served, 0.018128965)","(fb:location.hud_county_place.county, 0.014292...",0.874943
4975,where is Machrihanish,"(fb:location.location.containedby, 0.98988736)","(fb:location.location.partially_containedby, 0...","(fb:location.hud_county_place.place, 0.0006165...",0.989887
4977,What is Brhl,"(fb:common.topic.notable_types, 0.9816176)","(fb:type.object.type, 0.011156884)","(fb:people.person.profession, 0.0013883706)",0.981618
4997,what town is in Italy,"(fb:location.location.contains, 0.98795944)","(fb:location.country.second_level_divisions, 0...","(fb:location.us_county.hud_county_place, 0.001...",0.987959


In [28]:
debug[debug.Question.apply(lambda item:item.find('is')!=-1)]

Unnamed: 0,Question,First,Second,Third,Sort
1,of what is A Shooting Star another brand,(fb:medicine.drug_dosage_flavor.drugs_with_thi...,"(fb:food.ingredient.dishes, 0.03495156)","(fb:dining.restaurant.cuisine, 0.029498173)",0.035570
11,over what has Jewish leadership evolved,"(fb:user.alexander.philosophy.philosopher.era,...","(fb:religion.religious_practice.practice_of, 0...",(fb:computer.programming_language.influenced_b...,0.029125
13,What typically include nationalised industries,"(fb:type.type.instance, 0.07747618)","(fb:people.profession.specialization_of, 0.037...","(fb:people.profession.specializations, 0.02648...",0.077476
15,Where is L2CAP used,"(fb:broadcast.broadcast.area_served, 0.15592018)","(fb:astronomy.orbital_relationship.orbits, 0.0...","(fb:cvg.game_version.platform, 0.027077217)",0.155920
27,What is Driftwood,"(fb:common.topic.notable_types, 0.98721325)","(fb:type.object.type, 0.005978638)","(fb:people.person.profession, 0.0018229936)",0.987213
...,...,...,...,...,...
4994,since when has Steyning existed,"(fb:music.artist.origin, 0.109646924)",(fb:fictional_universe.fictional_character.app...,"(fb:organization.organization.place_founded, 0...",0.109647
4995,for what is parking created,"(fb:common.topic.notable_types, 0.63249725)","(fb:type.object.type, 0.14544044)","(fb:visual_art.artwork.media, 0.023522813)",0.632497
4997,what town is in Italy,"(fb:location.location.contains, 0.98795944)","(fb:location.country.second_level_divisions, 0...","(fb:location.us_county.hud_county_place, 0.001...",0.987959
5000,What is Louise,"(fb:common.topic.notable_types, 0.9920396)","(fb:type.object.type, 0.0036034826)","(fb:people.person.profession, 0.00080775894)",0.992040


In [21]:
import numpy as np

def softmax(x):
    
    f_x = np.exp(x) / np.sum(np.exp(x))
    return f_x
import numpy as np

first, second, third, answer, sort = [], [], [], [], []
for idx, item in enumerate(pred.predictions):
  temp = softmax(item)
  indices = temp.argsort()
  values = np.sort(temp) 
  first.append((le.inverse_transform([indices[-1]])[0], values[-1]))
  second.append((le.inverse_transform([indices[-2]])[0], values[-2]))
  third.append((le.inverse_transform([indices[-3]])[0], values[-3]))
  sort.append(values[-1])

  # print(first, second, third)
  # break
debug = pd.DataFrame({
                      'Question':df_test.Question.to_list(),
                      'First':first,
                      'Second':second,
                      'Third':third,
                      'Answer':df_test.relation_type.to_list()
                      'Sort':sort,
                     })
debug.to_excel('freebase.xlsx', index=False)

In [19]:
pred

PredictionOutput(predictions=array([[-1.5081786 , -0.2964749 , -1.8974983 , ..., -1.1388532 ,
        -2.6386049 , -0.6149134 ],
       [-3.752223  , -2.1240869 , -3.1193984 , ..., -2.93882   ,
        -3.849748  , -1.6930649 ],
       [-2.0788586 , -0.43151364, -1.3509523 , ..., -1.4018582 ,
        -2.4768994 , -0.5686034 ],
       ...,
       [-3.5965514 , -2.6662111 , -2.1668787 , ..., -1.6797744 ,
        -1.2552487 , -2.988654  ],
       [-1.373984  , -0.43671474, -1.6213957 , ..., -1.917182  ,
        -1.2366729 , -0.72635925],
       [-2.613196  , -1.7741838 , -1.3350426 , ..., -2.0923038 ,
        -2.9737718 , -1.7797832 ]], dtype=float32), label_ids=array([1318, 1320, 1449, ...,   58,  971, 1447]), metrics={'test_loss': 0.8185576796531677, 'test_runtime': 72.6116, 'test_samples_per_second': 297.735, 'test_steps_per_second': 2.327})