In [1]:
!pip install -q transformers
!pip install -q datasets
!pip install -q evaluate
import pandas as pd
import numpy as np
import torch
import evaluate
from transformers import AutoModelForSequenceClassification
from train import get_dataloaders, train_model, calculate_f1

import logging
# Raise the 'transformers' logger threshold so that only records at ERROR
# level or above are emitted by that library.
transformers_logger = logging.getLogger('transformers')
transformers_logger.setLevel(logging.ERROR)

[0m

In [2]:
# Folder that holds the CSV inputs. Keep the trailing slash: the file paths in
# this notebook are built by plain string concatenation (data_dir + '<name>.csv').
data_dir = './datasets/'

In [3]:
# Training frames: the original rows plus the augmented rows from the
# 'parrot' file. The 'pegasus' file is the alternative augmentation source:
#augmented_train_data = pd.read_csv(data_dir + 'augmented_data_label_1_pegasus.csv')
original_train_data, augmented_train_data = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ('train_data.csv', 'augmented_data_label_1_parrot.csv')
)
# Stack the two frames row-wise (original index labels are kept as-is).
train_data = pd.concat([original_train_data, augmented_train_data], axis=0)

# Frames used for scoring (test_data) and for the final predictions (final_pred_data).
test_data, final_pred_data = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ('test_data.csv', 'final_pred.csv')
)

In [4]:
# Build the balanced "bags" used for bagging: every bag keeps all label-1 rows
# and adds an equally sized random draw (without replacement) of label-0 rows.
# The draw is seeded with the bag index, so each bag is repeatable but distinct.
num_sub_datasets = 5

ones = train_data.loc[train_data['label'].eq(1)]
zeros = train_data.loc[train_data['label'].eq(0)]
num = len(ones)

train_data_list = [
    pd.concat([ones, zeros.sample(n=num, random_state=seed)], axis=0)
    for seed in range(num_sub_datasets)
]

In [5]:
# Sanity check: show the test rows whose 'text' value is missing.
# An empty frame means there are none.
test_data.loc[test_data['text'].isna()]

Unnamed: 0,text,label


In [5]:
# Hyperparameters (everything a reader might want to tune lives here)
NUM_EPOCHS = 1
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
USE_LR_SCHEDULER = False
PRETRAINED_MODEL_NAME = "roberta-base"

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Bundle the settings into the dict that is handed to the helpers from train.py.
args = dict(
    NUM_EPOCHS=NUM_EPOCHS,
    BATCH_SIZE=BATCH_SIZE,
    LEARNING_RATE=LEARNING_RATE,
    USE_LR_SCHEDULER=USE_LR_SCHEDULER,
    PRETRAINED_MODEL_NAME=PRETRAINED_MODEL_NAME,
)

cuda


In [6]:
# Ensemble prediction: every saved model votes, the most frequent label wins
def predict(model_names, test_dataloader):
    """Return majority-vote labels from the models named in ``model_names``.

    Each name is loaded with ``AutoModelForSequenceClassification.from_pretrained``
    (``num_labels=2``), moved to the notebook-level ``device`` and put in eval
    mode. The dataloader is iterated once per model; each batch must be a dict
    of tensors, and every entry is passed to the model as a keyword argument.

    Args:
        model_names: iterable of names/paths accepted by ``from_pretrained``.
        test_dataloader: re-iterable loader yielding dict batches of tensors.

    Returns:
        1-D numpy array with one voted label per sample. On a tie the smallest
        label wins, because ``argmax`` returns the first maximum of the counts.
    """

    def _labels_from(model):
        # Predicted label for every sample, in dataloader order.
        labels = []
        for batch in test_dataloader:
            inputs = {key: tensor.to(device) for key, tensor in batch.items()}
            with torch.no_grad():
                logits = model(**inputs).logits
            # argmax over the last axis picks the highest-scoring class
            labels.extend(logits.argmax(-1).cpu().numpy())
        return labels

    per_model_labels = []
    for name in model_names:
        model = AutoModelForSequenceClassification.from_pretrained(name, num_labels=2)
        model.to(device)
        model.eval()
        per_model_labels.append(_labels_from(model))

    # Rows = samples, columns = models; count the labels in each row and keep
    # the most frequent one.
    votes = np.array(per_model_labels).T
    return np.apply_along_axis(lambda row: np.bincount(row).argmax(), axis=1, arr=votes)


def calculate_bagging_f1(model_names, val_dataloader):
    """Return the F1 score of the majority-vote ensemble on ``val_dataloader``.

    Args:
        model_names: names/paths of the saved models, forwarded to ``predict``.
        val_dataloader: re-iterable loader whose batches are dicts holding a
            CPU tensor under the ``'labels'`` key.

    Returns:
        The ``"f1"`` entry of the combined accuracy/f1/precision/recall result.

    Note:
        The loader is iterated once per model inside ``predict`` and once more
        here for the labels, so it must yield the samples in the same order on
        every pass (i.e. it must not shuffle); otherwise predictions and
        references would be misaligned.
    """
    # Get the ensemble predictions
    y_pred = predict(model_names, val_dataloader)
    # Gather the labels with a single concatenation. Growing an array batch by
    # batch from np.array([]) re-copies it every step and promotes the integer
    # labels to float64; here they stay integers.
    y_true = np.concatenate(
        [batch['labels'].numpy() for batch in val_dataloader]
    ).astype(np.int64)
    # Calculate the f1 score
    metric = evaluate.combine(["accuracy", "f1", "precision", "recall"])
    result_dict = metric.compute(predictions=y_pred, references=y_true)
    return result_dict["f1"]

# Helper for reloading the saved ensemble members from disk
def load_models(num_models: int, device: torch.device):
    """Load ``bagging_model_1`` .. ``bagging_model_<num_models>`` onto ``device``.

    Args:
        num_models: how many saved models to load; names are 1-based.
        device: torch device each loaded model is moved to.

    Returns:
        List of the loaded models in index order (empty when ``num_models``
        is 0 or negative). The models are not switched to eval mode here.
    """

    def _load(model_idx):
        # One sequence-classification model with two output labels.
        model = AutoModelForSequenceClassification.from_pretrained(
            f"bagging_model_{model_idx}", num_labels=2
        )
        model.to(device)
        return model

    return [_load(model_idx) for model_idx in range(1, num_models + 1)]

In [7]:
# Fit one model per balanced bag; model k (1-based) is trained on
# train_data_list[k - 1] and handed the name "bagging_model_<k>".
print(f'Training {num_sub_datasets} models...')
for model_idx in range(1, num_sub_datasets + 1):
    print(f'Training model {model_idx}...')
    # Only the first returned loader is used; None is passed as the third
    # argument (presumably the evaluation frame -- confirm in train.get_dataloaders).
    train_dataloader, _ = get_dataloaders(args, train_data_list[model_idx - 1], None)
    train_model(args, device, train_dataloader, model_name=f"bagging_model_{model_idx}")

Training 5 models...
Training model 1...


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]



  0%|          | 0/5 [00:00<?, ?ba/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Epoch: 0


train loss: 0.2626906931400299: 100%|██████████| 263/263 [07:02<00:00,  1.61s/it] 


Training model 2...


  0%|          | 0/5 [00:00<?, ?ba/s]

Epoch: 0


train loss: 0.06922002881765366: 100%|██████████| 263/263 [06:41<00:00,  1.53s/it]


Training model 3...


  0%|          | 0/5 [00:00<?, ?ba/s]

Epoch: 0


train loss: 0.12306714057922363: 100%|██████████| 263/263 [07:00<00:00,  1.60s/it]


Training model 4...


  0%|          | 0/5 [00:00<?, ?ba/s]

Epoch: 0


train loss: 0.11373656988143921: 100%|██████████| 263/263 [07:17<00:00,  1.66s/it] 


Training model 5...


  0%|          | 0/5 [00:00<?, ?ba/s]

Epoch: 0


train loss: 0.2682770788669586: 100%|██████████| 263/263 [07:02<00:00,  1.61s/it] 


### Getting F1 score for Test Data csv

In [9]:
# Loader over test_data; the first value returned by get_dataloaders is unused here.
_, test_dataloader = get_dataloaders(args, None, test_data)

# Names of the saved ensemble members: bagging_model_1 .. bagging_model_<N>
model_names = [f"bagging_model_{model_idx}" for model_idx in range(1, num_sub_datasets + 1)]

# Score of each model on its own, in model order
print("Separate f1 score")
list_of_metrics = [calculate_f1(name, device, test_dataloader) for name in model_names]
print(list_of_metrics)

# Score of the majority-vote ensemble
print("Bagging f1 score")
metric = calculate_bagging_f1(model_names, test_dataloader)
print(metric)

  0%|          | 0/3 [00:00<?, ?ba/s]

Separate f1 score
[0.5301837270341208, 0.5517241379310345, 0.47477744807121663, 0.46991404011461324, 0.4839650145772595]
Bagging f1 score
0.5601503759398496


## Predicting results to generate dev.txt and test.txt

In [10]:
# generate dev.txt from test_data.csv
_, test_dataloader = get_dataloaders(args, None, test_data)

model_names = ["bagging_model_{}".format(i+1) for i in range(num_sub_datasets)]
# Majority-vote labels from the saved ensemble, one per row of test_data
y_pred = predict(model_names, test_dataloader)
# write into dev.txt, with 1 label per line
# (the progress message names the file that is actually written below)
print("Start writing dev.txt")
with open('dev.txt', 'w') as f:
    for label in y_pred:
        f.write(str(label))
        f.write('\n')

  0%|          | 0/3 [00:00<?, ?ba/s]

Start writing dev-generated.txt


In [11]:
# Produce test.txt: ensemble predictions for final_pred.csv
_, final_pred_dataloader = get_dataloaders(args, None, final_pred_data)

model_names = [f"bagging_model_{model_idx}" for model_idx in range(1, num_sub_datasets + 1)]
y_pred = predict(model_names, final_pred_dataloader)
# Write the voted labels to test.txt, one label per line
print("Start writing test.txt")
with open('test.txt', 'w') as out_file:
    out_file.write("".join(str(label) + '\n' for label in y_pred))

  0%|          | 0/4 [00:00<?, ?ba/s]

Start writing test.txt
