In [347]:
import pandas as pd
import glob
import os
import numpy as np
seed = 15

Loading of data

In [None]:
path = '.' # current file
all_files = glob.glob("*.csv") # gets csv files in current folder

li = []
for filename in all_files: # we load all csv files
    print("Loading: "+str(filename))
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

data = pd.concat(li, axis=0, ignore_index=True) # we combine all dataframes into a single dataframe.

# we replace all empty fields with empty strings
data.fillna('')
data.perex = data.perex.fillna('')

# we combine both text collumns into one and forget the originals
data["text"] = data["title"] + " " + data["perex"]
data = data.drop('title', axis=1)
data = data.drop('perex', axis=1)

# we find all the classes
classes_ = data['label'].unique()
print(classes_)

# we create a dictionary of classes and their indexes
classes = {}
i = 0
for c in classes_:
    classes[c] = i
    i+=1

# we convert classes into numerical indexes and add them to our data frame
int_labels = []
for x in data['label']:
    int_labels.append(classes[x])

data['label'] = int_labels

# we find the weighs of all classes based on their inverse occurences
class_weights = []
for x in range(0,6):
    class_weights.append(float(len(int_labels))/float(int_labels.count(x)))
# normalization
for i in range(0,6):
    class_weights[i] = class_weights[i] / max(class_weights)
print("Weights: " + str(class_weights))

data.describe()


Loading: injuries.csv
Loading: interview.csv
Loading: prematch.csv
Loading: reaction.csv
Loading: report.csv
Loading: transfers.csv
['Injuries' 'Interview' 'Pre-Match' 'Reaction' 'Report' 'Transfers']
Weights: [1.0, 1.0, 1.0, 0.1, 1.0, 1.0]


Unnamed: 0,label
count,1500.0
mean,2.8
std,1.107919
min,0.0
25%,3.0
50%,3.0
75%,3.0
max,5.0


In [None]:
import torch
from transformers import BertTokenizer, BertModel, DataCollatorWithPadding, AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
from torchsummary import summary
from datasets import load_dataset,Dataset,DatasetDict,concatenate_datasets
from datasets import *

Creating the dataset

In [None]:
# we create the dataset and split it into train test and valid groups
dataset = Dataset.from_pandas(data)
train_test_valid = dataset.train_test_split(test_size=0.2, seed=seed)
test_valid = train_test_valid['test'].train_test_split(test_size=0.5, seed=seed)

dataset = DatasetDict({
    'train':train_test_valid['train'], # first 80% for training
    'test':test_valid['train'], # 10% for testing
    'valid':test_valid['test'] # 10% for validation
})

dataset


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 150
    })
    valid: Dataset({
        features: ['label', 'text'],
        num_rows: 150
    })
})

We load bert model and tokenizer

In [None]:
# We load the models tokenizer
model_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(model_name)
#model = BertModel.from_pretrained(model_name)



We tokenize the dataset

In [None]:
def tokenize(batch): # we tokenize the text field
    return tokenizer(batch["text"], truncation = True)

tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset.set_format('torch', columns=['input_ids','token_type_ids','attention_mask','label'])

# collator does some more preprocesing
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_dataset

Map: 100%|██████████| 1200/1200 [00:00<00:00, 1637.50 examples/s]
Map: 100%|██████████| 150/150 [00:00<00:00, 1792.84 examples/s]
Map: 100%|██████████| 150/150 [00:00<00:00, 1782.98 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 150
    })
    valid: Dataset({
        features: ['label', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 150
    })
})

Creation of my model architecture

In [None]:
import torch.nn as nn


class SportClasifier(nn.Module):
    def __init__(self, model_name, num_labels, class_weights=None):
        super(SportClasifier, self).__init__()
        self.finetuning = False # do we want to train the whole model or just the head
        hidden_size = 256
        self.class_weights = torch.tensor(class_weights)
        self.num_labels = num_labels

        # we load the pretrained model based on its name
        self.model = BertModel.from_pretrained(model_name, 
                                                    config = AutoConfig.from_pretrained(model_name, output_attention= True, output_hidden_state = True))

        # we create clasification head
        self.head = nn.Sequential(nn.Dropout(0.1), 
                                  nn.LazyLinear(out_features = hidden_size),
                                  nn.ReLU(),
                                  nn.Dropout(0.1),
                                  nn.LazyLinear(out_features = num_labels))
        

    def forward(self, input_ids = None, token_type_ids=None ,attention_mask = None, labels = None):
        outputs = self.model(input_ids = input_ids, attention_mask=attention_mask)

        last_hidden_state = None

        if self.finetuning:
            last_hidden_state = outputs[0]
        else:
            last_hidden_state = outputs[0].detach() # this stops the gradient from flowing backwards to the pretrained model
        
        logits = self.head(last_hidden_state[:,0,:].view(-1,768))

        loss = None
        if labels is not None:
            loss_func = nn.CrossEntropyLoss(label_smoothing=0.05, weight = self.class_weights)
            loss = loss_func(logits, labels) 
            return TokenClassifierOutput(loss=loss, logits=logits, hidden_states = outputs.hidden_states)

Data loaders and training parameters

In [354]:
from torch.utils.data import DataLoader
PATH = 'sports_model'

batch_size = 8
epochs = 5
epochs_fine = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sport_model = SportClasifier(model_name=model_name, num_labels = 6, class_weights=class_weights).to(device)
train_dataLoader = DataLoader(tokenized_dataset['train'], shuffle = True, batch_size=batch_size, collate_fn=data_collator)
test_dataLoader = DataLoader(tokenized_dataset['test'], shuffle = True, collate_fn=data_collator)
valid_dataLoader = DataLoader(tokenized_dataset['valid'], shuffle = True, collate_fn=data_collator)

Training the head

In [None]:
from transformers import get_scheduler
from torch.optim import AdamW
import evaluate
from tqdm.auto import tqdm

# progress bars
train_steps = epochs * len(train_dataLoader)
progress_bar_train = tqdm(range(train_steps),position=0, leave=True)
progress_bar_test = tqdm(range((len(test_dataLoader))), position=0, leave=True)


# optimizer and scheduler
optimizer = AdamW(sport_model.head.parameters(), lr = 5e-5)

lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps=0,
    num_training_steps=train_steps)
metric = evaluate.load("f1")

for epoch in range(epochs):
    sport_model.train()
    for batch in train_dataLoader:
        batch = {k : v.to(device) for k,v  in batch.items()}
        outputs = sport_model(**batch)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)


    sport_model.eval()
    for batch in test_dataLoader:
        batch = {k : v.to(device) for k,v  in batch.items()}
        with torch.no_grad():
            outputs = sport_model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions = predictions, references = batch['labels'])
        progress_bar_test.update(1)
    
    print(metric.compute(average='weighted'))

# we save the model
torch.save(sport_model.state_dict(), PATH)

100%|██████████| 450/450 [54:55<00:00,  7.32s/it]
450it [54:55,  7.32s/it]00:00<?, ?it/s]
100%|██████████| 150/150 [05:54<00:00,  6.82it/s]  

{'f1': 0.6215108556832695}


300it [11:48,  6.46it/s]                           

{'f1': 0.6103738974497989}


 54%|█████▍    | 404/750 [15:40<14:37,  2.54s/it]  

We load the model, if we don't want to train it

In [None]:
# we load the saved model

model = SportClasifier(model_name=model_name, num_labels = 6, class_weights=class_weights).to(device)

state_dict = None
pre_finetuned = True
state_dict = torch.load(PATH, weights_only=True)
model.load_state_dict(state_dict)

<All keys matched successfully>

We evaluate the model

In [None]:
sport_model.eval()
for batch in valid_dataLoader:
        batch = {k : v.to(device) for k,v  in batch.items()}
        with torch.no_grad():
            outputs = sport_model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions = predictions, references = batch['labels'])

print(metric.compute(average='weighted'))

{'f1': 0.6390526315789474}


Finetuning

In [None]:
# same thing as in previous traning block, but we train the whole model
PATH_FINE = 'sports_model_finetuned'

sport_model.finetuning = True
train_steps = epochs_fine * len(train_dataLoader)
progress_bar_train = tqdm(range(train_steps),position=0, leave=True)
progress_bar_test = tqdm(range((len(test_dataLoader))), position=0, leave=True)

optimizer = AdamW(sport_model.parameters(), lr = 5e-5)


lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps=0,
    num_training_steps=train_steps)

metric = evaluate.load("f1")

from tqdm.auto import tqdm

for epoch in range(epochs_fine):
    sport_model.train()
    for batch in train_dataLoader:
        batch = {k : v.to(device) for k,v  in batch.items()}
        outputs = sport_model(**batch)
        loss = outputs.loss
        
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)


    sport_model.eval()
    for batch in test_dataLoader:
        batch = {k : v.to(device) for k,v  in batch.items()}
        with torch.no_grad():
            outputs = sport_model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions = predictions, references = batch['labels'])
        progress_bar_test.update(1)
    
    print(metric.compute(average='weighted'))

# we save the final model
torch.save(sport_model.state_dict(), PATH_FINE)
sport_model.finetuning = False

100%|██████████| 450/450 [19:18<00:00,  2.58s/it]
450it [19:18,  2.58s/it]00:00<?, ?it/s]
100%|██████████| 150/150 [15:43<00:00,  5.37it/s]  

{'f1': 0.435089748549323}


300it [31:30,  8.28it/s]                           

{'f1': 0.5648276934369649}


450it [47:05,  7.08it/s] [46:46<00:00,  4.84s/it]

{'f1': 0.7260003987240831}


Post training evaluation

Loading the model for final evaluation

In [None]:
model = SportClasifier(model_name=model_name, num_labels = 6, class_weights=class_weights).to(device)

state_dict = None
pre_finetuned = True
state_dict = torch.load(PATH_FINE, weights_only=True)

model.load_state_dict(state_dict)


<All keys matched successfully>

We evaluate the model

In [None]:
sport_model.eval()
for batch in valid_dataLoader:
        batch = {k : v.to(device) for k,v  in batch.items()}
        with torch.no_grad():
            outputs = sport_model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions = predictions, references = batch['labels'])

print(metric.compute(average='weighted'))

# epochs 1-1 = 0.78

{'f1': 0.8456458675424193}
