In [1]:
from pathlib import Path
import os

from transformers import pipeline
from transformers import RobertaTokenizer, RobertaModel, AutoModel, AutoTokenizer
import pandas as pd
import torch

# from data import Dataloader_finetuning
from data.dataloader_finetuning import VaalikoneDataset, VaalikoneDataModule
from models.model import VaalikoneClassifier
from config.finetune_config import config

# Build the cleaned fine-tuning CSV once; the work is skipped when the file is already on disk.
data_path = "data/finetune.csv"
if not Path(data_path).exists():
    print("Cleaning data")
    raw_df = pd.read_csv("data/vaalit_2019.csv")

    # Keep only rows that have a value (non-NaN) in every column whose name starts with "Lappi."
    lappi_column_names = [col for col in raw_df.columns if col.startswith('Lappi.')]
    df = raw_df.dropna(subset=lappi_column_names)

    # NOTE: written with pandas' default index=True, so the row index is stored
    # as an extra unnamed leading column in the CSV.
    df.to_csv(data_path)

# Split the cleaned data into train / validation / test CSVs unless all three already exist.
train_path = "data/finetune_train.csv"
val_path = "data/finetune_val.csv"
test_path = "data/finetune_test.csv"

split_files_present = all(os.path.exists(p) for p in (train_path, val_path, test_path))
if not split_files_present:
    print("Creating train, val and test sets")
    df = pd.read_csv(data_path)

    # Sequential 80 % / 10 % / 10 % split in file order (rows are not shuffled).
    n_rows = len(df)
    train_end = int(n_rows * 0.8)
    val_end = int(n_rows * 0.9)
    train_df = df.iloc[:train_end]
    val_df = df.iloc[train_end:val_end]
    test_df = df.iloc[val_end:]

    # Persist each split without the DataFrame index.
    for split_df, split_path in ((train_df, train_path), (val_df, val_path), (test_df, test_path)):
        split_df.to_csv(split_path, index=False)

### Test dataset module
# Build the dataset from the training CSV path stored in the fine-tuning config.
train_csv = config["train_path"]
ds = VaalikoneDataset(path=train_csv, config=config)

# Instantiate the classifier from the same config.
classifier = VaalikoneClassifier(config=config)

  from .autonotebook import tqdm as notebook_tqdm


Loaded data from data/finetune_train.csv with shape, (88, 10), and columns: Index(['L_1', 'L_2', 'L_3', 'L_4', 'L_5', 'L_6', 'L_7', 'L_8', 'L_9', 'L_10'], dtype='object')


Some weights of the model checkpoint at TurkuNLP/bert-large-finnish-cased-v1 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# Sanity-check a single example: run one forward pass through the classifier and
# print the softmax probabilities next to the raw row and its labels.
idx = 1
item = ds[idx]  # idiomatic indexing; dispatches to ds.__getitem__(idx)
input_ids = item["input_ids"]
attention_mask = item["attention_mask"]
labels = item["labels"]

# Inspection only: no backward pass follows, so disable autograd to avoid
# building (and keeping alive) a computation graph for the forward pass.
with torch.no_grad():
    loss, logits = classifier(input_ids, attention_mask)

# Softmax over dim=1 normalises each row of logits so that it sums to 1.
probs = torch.softmax(logits, dim=1)
print(probs)
print(ds.data.iloc[idx])
print(labels)
# Element-wise absolute difference between the probabilities and the labels.
print(torch.abs(probs-labels))

tensor([[0.1445, 0.2235, 0.2975, 0.1931, 0.1413],
        [0.4322, 0.1733, 0.1979, 0.0889, 0.1076],
        [0.2813, 0.1958, 0.1022, 0.1749, 0.2458],
        [0.0944, 0.0748, 0.6473, 0.0767, 0.1068],
        [0.1894, 0.2874, 0.2257, 0.1521, 0.1455]], grad_fn=<SoftmaxBackward0>)
L_1                                                     5
L_2                                                     5
L_3                                                     1
L_4                                                     4
L_5                                                     1
L_6     Kaivospaikkakunnat pitää saada hyötymään enemm...
L_7     Työtehtävät voivat olla ympäri Lappia, tietoli...
L_8     Lappi on laaja maakunta. Lappia ei voi asettaa...
L_9                                       Ikuisuuskysymys
L_10    Meri-Lapin satamien kautta tavara ulkomaille. ...
Name: 1, dtype: object
tensor([[0, 0, 0, 0, 1],
        [0, 0, 0, 0, 1],
        [1, 0, 0, 0, 0],
        [0, 0, 0, 1, 0],
        [1, 0, 0, 

In [3]:
# Train loop
import pytorch_lightning as pl

# Data module wrapping the train / val / test CSVs named in the config.
datamodule = VaalikoneDataModule(
    train_path=config["train_path"],
    val_path=config["val_path"],
    test_path=config["test_path"],
    config=config)
datamodule.setup()

# Fresh classifier so training does not start from the instance used for
# the earlier single-example check.
classifier = VaalikoneClassifier(config=config)

# `gpus=1` is deprecated in PyTorch Lightning (removed in 2.0); the supported
# spelling for "train on one GPU" is accelerator="gpu" with devices=1.
trainer = pl.Trainer(
    max_epochs=config["epochs"],
    num_sanity_val_steps=50,  # validation batches run as a sanity check before training
    accelerator="gpu",
    devices=1,
)

trainer.fit(classifier, datamodule)

Loaded data from data/finetune_train.csv with shape, (88, 10), and columns: Index(['L_1', 'L_2', 'L_3', 'L_4', 'L_5', 'L_6', 'L_7', 'L_8', 'L_9', 'L_10'], dtype='object')
Loaded data from data/finetune_val.csv with shape, (11, 10), and columns: Index(['L_1', 'L_2', 'L_3', 'L_4', 'L_5', 'L_6', 'L_7', 'L_8', 'L_9', 'L_10'], dtype='object')


Some weights of the model checkpoint at TurkuNLP/bert-large-finnish-cased-v1 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores