In [1]:
from pathlib import Path

import pandas as pd
import pytorch_lightning as pl
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from src.data.components.Dataset import GenericDatasetTransformer
from src.data.text_processing import TextPreprocessor
from src.utils import defines

In [6]:
# Seed Python / NumPy / PyTorch *before* the model is built: the checkpoint has
# no classification head, so `from_pretrained` initialises the `classifier.*`
# weights randomly (see the "newly initialized" warning below). Without a seed
# here, every kernel restart yields a different head. 32 is the same value the
# later `pl.seed_everything(32)` cell uses.
pl.seed_everything(32)

# Single source of truth so the model and its tokenizer cannot drift apart.
MODEL_NAME = "distilroberta-base"

# Interim training split produced earlier in the pipeline.
train_df = pd.read_csv(Path(defines.INTERIM_DATA_DIR, "train.csv"))
text_processor = TextPreprocessor()

# Pretrained encoder + freshly initialised sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading pytorch_model.bin:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight'

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight'

In [7]:
# Step 1: run the project text preprocessor over the raw text column.
# NOTE(review): the join below implies each row becomes an iterable of string
# tokens — confirm against TextPreprocessor.transform_series.
train_df["text"] = text_processor.transform_series(train_df["text"])

# Step 2: glue each row's tokens back into one space-separated string so the
# Hugging Face tokenizer receives plain text.
train_df["text"] = train_df["text"].apply(" ".join)

In [8]:
def _encode_text(text):
    """Tokenize one string into a fixed-length PyTorch encoding.

    The text is truncated / padded to exactly 128 tokens (special tokens
    included). The result carries `input_ids` and `attention_mask`;
    token-type ids are not requested.
    """
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_attention_mask=True,
        return_token_type_ids=False,
        return_tensors="pt",
    )


# One encoding object per row, stored next to the cleaned text.
train_df["text_encoded"] = train_df["text"].apply(_encode_text)

In [None]:
# Print the module tree of the loaded sequence-classification model as plain text.
print(model)

In [None]:
# Bare expression: the notebook renders the model's repr as the cell output.
model

In [None]:
# Fix the global random seed (Lightning helper) so the random draws made by the
# cells below are repeatable. This call runs after the model was instantiated
# in an earlier cell, so by itself it does not influence how the model's
# classifier head was initialised.
pl.seed_everything(32)

In [None]:
# Inspect the classification head — the `classifier.*` weights the load-time
# warning reports as newly initialised.
model.classifier

In [None]:
# Sanity check: running the encoder and the classification head by hand must
# give the same logits as the model's own forward pass.
first_encoding = train_df["text_encoded"][0]
text = first_encoding["input_ids"]  # token ids of the first training sample
# The sample was padded to a fixed length at tokenization time, so the mask
# must be supplied; otherwise the encoder attends to the padding tokens and
# the logits being compared are not the ones the model would produce on real
# (masked) input.
attention_mask = first_encoding["attention_mask"]

with torch.no_grad():  # inspection only; no autograd graph needed
    # Manual path: encoder -> classification head.
    x = model.roberta(text, attention_mask=attention_mask)
    x = model.classifier(x.last_hidden_state)
    # Reference path: the full model in one call.
    test2 = model(text, attention_mask=attention_mask)
    # Element-wise comparison; every entry should be True.
    print(torch.eq(x, test2.logits))

In [None]:
# NOTE(review): `modules()` is consumed through `list(...)` in a later cell,
# which suggests it returns a lazy iterator; displayed bare like this the cell
# presumably shows only the iterator object, not the submodules — confirm.
model.modules()

In [None]:
# Forward pass on the first encoded training sample.
# The encoding was padded to a fixed length, so its attention mask is passed
# along with the token ids; without it the model treats the padding tokens as
# real input and the resulting logits are distorted.
first_encoding = train_df["text_encoded"][0]
test = model(
    input_ids=first_encoding["input_ids"],
    attention_mask=first_encoding["attention_mask"],
)

In [None]:
# Show the raw classification scores (logits) from the forward pass stored in `test`.
test.logits

In [None]:
# Wrap the encoder in a sequential container. `torch.nn` has no `Sequence`
# attribute (the original spelling raised AttributeError before the next line
# could run); the container class is `torch.nn.Sequential`. The wrapper is not
# assigned, so only the last expression below is displayed.
torch.nn.Sequential(model.base_model)
# Flat list of every submodule of the encoder, the encoder itself included.
list(model.base_model.modules())

In [None]:
# Show the token-id tensor of the first encoded training sample.
train_df["text_encoded"][0]["input_ids"]

In [None]:
# Random stand-in logits drawn uniformly from [0, 1): 16 rows each, with
# 2, 4 and 11 columns respectively.
# NOTE(review): presumably one tensor per target (a / b / c) for a batch of 16 — confirm.
# The generator is consumed left to right, so the draw order is a, b, c.
logits_a, logits_b, logits_c = (torch.rand((16, width)) for width in (2, 4, 11))

In [None]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Inspect the `target_a` label column before computing class weights from it.
train_df["target_a"]

In [None]:
# "Balanced" class weights for each of the three targets, computed over every
# row of the training frame (no label value is excluded here). Each result is a
# float32 tensor with one weight per distinct label value, ordered as
# np.unique sorts them.
class_weights_a, class_weights_b, class_weights_c = (
    torch.tensor(
        compute_class_weight(
            class_weight="balanced",
            classes=np.unique(train_df[column]),
            y=train_df[column],
        ),
        dtype=torch.float,
    )
    for column in ("target_a", "target_b", "target_c")
)

In [None]:
# One weight tensor per line, in a / b / c order.
print(class_weights_a, class_weights_b, class_weights_c, sep="\n")

In [None]:
# Recompute the balanced class weights, this time dropping rows whose label is
# -1 for targets b and c before weighting (target a uses every row).
# NOTE(review): -1 presumably marks "no label for this target" — confirm.
tmp_b = train_df.loc[train_df["target_b"] != -1]
tmp_c = train_df.loc[train_df["target_c"] != -1]

# Each result is a float32 tensor with one weight per remaining label value.
class_weights_a, class_weights_b, class_weights_c = (
    torch.tensor(
        compute_class_weight(
            class_weight="balanced",
            classes=np.unique(labels),
            y=labels,
        ),
        dtype=torch.float,
    )
    for labels in (train_df["target_a"], tmp_b["target_b"], tmp_c["target_c"])
)

In [None]:
# One weight tensor per line, in a / b / c order.
print(class_weights_a, class_weights_b, class_weights_c, sep="\n")