In [None]:
from pathlib import Path

import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from src.data.components.Dataset import GenericDatasetTransformer
from src.data.text_processing import TextPreprocessor
from src.utils import defines

In [None]:
# Single checkpoint name shared by the model and the tokenizer so they cannot drift apart.
CHECKPOINT = "distilroberta-base"

# NOTE(review): the frame is called `train_df` but is read from "val.csv" —
# confirm whether the validation split is intended here.
train_df = pd.read_csv(Path(defines.INTERIM_DATA_DIR) / "val.csv")
text_processor = TextPreprocessor()
# NOTE(review): presumably the sequence-classification head is newly initialised
# when loading a base checkpoint, so logits are not meaningful before fine-tuning — confirm.
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

In [None]:
# Run the project text preprocessor over the "text" column, overwriting it in place.
# Caution: because the column is both input and output, re-running this cell feeds
# already-processed text back through the preprocessor.
train_df["text"] = text_processor.transform_series(train_df["text"])
# Each processed entry is an iterable of strings (presumably tokens — confirm);
# glue the pieces back into one space-separated string per row.
train_df["text"] = train_df["text"].apply(" ".join)

In [None]:
# Fixed sequence length: longer texts are truncated, shorter ones padded up to it.
MAX_SEQ_LEN = 128


def _encode_text(text):
    """Tokenize a single string into fixed-length PyTorch tensors.

    Args:
        text: The string to tokenize.

    Returns:
        The tokenizer's encoding holding `input_ids` and `attention_mask`
        (token type ids are explicitly not requested). With
        `return_tensors="pt"` each tensor presumably has shape
        (1, MAX_SEQ_LEN) — TODO confirm.
    """
    # Call the tokenizer directly: `encode_plus` is documented as deprecated
    # in favour of `__call__`, which accepts the same arguments.
    return tokenizer(
        text,
        add_special_tokens=True,
        max_length=MAX_SEQ_LEN,
        return_token_type_ids=False,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )


# One encoding object per row, stored alongside the text it was built from.
train_df["text_encoded"] = train_df["text"].apply(_encode_text)

In [None]:
# Display the `roberta` submodule of the classifier (the part that the
# no-grad forward-pass cell below calls directly).
model.roberta

In [None]:
# Forward the first encoded example through the `roberta` submodule only.
# `.iloc[0]` selects the first row by position, independent of index labels.
sample_encoding = train_df["text_encoded"].iloc[0]
with torch.no_grad():  # inspection only: no autograd graph needed
    # The inputs were padded to a fixed length, so the attention mask must be
    # supplied; otherwise the padding tokens are attended to like real tokens.
    test = model.roberta(
        input_ids=sample_encoding["input_ids"],
        attention_mask=sample_encoding["attention_mask"],
    )

In [None]:
# Dimensions of the final hidden-state tensor returned by the `model.roberta`
# forward pass stored in `test`.
test.last_hidden_state.shape

In [None]:
# NOTE(review): `modules()` returns an iterator, so this cell displays the
# iterator object itself rather than the modules; wrap it in `list(...)`
# (as done for `model.base_model` further down) to see the contents.
model.modules()

In [None]:
# Forward the first encoded example through the full classification model.
# Note: this rebinds `test`, replacing the earlier `model.roberta` output.
sample_encoding = train_df["text_encoded"].iloc[0]
with torch.no_grad():  # consistent with the earlier forward pass; no autograd graph needed
    # Pass the attention mask so the padding positions are ignored.
    test = model(
        input_ids=sample_encoding["input_ids"],
        attention_mask=sample_encoding["attention_mask"],
    )

In [None]:
# Logits produced by the full-model forward pass stored in `test`.
test.logits

In [None]:
# `torch.nn.Sequence` does not exist and raises AttributeError; the container
# class is `torch.nn.Sequential`. Bind the wrapper to a name so it is not
# discarded immediately after construction.
base_model_seq = torch.nn.Sequential(model.base_model)
# Materialise the module iterator so the cell displays every submodule.
list(model.base_model.modules())

In [None]:
# Token-id tensor of the first encoded row — the same input used by the
# forward passes above.
train_df["text_encoded"][0]["input_ids"]
