In [1]:
import torch
import polars as pl
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

### Loading data and basic checks

In [2]:
# read the re-assembled train/validation splits, keeping only the modelling columns
train = pl.read_csv("./data/improved_train.csv").select(["ID", "TEXT", "LABEL"])
val = pl.read_csv("./data/improved_val.csv").select(["ID", "TEXT", "LABEL"])

# append the originally provided training set to the newly assembled one,
# then shuffle all rows with a fixed seed so the order is reproducible
train_provided = pl.read_csv("./data/train.csv")
train = pl.concat([train, train_provided]).sample(
    fraction=1,
    shuffle=True,
    seed=894552352,
)

train.head()

ID,TEXT,LABEL
i64,str,i64
614858,"""Absolutely excellent. The Gait…",0
874754,"""For these reasons Mr Blifil wa…",1
895574,"""A major factor in the Spanish …",0
746048,"""I joyed also that the old Scri…",0
205444,"""The little tailor went forth, …",1


In [3]:
# count the training rows per class: label 0 first, then label 1
label_counts = [len(train.filter(pl.col("LABEL") == label)) for label in (0, 1)]
print(*label_counts)

36798 35532


~~The data is imbalanced.~~

With the newly extracted text added, the labels are nearly balanced: 36,798 examples with label 0 versus 35,532 with label 1.

In [4]:
# loading RoBERTa
model_name = "FacebookAI/roberta-base"

# pointing to a custom directory to cache the downloaded model files
# (initially tried this on xdisk, hit memory issues, so using a personal machine)
custom_cache_dir = "../.cache_xdisk/"

# loading model and tokenizer
# NOTE: trust_remote_code is intentionally NOT enabled. The checkpoint resolves to
# the RoBERTa classes that ship with transformers, so no repository-provided
# Python code has to be executed, and leaving the flag off keeps it that way.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    cache_dir=custom_cache_dir,
    num_labels=2,  # binary classification head
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=custom_cache_dir,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# swap the [SNIPPET] marker for the tokenizer's separator token; the same
# expression is applied to both the training and the validation frame
snippet_to_sep = pl.col("TEXT").str.replace(r"\[SNIPPET\]", tokenizer.sep_token)

train = train.with_columns(snippet_to_sep)
val = val.with_columns(snippet_to_sep)

# show the first text of each split to confirm the replacement
print(train[0]["TEXT"].to_list())
print(val[0]["TEXT"].to_list())

['Absolutely excellent. The Gaither sisters head down south to spend some time with their relatives in Alabama (Big Ma and her mother Ma Charles). While there the girls learn a lot about their ancestry and the feud going on between Ma Charles and her half-sister Miss Trotter. The elderly sisters are storytellers, which really appeals to Vonetta who ends up carrying bickering messages between the two front porches for nearly the entirety of her time down south. When an act of nature sets the whole clan to worrying, family ties from all across the nation end up at Big Ma\'s. Though there are bound to be questions as to whether this book can truly stand on its own considering the 2 previous books featuring this unforgettable trio of sisters, this book takes a sharp right turn by focusing on the family history. The author gives readers adequate information about characters appearing in previous novels and previous altercations (i. </s>  Weed, the great time-waster, provided a fortuitous ex

In [6]:
# tokenizing the dataset
# step 1: turn the polars frames into Hugging Face datasets
train_hf, val_hf = (Dataset.from_polars(frame) for frame in (train, val))

In [7]:
def tokenize_function(df):
    """Tokenize a batch of examples and attach their labels.

    The "TEXT" entries are passed to the notebook-level ``tokenizer`` with
    truncation enabled and padding set to ``max_length``; the "LABEL" entries
    are copied into a "labels" field on the tokenizer output.

    Returns the tokenizer output with the extra "labels" field.
    """
    encoded = tokenizer(df["TEXT"], truncation=True, padding="max_length")
    encoded["labels"] = df["LABEL"]

    return encoded

In [8]:
# tokenize the training split in batches
tokenized_train = train_hf.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/72330 [00:00<?, ? examples/s]

In [9]:
# tokenize the validation split in batches
tokenized_val = val_hf.map(function=tokenize_function, batched=True)

Map:   0%|          | 0/6996 [00:00<?, ? examples/s]

In [10]:
# display the tokenized training set (its features and row count)
tokenized_train

Dataset({
    features: ['ID', 'TEXT', 'LABEL', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 72330
})

### Model fine-tuning

In [11]:
# the labels are slightly imbalanced, so each class gets an
# inverse-frequency weight that is later handed to the loss function
num_neg, num_pos = (
    len(train.filter(pl.col("LABEL") == label)) for label in (0, 1)
)

# weight = total rows / (2 * rows in that class)
total_rows = len(train)
neg_weight = total_rows / (2 * num_neg)
pos_weight = total_rows / (2 * num_pos)

# order matters: index 0 is the weight for label 0, index 1 for label 1
class_weights = [neg_weight, pos_weight]
class_weights

[0.9827979781509865, 1.0178149273893955]

In [12]:
# slightly modified from https://discuss.huggingface.co/t/how-can-i-use-class-weights-when-training/1067/6

class CustomTrainer(Trainer):
    """Trainer that computes a class-weighted cross-entropy loss.

    The per-class weights are read from the notebook-level ``class_weights`` list.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model on ``inputs`` and return the weighted loss.

        "labels" is popped from ``inputs`` before the forward pass. Any extra
        keyword arguments are accepted and ignored.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, otherwise
        just ``loss``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # build the weight tensor directly on the device that holds the logits
        weights = torch.tensor(class_weights, device=logits.device)

        # weighted cross-entropy over flattened logits and labels
        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, self.model.config.num_labels),
            labels.view(-1),
            weight=weights,
        )

        if return_outputs:
            return loss, outputs
        return loss

In [13]:
# defining basic training arguments
training_args = TrainingArguments(
    # where checkpoints and logs are written
    output_dir="./results_1209/",
    logging_dir="./results_1209/logs",
    # optimisation
    num_train_epochs=5,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # evaluate and checkpoint every 1000 steps, log every 100
    eval_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    logging_steps=100,
    # options currently left disabled:
    #   warmup_steps=500,
    #   lr_scheduler_type="cosine",
    #   weight_decay=0.05,
    #   metric_for_best_model="eval_loss",
    #   greater_is_better=False,
    #   fp16=True,  # keep disabled when running on MPS
    #   ddp_find_unused_parameters=False,
)

In [14]:
# build the class-weighted trainer from the pieces prepared in earlier cells
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train,
    "eval_dataset": tokenized_val,
    "processing_class": tokenizer,
}
trainer = CustomTrainer(**trainer_kwargs)

# run the fine-tuning; the returned training summary is shown as the cell output
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=22605, training_loss=0.33618110227152403, metrics={'train_runtime': 38166.2049, 'train_samples_per_second': 9.476, 'train_steps_per_second': 0.592, 'total_flos': 9.5154113170944e+16, 'train_loss': 0.33618110227152403, 'epoch': 5.0})

### Actually getting the test labels and compiling .csv for the draft submission 

In [15]:
test = pl.read_csv("./data/test.csv")

# replace [SNIPPET] with the separator token, exactly as done for train/val.
# with_columns returns a NEW frame, so the result has to be assigned back;
# otherwise the test texts keep the literal "[SNIPPET]" marker that the model
# never saw during fine-tuning.
test = test.with_columns(
    pl.col("TEXT").str.replace(
        r"\[SNIPPET\]",
        tokenizer.sep_token
    )
)

def tokenize_function_testset(df):
    """Tokenize a batch of test examples.

    The "TEXT" entries are passed to the notebook-level ``tokenizer`` with
    truncation enabled and padding set to ``max_length``. No labels are
    attached here, unlike the train/val tokenization.
    """
    return tokenizer(df["TEXT"], truncation=True, padding="max_length")

test_hf = Dataset.from_polars(test)
tokenized_test = test_hf.map(tokenize_function_testset, batched=True)

Map:   0%|          | 0/899 [00:00<?, ? examples/s]

In [16]:
# checkpoint directories from the fine-tuning runs; only `best_model` is loaded here
best_model = "./results_1209/checkpoint-22605"
another_attempt = "./results_new_train/checkpoint-9000"

# restore the fine-tuned classifier and switch it to evaluation mode
trained_model = AutoModelForSequenceClassification.from_pretrained(best_model)
trained_model.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [17]:
# making and retrieving predictions
input_ids = torch.tensor(tokenized_test["input_ids"])
attention_mask = torch.tensor(tokenized_test["attention_mask"])

# run inference in mini-batches: pushing the whole padded test set through the
# model in a single forward pass can exhaust memory
inference_batch_size = 32
# feed each batch on whatever device the model's parameters live on
device = next(trained_model.parameters()).device

batch_logits = []
with torch.no_grad():
    for start in range(0, input_ids.shape[0], inference_batch_size):
        stop = start + inference_batch_size
        outputs = trained_model(
            input_ids=input_ids[start:stop].to(device),
            attention_mask=attention_mask[start:stop].to(device),
        )
        # keep the results on the CPU so they can be converted to numpy below
        batch_logits.append(outputs.logits.cpu())

# stitch the per-batch logits back together in the original row order
logits = torch.cat(batch_logits, dim=0)
test_labels = torch.argmax(logits, dim=-1).numpy()

In [18]:
# alternative (disabled): making and retrieving predictions through the Trainer
# predictions_test = trainer.predict(tokenized_test)
# logits = predictions_test.predictions
# test_labels = logits.argmax(axis=-1)

In [19]:
# pair every test ID with its predicted label and write the submission file
submission = pl.DataFrame({"ID": list(test["ID"]), "LABEL": list(test_labels)})
submission.write_csv("./results_1209.csv")