In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, \
    TrainingArguments
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split

In [2]:
def classify_score(score: int):
    """Map a raw answer score to a binary class label.

    Returns 1 for a strictly positive score and 0 for a zero or negative
    score. A value for which neither comparison holds (e.g. NaN) yields
    None.
    """
    if score <= 0:
        return 0
    if score > 0:
        return 1
    # Neither comparison was true (NaN-like input): no label.
    return None

In [3]:
# Load each split and derive the two columns the rest of the notebook uses:
#   question_full_text - title and cleaned body joined by a single space
#   score_class        - label produced by classify_score from answer_score
train_df = pd.read_csv('data/sample_train.csv').assign(
    question_full_text=lambda d: d['question_title'] + ' ' + d['question_cleaned_text'],
    score_class=lambda d: d['answer_score'].apply(classify_score),
)
test_df = pd.read_csv('data/sample_test.csv').assign(
    question_full_text=lambda d: d['question_title'] + ' ' + d['question_cleaned_text'],
    score_class=lambda d: d['answer_score'].apply(classify_score),
)

In [4]:
# Reproducible 10% random subsample of the training frame (fixed seed).
# NOTE(review): the dataset cells visible in this notebook are built from
# train_df, not from this sample - confirm whether it is still needed.
train_sample_df = train_df.sample(
    frac=0.1,
    random_state=42,
)

In [5]:
# Wrap both pandas frames as `datasets.Dataset` objects (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

In [6]:
# CONSTANTS
# Hugging Face checkpoint used for both the tokenizer and the classifier.
model_checkpoint = 'intfloat/e5-small-v2'
# Number of distinct non-missing labels present in the training frame.
# NOTE(review): this counts only classes that actually occur in train_df -
# confirm every expected class is represented there.
num_labels = len(train_df['score_class'].dropna().unique())

In [7]:
# Tokenizer and sequence-classification model both come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)
# Collator that pads each batch using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-small-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def preprocess_function(examples):
    """Tokenize question/answer pairs and attach their class labels.

    `examples` is indexed by column name; this reads the
    'question_full_text', 'answer_cleaned_text' and 'score_class' entries.
    The question text and the answer text are passed to the module-level
    `tokenizer` as a pair with truncation enabled, and the labels are
    stored under the 'label' key of the returned encoding.
    """
    questions = examples['question_full_text']
    answers = examples['answer_cleaned_text']
    encoded = tokenizer(questions, answers, truncation=True)
    encoded['label'] = examples['score_class']
    return encoded

In [9]:
# Apply the batched preprocessing to each split (train first, then test).
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/78820 [00:00<?, ? examples/s]

Map:   0%|          | 0/19580 [00:00<?, ? examples/s]

In [10]:
# Use the last path component of the checkpoint id to name the output directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]
training_args = TrainingArguments(
    # Where checkpoints are written; existing contents may be overwritten.
    output_dir=f'{model_name}-finetuned',
    overwrite_output_dir=True,
    # Optimisation settings.
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [11]:
# Assemble the Trainer from the model, arguments, tokenized splits, tokenizer
# and padding collator defined in the earlier cells.
# No compute_metrics callback is supplied here.
# NOTE(review): the tokenized test split is used as the evaluation set, and
# training_args requests load_best_model_at_end - confirm that selecting the
# checkpoint on the test split is intended.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


In [None]:
# Start fine-tuning with the Trainer instance built in the previous cell.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mbunnynobugs[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.6118,0.616423
