In [1]:
import os
# Expose only GPU index 4 to this process. The variable is set here, before
# `torch` is imported in the next cell, so CUDA enumerates a single device.
os.environ['CUDA_VISIBLE_DEVICES'] = '4'

In [2]:
import torch
# Sanity check: number of CUDA devices visible to this process.
torch.cuda.device_count()

1

In [3]:
import wandb

# Authenticate this session with Weights & Biases.
wandb.login()
# NOTE(review): per the W&B Hugging Face integration, "end" should upload the
# final model as an artifact when training finishes — confirm for the installed version.
os.environ["WANDB_LOG_MODEL"] = "end"

[34m[1mwandb[0m: Currently logged in as: [33mbunnynobugs[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [21]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, Trainer, \
    TrainingArguments
import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [5]:
# Open the W&B run that collects the metrics logged by this notebook.
wandb.init(project='ods-nlp-stackoverflow-project')

In [6]:
def classify_score(score: int) -> int:
    """Map an answer score to a binary class label.

    Args:
        score: Answer score (positive, zero, or negative).

    Returns:
        1 if ``score`` is strictly positive, otherwise 0.

    Raises:
        ValueError: If ``score`` compares as neither positive nor
            non-positive (e.g. NaN from a missing value). Previously such
            values fell through every branch and the function returned
            ``None``, silently producing a missing label.
    """
    if score > 0:
        return 1
    if score <= 0:
        return 0
    # Only reachable for unordered values such as NaN.
    raise ValueError(f'cannot classify answer score: {score!r}')

In [7]:
def _add_model_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with the model input text and target label added.

    Adds two columns:
        question_full_text: ``question_title`` and ``question_cleaned_text``
            joined by a single space.
        score_class: ``classify_score`` applied to ``answer_score``.
    """
    return df.assign(
        question_full_text=df['question_title'] + ' ' + df['question_cleaned_text'],
        score_class=df['answer_score'].apply(classify_score),
    )


# Both splits go through the same helper so their derived columns cannot drift apart.
train_df = _add_model_columns(pd.read_csv('data/train.csv'))
test_df = _add_model_columns(pd.read_csv('data/test.csv'))

In [8]:
# 10% subsample of the training frame, drawn with a fixed seed.
# NOTE(review): the Dataset built in the next cell uses the full `train_df`,
# not this sample — confirm whether that is intended.
train_sample_df = train_df.sample(
    random_state=42,
    frac=0.1,
)

In [9]:
# Wrap both pandas frames as Hugging Face datasets (train first, then test).
train_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, test_df)
)

In [10]:
# CONSTANTS
# Identifier of the pretrained checkpoint that is fine-tuned below.
model_checkpoint = 'intfloat/e5-small-v2'
# Number of target classes = distinct non-null labels present in the training split.
num_labels = train_df['score_class'].nunique(dropna=True)

In [11]:
# Tokenizer for the checkpoint, plus the padding collator built on top of it.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Pretrained encoder with a classification head sized to `num_labels`
# (the load warning reports the head weights as newly initialized).
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-small-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def preprocess_function(examples):
    """Tokenize question/answer pairs and attach the target label.

    Args:
        examples: Batch providing 'question_full_text',
            'answer_cleaned_text' and 'score_class'.

    Returns:
        The tokenizer output for the (question, answer) pairs, with
        truncation enabled and 'score_class' copied into 'label'.
    """
    encoded = tokenizer(
        examples['question_full_text'],
        examples['answer_cleaned_text'],
        truncation=True,
    )
    encoded['label'] = examples['score_class']
    return encoded

In [13]:
# Tokenize both splits in batches with the shared preprocessing function
# (train is mapped first, then test).
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/888359 [00:00<?, ? examples/s]

Map:   0%|          | 0/98763 [00:00<?, ? examples/s]

In [14]:
# The last path component of the checkpoint id names the output directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]
training_args = TrainingArguments(
    # Where checkpoints are written, and the overwrite policy.
    output_dir=f'{model_name}-finetuned',
    overwrite_output_dir=False,
    # Schedule: evaluate and save once per epoch, then reload the best checkpoint.
    num_train_epochs=2,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
)

In [15]:
# Assemble the trainer. No `compute_metrics` callback is supplied.
# NOTE(review): the test split doubles as the evaluation set, and
# `load_best_model_at_end=True` is set in `training_args`, so checkpoint
# selection is driven by the test data — consider a separate validation split.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Run fine-tuning with the trainer configured in the previous cell.
trainer.train()

Epoch,Training Loss,Validation Loss


In [35]:
# Close the active W&B run.
wandb.finish()

VBox(children=(Label(value='0.010 MB of 0.010 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▁
eval/runtime,█▁
eval/samples_per_second,▁█
eval/steps_per_second,▁█
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/grad_norm,▁▂▂▂▄▃▂▂▂▃▄▂▂▃▃▃▃▃▃▅▅▄▅▆▃▅▄▅▃▅▄▆▄█▃▄▃▆▄▄
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▆▆▅▅▅▅▅▄▄▄▄▄▄▄▃▃▄▃▃▃▂▃▂▂▃▁▂▂▃▂▂▁▂▂▂▂▂▂▂

0,1
eval/loss,0.59132
eval/runtime,500.3338
eval/samples_per_second,197.394
eval/steps_per_second,6.17
total_flos,1.1703723738041672e+17
train/epoch,2.0
train/global_step,55524.0
train/grad_norm,1.53055
train/learning_rate,0.0
train/loss,0.5801


In [20]:
# Run inference over the tokenized test split; the result's `predictions` and
# `label_ids` are consumed by the classification report in the next cell.
test_pred = trainer.predict(tokenized_test_dataset)

In [34]:
# Per-class precision/recall/F1: true labels vs. the highest-scoring class per row.
true_labels = test_pred.label_ids
predicted_labels = test_pred.predictions.argmax(axis=1)
print(classification_report(true_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.59      0.27      0.37     33588
           1       0.71      0.90      0.79     65175

    accuracy                           0.69     98763
   macro avg       0.65      0.59      0.58     98763
weighted avg       0.67      0.69      0.65     98763
