In [1]:
import numpy as np
import pandas as pd

import transformers
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Read the three homework CSV files in one pass. The public test file is bound
# to `valid` and the private test file to `test`.
train, valid, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Intern/intern_homework_train_dataset.csv",
        "Intern/intern_homework_public_test_dataset.csv",
        "Intern/intern_homework_private_test_dataset.csv",
    )
)
# Preview the first rows of the training frame.
train.head()

In [None]:
# Keep only the post title and the 24-hour like count, and expose the like
# count under the column name `labels`, which is the key the custom trainer's
# loss reads from each batch.
train_set = train[['title', 'like_count_24h']].rename(columns={'like_count_24h': 'labels'})
valid_set = valid[['title', 'like_count_24h']].rename(columns={'like_count_24h': 'labels'})

In [4]:
# Convert each pandas frame into a `Dataset` and collect them in a
# `DatasetDict` keyed by split name.
dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame)
        for split_name, frame in (("train", train_set), ("valid", valid_set))
    }
)
dataset

DatasetDict({
    train: Dataset({
        features: ['title', 'labels'],
        num_rows: 50000
    })
    valid: Dataset({
        features: ['title', 'labels'],
        num_rows: 10000
    })
})

In [5]:
from torch.utils.data import DataLoader

# --- Run configuration -------------------------------------------------------
BASE_MODEL = "bert-base-chinese"
LEARNING_RATE = 2e-5
# MAX_LENGTH = 256  # not currently used by the tokenization step
BATCH_SIZE = 4
EPOCHS = 20
SEED = 42

# Seed the random number generators *before* the model is instantiated: the
# sequence-classification head is not part of the pretrained checkpoint and is
# therefore randomly initialised, so without a seed every kernel restart would
# start training from different head weights.
transformers.set_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# num_labels=1 gives the head a single output, which this notebook trains as a
# regression value (see the MSE loss in RegressionTrainer).
model = AutoModelForSequenceClassification.from_pretrained(BASE_MODEL, num_labels=1)


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
def tokenize_function(examples, max_length=None):
    """Tokenize the ``"title"`` entry of ``examples`` with the notebook's tokenizer.

    Args:
        examples: Mapping-like object with a ``"title"`` entry (a batch of
            titles when used through ``Dataset.map(..., batched=True)``).
        max_length: Optional length to pad/truncate every sequence to. The
            default ``None`` is forwarded unchanged, which leaves the length to
            the tokenizer's own default and therefore matches the behaviour of
            calling the tokenizer without this argument.

    Returns:
        The tokenizer's output for the titles, padded with
        ``padding="max_length"`` and truncated with ``truncation=True``.
    """
    return tokenizer(
        examples["title"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Run the tokenizer over every split, one batch of rows per call, and show the
# resulting dataset structure.
tokenized_datasets = dataset.map(function=tokenize_function, batched=True)
tokenized_datasets

                                                                                                                                                             

DatasetDict({
    train: Dataset({
        features: ['title', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
    valid: Dataset({
        features: ['title', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [7]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

def compute_metrics_for_regression(eval_pred):
    """Compute regression metrics from a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Two-element iterable ``(predictions, labels)``. If
            ``predictions`` is a tuple, its first element is used.

    Returns:
        dict with the keys ``"mse"``, ``"mae"``, ``"mape"``, ``"r2"`` and
        ``"accuracy"``. ``"accuracy"`` is the fraction of samples whose
        absolute error is below 0.5, i.e. whose prediction rounds to the label
        when labels are integer-valued.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Multiple model outputs: the first one carries the regression values.
        predictions = predictions[0]

    # Flatten BOTH arrays. Reshaping only the labels to (N, 1) is correct when
    # predictions are (N, 1), but with 1-D predictions the subtraction below
    # would broadcast to an (N, N) matrix and silently corrupt the accuracy.
    predictions = np.asarray(predictions).reshape(-1)
    labels = np.asarray(labels).reshape(-1)

    mse = mean_squared_error(labels, predictions)
    mae = mean_absolute_error(labels, predictions)
    mape = mean_absolute_percentage_error(labels, predictions)
    r2 = r2_score(labels, predictions)

    # |error| < 0.5  <=>  squared error < 0.25
    squared_errors = (predictions - labels) ** 2
    accuracy = float(np.mean(squared_errors < 0.25))

    return {"mse": mse, "mae": mae, "mape": mape, "r2": r2, "accuracy": accuracy}

In [None]:
import torch

class RegressionTrainer(Trainer):
    """``Trainer`` subclass that trains on mean-squared error over one output column."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the MSE between the model's first output column and the labels.

        Args:
            model: Model called as ``model(**inputs)`` after the labels are removed.
            inputs: Batch dict; its ``"labels"`` entry is popped (the dict is
                mutated) so the model does not receive it.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just ``loss``.
            num_items_in_batch: Accepted for compatibility with transformers
                releases that pass this keyword to ``compute_loss``; unused.

        Returns:
            The scalar loss tensor, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        # First element of the outputs is the logits (labels were withheld);
        # take column 0 to get one value per example.
        logits = outputs[0][:, 0]
        # Labels built from an integer column arrive as an integer tensor;
        # cast to the logits dtype and flatten so mse_loss compares like with
        # like and never broadcasts (N,) against (N, 1).
        targets = labels.to(dtype=logits.dtype).reshape(-1)
        loss = torch.nn.functional.mse_loss(logits, targets)
        return (loss, outputs) if return_outputs else loss

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Where checkpoints are written; at most two are kept.
    output_dir="test_trainer_medium",
    save_total_limit=2,
    # Optimisation settings taken from the configuration constants.
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and save on the same per-epoch schedule, then reload the
    # checkpoint that scored best on the "accuracy" metric.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

In [None]:
# Wire the model, the training arguments, the tokenized splits and the metric
# function into the MSE-based trainer.
trainer = RegressionTrainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics_for_regression,
    eval_dataset=tokenized_datasets["valid"],
    train_dataset=tokenized_datasets["train"],
)

# Start fine-tuning; the cell displays whatever `train()` returns.
trainer.train()



Epoch,Training Loss,Validation Loss,Mse,Mae,Mape,R2,Accuracy
1,42482.868,28223.775391,28223.775391,41.885956,1.934686,-0.006777,0.0079
2,58284.452,28254.5,28254.501953,41.348831,1.848674,-0.007873,0.0091
3,21652.854,28302.179688,28302.179688,40.618107,1.727373,-0.009573,0.0074
