In [None]:
!pip install wandb
!pip install datasets
!pip install -U accelerate
!pip install -U transformers

Collecting wandb
  Downloading wandb-0.16.6-py3-none-any.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.45.0-py2.py3-none-any.whl (267 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m267.1/267.1 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->w

In [None]:
import pandas as pd
import numpy as np
import random
import pickle
import torch
import wandb
import os

from tqdm import tqdm
from collections import Counter, defaultdict

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score

from transformers import Trainer, TrainingArguments, EvalPrediction, DataCollatorWithPadding
from transformers import DataCollatorWithPadding, AutoTokenizer, AutoModelForSequenceClassification

from datasets import load_dataset, load_metric, Dataset, DatasetDict

# Seed every random number generator used in this notebook (PyTorch, NumPy, stdlib).
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(42)

# Colab helper module; presumably only importable inside a Google Colab runtime.
from google.colab import drive

# Mount Google Drive at /content/drive; the CSV shards are read from it in a later cell.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Authenticate with Weights & Biases; the training arguments below set report_to = "wandb".
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Directory on the mounted Drive that holds the training shards. The absolute path
# matches the mount point passed to drive.mount('/content/drive'), so loading no longer
# depends on the kernel's current working directory.
DATA_DIR = "/content/drive/MyDrive/multilabel_emoji_prediction"

train_000 = pd.read_csv(os.path.join(DATA_DIR, "train-00000.csv"))
train_001 = pd.read_csv(os.path.join(DATA_DIR, "train-00001.csv"))

# ignore_index = True gives the combined frame a fresh, unique 0..N-1 index. Without it
# both shards keep their own 0-based index, so index labels repeat and any later
# label-based operation (e.g. pool.drop(sampled_rows.index)) would also remove rows
# from the other shard that merely share a label with a sampled row.
df = pd.concat([train_000, train_001], ignore_index = True)

In [None]:
# Every column from the third one onward is used as a label column.
labels = df.columns[2:]

# Two-way lookup tables between label names and their integer positions.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Re-seed the stdlib, NumPy and PyTorch generators before the split / sampling cells.
for _reseed in (random.seed, np.random.seed, torch.manual_seed):
    _reseed(42)

In [None]:
def compute_depth(pool):
    """Return the smallest label-column sum in `pool`, capped at its row count.

    Args:
        pool: DataFrame that contains every column named in the module-level `labels`.

    Returns:
        The minimum of `pool.shape[0]` and the sum of each label column. With 0/1
        label columns this is the number of rows available for the rarest label.
    """
    label_totals = [np.sum(pool[label]) for label in labels]
    # The row count goes last so that, like the original running minimum, it only acts
    # as the upper bound.
    return min(label_totals + [pool.shape[0]])

# Target proportions of the three splits.
train_size = 0.6
test_size = 0.2
val_size = 0.2

# Both held-out pools must have a depth (see compute_depth) above this threshold.
min_label_depth = 100
# Upper bound on the number of seeds tried before giving up.
max_split_attempts = 50

# Retrying with an unchanged random_state reproduces the exact same split, so a failing
# split would spin forever. Each attempt therefore uses a new seed, starting at 42 so
# that a split which passes on the first try is identical to the previous behaviour.
for split_seed in range(42, 42 + max_split_attempts):
    train_pool, test_pool = train_test_split(df, test_size = test_size + val_size, random_state = split_seed)
    # Split the held-out part again into test and validation pools.
    test_pool, val_pool = train_test_split(test_pool, test_size = val_size/(1 - train_size), random_state = split_seed)

    if compute_depth(test_pool) > min_label_depth and compute_depth(val_pool) > min_label_depth:
        break
else:
    raise RuntimeError(
        f"No train/test/val split with more than {min_label_depth} rows per label "
        f"in the test and val pools was found in {max_split_attempts} attempts"
    )

In [None]:
def _sample_per_label(pool, n_per_label, label_names, seed = 42):
    """Draw `n_per_label` rows for every label and return them as one DataFrame.

    For each label, rows whose label column equals 1 are sampled from the rows that
    have not been picked for an earlier label. Sampling falls back to drawing with
    replacement only when fewer than `n_per_label` such rows are left.

    Args:
        pool: DataFrame to sample from. It is not modified, so the calling cell can be
            re-run and produces the same result.
        n_per_label: Number of rows to draw for each label.
        label_names: Iterable of label column names, processed in order.
        seed: random_state passed to every `DataFrame.sample` call.

    Returns:
        DataFrame with `n_per_label` rows per label (empty if `label_names` is empty).
        Rows can still carry several labels, so per-label totals may exceed
        `n_per_label`.

    Raises:
        ValueError: if no unsampled row is left for some label.
    """
    # A fresh unique index guarantees that the label-based drop below removes exactly
    # the sampled rows, even if `pool` arrives with repeated index labels.
    remaining = pool.reset_index(drop = True)
    sampled_parts = []

    for label in label_names:
        candidates = remaining[remaining[label] == 1]
        if candidates.empty:
            raise ValueError(f"No rows with label {label!r} are left to sample from")

        sampled_rows = candidates.sample(
            n = n_per_label,
            random_state = seed,
            replace = len(candidates) < n_per_label,
        )
        remaining = remaining.drop(sampled_rows.index)
        sampled_parts.append(sampled_rows)

    if not sampled_parts:
        return pool.iloc[0:0]
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(sampled_parts)


# The splits are built only from the sampled rows: no arbitrary first row of the pool is
# prepended any more (it stayed in the pool and could be drawn a second time).
df_train = _sample_per_label(train_pool, 1000, labels)
df_test = _sample_per_label(test_pool, 100, labels)
df_val = _sample_per_label(val_pool, 100, labels)

In [None]:
# One weight per label: (row count - label column sum) / label column sum.
# With 0/1 label columns this is the ratio of negative to positive rows.
size = df_train.shape[0]

label_totals = [np.sum(df_train[label]) for label in labels]
weights = [(size - total)/total for total in label_totals]

In [None]:
# Wrap each pandas split in a `Dataset`, in train / test / val order.
train_dataset, test_dataset, val_dataset = (
    Dataset.from_pandas(split_frame) for split_frame in (df_train, df_test, df_val)
)

# Bundle the three splits under the keys used by the training and evaluation cells.
multilabel_dataset_dict = DatasetDict(
    {"train": train_dataset, "test": test_dataset, "val": val_dataset}
)

In [None]:
from transformers import AutoTokenizer, RobertaForSequenceClassification

In [None]:
# Use the GPU when PyTorch reports one as available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

print(f"Using '{device}' device")

# Checkpoint names noted as alternatives for `model_name`:
#   DistilBERT : "distilbert/distilbert-base-cased"
#   BERT       : "google-bert/bert-base-cased"
#   BERTweet   : "vinai/bertweet-base"
#   RoBERTa    : "FacebookAI/roberta-base"
model_name = "google-bert/bert-base-cased"
problem_type = "multi_label_classification"

# Sequence-classification model with one output per label column; the id <-> label
# maps are passed along with the label count.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels = len(labels),
    problem_type = problem_type,
    id2label = id2label,
    label2id = label2id,
)

# Tokenizer for the same checkpoint with model_max_length set to 128, and the padding
# collator built on top of it.
tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = 128)
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

Using 'cuda' device


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
def preprocess_data(examples):
    """Tokenize a batch of texts and attach a multi-hot label matrix.

    Args:
        examples: Batch mapping of column name -> list of values. It must contain a
            "text" column; label columns listed in the module-level `labels` are
            used when present and skipped otherwise.

    Returns:
        The tokenizer output for `examples["text"]` (padded to the longest text in
        the batch, with truncation enabled) plus a 'labels' entry: a float array of
        shape (batch size, number of labels) holding 1 where the label value is
        truthy and 0 elsewhere.
    """
    encoding = tokenizer(examples["text"], padding = "longest", truncation = True)

    label_array = np.zeros((len(examples["text"]), len(labels)))
    for label_name in labels:
        if label_name not in examples:
            continue
        column = label2id[label_name]
        # Fill the whole column for this label in one assignment.
        label_array[:, column] = [1 if value else 0 for value in examples[label_name]]

    encoding['labels'] = label_array
    return encoding

# batch_size = None hands each split to preprocess_data as a single batch.
multilabel_preprocessed = multilabel_dataset_dict.map(preprocess_data, batched = True, batch_size = None)

Map:   0%|          | 0/30001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

Map:   0%|          | 0/3001 [00:00<?, ? examples/s]

In [None]:
multilabel_training_args = TrainingArguments(
    # Output locations and experiment tracking.
    output_dir = "/multilabel_output",
    overwrite_output_dir = True,
    logging_dir = "/multilabel_log",
    report_to = "wandb",
    # Optimisation settings.
    seed = 42,
    learning_rate = 2e-5,
    num_train_epochs = 35,
    per_device_train_batch_size = 64,
    per_device_eval_batch_size = 256,
    # Evaluate and checkpoint on the same 400-step cadence.
    do_eval = True,
    evaluation_strategy = "steps",
    eval_steps = 400,
    save_strategy = "steps",
    save_steps = 400,
    # Reload the checkpoint with the highest "eval_f1" once training ends; the metrics
    # function used by the trainer returns an 'f1' entry.
    load_best_model_at_end = True,
    metric_for_best_model = "eval_f1",
    greater_is_better = True,
)

In [None]:
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)), computed without overflow.

    The direct formula evaluates exp(-x), which overflows (and emits a RuntimeWarning)
    for large negative inputs. Here the exponential is only ever taken of a
    non-positive number, so it stays within (0, 1].

    Args:
        x: Scalar or array-like of real numbers (e.g. model logits).

    Returns:
        Values in [0, 1] with the same shape as `x`; a NumPy scalar for scalar input.
    """
    x = np.asarray(x)
    # exp(-|x|) cannot overflow; for very large |x| it underflows harmlessly to 0.
    decay = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + exp(-x));  x < 0: exp(x) / (1 + exp(x)), the same value rewritten.
    result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Unwrap 0-d results so scalar input still yields a scalar.
    return result[()] if result.ndim == 0 else result

def compute_multilabel_metrics(eval_pred: EvalPrediction):
    """Compute micro-averaged precision, recall and F1 for multi-label predictions.

    Args:
        eval_pred: Pair that unpacks into (logits, reference labels); both are
            expected to be 2-D arrays of shape (examples, labels).

    Returns:
        Dict with the keys 'precision', 'recall' and 'f1'.
    """
    logits, true_labels = eval_pred
    probs = sigmoid(logits)
    preds = np.round(probs)

    # Rows where no label was switched on by rounding get their most probable label.
    unlabeled_rows = np.flatnonzero(np.max(preds, axis = 1) == 0.0)
    preds[unlabeled_rows, np.argmax(probs[unlabeled_rows], axis = 1)] = 1.0

    scorers = {'precision': precision_score, 'recall': recall_score, 'f1': f1_score}
    return {
        metric_name: scorer(true_labels, preds, average='micro')
        for metric_name, scorer in scorers.items()
    }

In [None]:
from torch import nn, FloatTensor
from torch.nn import BCEWithLogitsLoss

# Per-label positive weights for the loss. float32 is requested explicitly: the entries
# of `weights` are NumPy floats, which would otherwise produce a float64 tensor.
class_weights = torch.tensor(weights, dtype = torch.float32, device = device)

class WeightedLossTrainer(Trainer):
    """Trainer whose loss is BCE-with-logits with a per-label positive weight."""

    def compute_loss(self, model, inputs, return_outputs=False, class_weights=class_weights,
                     num_items_in_batch=None, **kwargs):
        """Compute the class-weighted multi-label loss for one batch.

        Args:
            model: Model to run; its output must expose `.logits`.
            inputs: Batch dict; the "labels" entry is removed before the forward pass.
            return_outputs: When True, return `(loss, outputs)` instead of only the loss.
            class_weights: 1-D tensor with one positive weight per label.
            num_items_in_batch: Accepted (and ignored) because newer transformers
                releases pass it to `compute_loss`; without it those releases fail
                with a TypeError.
            **kwargs: Any further keyword arguments a Trainer version may pass; ignored.

        Returns:
            The scalar loss, or `(loss, outputs)` when `return_outputs` is True.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Read the label count from the logits instead of `model.num_labels`, which is
        # not available when the model is wrapped (e.g. by DataParallel).
        num_labels = logits.shape[-1]
        # Keep weights and targets on the same device / dtype as the logits.
        pos_weight = class_weights.to(device = logits.device, dtype = logits.dtype)
        targets = labels.view(-1, num_labels).to(logits.dtype)

        loss_compute = BCEWithLogitsLoss(pos_weight = pos_weight)
        loss = loss_compute(logits.view(-1, num_labels), targets)

        return (loss, outputs) if return_outputs else loss

In [None]:
# Trainer using the class-weighted loss: trains on the 'train' split and evaluates on
# the 'val' split with the micro-averaged metrics function.
multilabel_trainer = WeightedLossTrainer(
    model = model,
    args = multilabel_training_args,
    train_dataset = multilabel_preprocessed['train'],
    eval_dataset = multilabel_preprocessed['val'],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_multilabel_metrics,
)

In [None]:
# Run fine-tuning with the settings held in multilabel_training_args.
multilabel_trainer.train()

In [None]:
# Model outputs for the test split. Despite the name these are not probabilities yet:
# the next cell applies the sigmoid to them.
probs = multilabel_trainer.predict(multilabel_preprocessed['test']).predictions

In [None]:
# `probs` (from the previous cell) holds the model's pre-sigmoid outputs, so they are
# turned into probabilities first. `sigmoid` is the helper defined next to the metrics
# function; it is reused here instead of being defined a second time.
test_probs = sigmoid(probs)
preds = np.round(test_probs)

# Guarantee at least one predicted label per example. The fallback label has to be
# chosen from the probabilities: the rounded row is all zeros at this point, so an
# argmax over it would always select column 0 regardless of the model output.
empty_rows = np.flatnonzero(preds.max(axis = 1) == 0.0)
preds[empty_rows, test_probs[empty_rows].argmax(axis = 1)] = 1.0

preds_arr = preds

In [None]:
# Ground truth: the label columns of the test split (third column onward) as an array.
df_arr = df_test.iloc[:, 2:].to_numpy(copy = True)

In [None]:
# Two-way lookup tables between emoji column names and their position among the label
# columns of the test split.
id2emoji = dict(enumerate(df_test.columns[2:]))
emoji2id = {emoji: idx for idx, emoji in id2emoji.items()}

In [None]:
# Per-label scores (average=None) for the red-heart column only.
red_heart_idx = emoji2id['❤️']
red_heart_reports = (
    ("Red Heart Precision:", precision_score),
    ("Red Heart Recall:", recall_score),
    ("Red Heart F1 Score:", f1_score),
)

for report_title, scorer in red_heart_reports:
    print(report_title, scorer(df_arr, preds_arr, average=None, zero_division=0)[red_heart_idx])

Red Heart Precision: 0.24205378973105135
Red Heart Recall: 0.4782608695652174
Red Heart F1 Score: 0.32142857142857145


In [None]:
# F1 over all labels of the test split, micro- and macro-averaged.
micro_f1, macro_f1 = (
    f1_score(df_arr, preds_arr, average=averaging) for averaging in ('micro', 'macro')
)

print("Micro F1 Score:", micro_f1, "Macro F1 Score:", macro_f1)

Micro F1 Score: 0.41747676530285227 Macro F1 Score: 0.4287946302369329


In [None]:
# Cosine similarity between the true and predicted label vector of each test example;
# the double-bracket index keeps every row 2-D, as cosine_similarity expects.
cosine_similarities = [
    cosine_similarity(df_arr[[row]], preds_arr[[row]])[0][0]
    for row in range(df_arr.shape[0])
]

print("Cosine Similarity:", np.mean(cosine_similarities))

Cosine Similarity: 0.4192820468841289


In [None]:
# Serialise the fine-tuned model object into the current working directory.
# NOTE(review): a pickle of the whole model object can only be loaded where the same
# classes are importable, and unpickling files from untrusted sources is unsafe;
# consider also exporting with the model's own save method.
file_path = 'model.pkl'

with open(file_path, mode = 'wb') as model_file:
    pickle.dump(model, model_file)

print("Model saved to", file_path)

Model saved to model.pkl
