In [None]:
# Mount Google Drive at /content/drive; main() reads its CSV files from a
# folder below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import datetime
import json
import os.path
import pickle
from copy import deepcopy

import matplotlib.pyplot as plt
import pandas as pd
import torch
import transformers
!pip install datasets
from datasets import Dataset
from sklearn.metrics import f1_score
!pip install scikit-multilearn as sklearn
!pip install sklearn.preprocessing
from sklearn.metrics import f1_score
from sklearn.preprocessing import MultiLabelBinarizer
#from skmultilearn.model_selection import IterativeStratification
from torch.nn import BCEWithLogitsLoss
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaConfig, TrainingArguments, Trainer, \
    TrainerState, TrainerControl

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
# scikit-multilearn supplies IterativeStratification, which
# k_fold_cross_validation uses to build its multi-label folds.
!pip install scikit-multilearn
from skmultilearn.model_selection import IterativeStratification

Collecting scikit-multilearn
  Using cached scikit_multilearn-0.2.0-py3-none-any.whl.metadata (6.0 kB)
Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/89.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.4/89.4 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0


In [None]:
class LogCallback(transformers.TrainerCallback):
    """Trainer callback that mirrors log entries to a JSONL file and also
    evaluates on the training set at the end of each evaluated epoch.

    Args:
        trainer: The Trainer this callback is attached to. It is kept so that
            ``on_epoch_end`` can call ``trainer.evaluate`` on the training data.

    Attributes set in ``__init__``:
        init_time: ISO-formatted creation timestamp of the callback.
        file_name: Name of the JSONL log file (derived from ``init_time``),
            created inside ``args.logging_dir``.
    """

    def __init__(self, trainer):
        super().__init__()
        self._trainer = trainer
        self.init_time = datetime.datetime.now().isoformat()
        self.file_name = f"log-train-{self.init_time}.jsonl"

    def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Append the most recent entry of ``state.log_history`` as one JSON line."""
        super().on_log(args, state, control, **kwargs)
        if not state.log_history:
            # Nothing has been recorded yet, so there is no entry to write.
            return
        # The logging directory is not guaranteed to exist; without this the
        # open() below fails with FileNotFoundError on the first log event.
        os.makedirs(args.logging_dir, exist_ok=True)
        log_path = os.path.join(args.logging_dir, self.file_name)
        with open(log_path, "a+") as out_file:
            out_file.write(json.dumps(state.log_history[-1]))
            out_file.write("\n")

    def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Run an extra evaluation on the training set, reported under the "train" prefix.

        Only acts when the Trainer is about to evaluate anyway
        (``control.should_evaluate``); otherwise returns ``None``.
        """
        if control.should_evaluate:
            # Snapshot the control flags before the nested evaluate() call and
            # hand the snapshot back afterwards — presumably because the nested
            # call alters the flags; confirm against the Trainer callback flow.
            control_copy = deepcopy(control)
            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
            return control_copy

In [None]:
def train_and_evaluate(model, tokenizer, mlb, train_dataset, val_dataset, num_epochs):
    """Fine-tune ``model`` for multi-label classification and evaluate every epoch.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        tokenizer: Tokenizer forwarded to ``collate_data`` for batching.
        mlb: Fitted label binarizer forwarded to ``collate_data``.
        train_dataset: Dataset used for training (rows are passed to
            ``collate_data`` unchanged because ``remove_unused_columns=False``).
        val_dataset: Dataset evaluated at the end of each epoch.
        num_epochs: Number of training epochs.

    Returns:
        The ``Trainer`` after training, so callers can inspect
        ``trainer.state.log_history``, run ``trainer.evaluate`` /
        ``trainer.predict`` or save the model. Existing callers that ignore the
        return value are unaffected.
    """
    training_args = TrainingArguments(
        output_dir='./models',
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        learning_rate=5e-5,
        do_train=True,
        do_eval=True,
        eval_strategy='epoch',
        eval_steps=1,
        num_train_epochs=num_epochs,
        logging_strategy='steps',
        logging_dir="./logs",
        logging_steps=10,
        remove_unused_columns=False,
        save_strategy='epoch',
    )

    loss_func = BCEWithLogitsLoss()

    def multilabel_loss(outputs, labels, num_items_in_batch=None):
        # Independent sigmoid/BCE per label; num_items_in_batch is accepted
        # because the Trainer passes it, but it is not used.
        return loss_func(outputs.logits, labels)

    def macro_f1(pred):
        # Same decision rule as before (round the sigmoid), but hand
        # scikit-learn a NumPy array instead of a torch tensor.
        probabilities = torch.sigmoid(torch.as_tensor(pred.predictions))
        predicted = probabilities.round().numpy()
        # zero_division=0 yields the same score as the default while silencing
        # the warning for labels that are never predicted.
        return {"f1": f1_score(pred.label_ids, predicted, average="macro", zero_division=0)}

    trainer = Trainer(
        args=training_args,
        model=model,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_loss_func=multilabel_loss,
        compute_metrics=macro_f1,
        data_collator=lambda d: collate_data(d, tokenizer, mlb)
    )

    # LogCallback needs the trainer instance, so it is attached after construction.
    trainer.add_callback(LogCallback(trainer))

    trainer.train()
    return trainer

In [None]:
def collate_data(data, tokenizer, mlb):
    """Turn a list of dataset rows into a padded model batch with multi-hot labels.

    Args:
        data: Iterable of rows; each row is indexed with ``"context"`` (the
            text) and ``"DA_list"`` (the row's labels).
        tokenizer: Callable tokenizer; called on the list of texts.
        mlb: Fitted binarizer whose ``transform`` maps label lists to a
            multi-hot matrix.

    Returns:
        The tokenizer's output with an added ``"labels"`` entry holding a
        ``float32`` tensor (the dtype ``BCEWithLogitsLoss`` expects).
    """
    texts = [d["context"] for d in data]
    labels = mlb.transform([d["DA_list"] for d in data])

    batch = tokenizer(texts,
            add_special_tokens=True,
            padding="longest",
            # Truncate over-long contexts to the tokenizer's maximum length;
            # without this, a sequence longer than the model's position
            # embeddings crashes the forward pass.
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt')

    batch["labels"] = torch.tensor(labels, dtype=torch.float32)
    return batch

In [None]:
def k_fold_cross_validation(dataframe, tokenizer, k=5, epochs=10):
    """Train and evaluate one freshly loaded model per multi-label stratified fold.

    Args:
        dataframe: DataFrame with a ``'DA_list'`` column of label lists (plus
            whatever columns ``collate_data`` reads).
        tokenizer: Tokenizer forwarded to ``train_and_evaluate``.
        k: Number of folds.
        epochs: Training epochs per fold.

    Returns:
        None. Each fold is trained via ``train_and_evaluate``.
    """
    mlb = MultiLabelBinarizer()
    labels = mlb.fit_transform(dataframe['DA_list'])

    stratifier = IterativeStratification(n_splits=k, order=1)
    splits = stratifier.split(dataframe, labels)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    for fold_number, (train_index, test_index) in enumerate(splits):
        train_df = dataframe.iloc[train_index]
        test_df = dataframe.iloc[test_index]
        train_dataset = Dataset.from_pandas(train_df)
        test_dataset = Dataset.from_pandas(test_df)

        # Load the pretrained roberta-base weights for every fold. Building the
        # model from a bare RobertaConfig only copies the architecture and
        # leaves all weights randomly initialised, i.e. trains from scratch.
        model = RobertaForSequenceClassification.from_pretrained(
            'roberta-base', num_labels=len(mlb.classes_)
        )
        model.to(device)

        # Train and evaluate model across specified epochs
        train_and_evaluate(model, tokenizer, mlb, train_dataset, test_dataset, epochs)

In [None]:
def compare_label_distributions(label_counts, labels):
    """Tabulate absolute per-label count differences for every pair of folds.

    Args:
        label_counts: Sequence of fold records; each record is indexed with
            ``'fold'``, ``'train'`` and ``'test'``, where the latter two map a
            label to its count.
        labels: Labels to compare; a label missing from a fold counts as 0.

    Returns:
        A DataFrame with one row per unordered fold pair and the columns
        ``Fold_1``, ``Fold_2``, ``Train_Diff`` and ``Test_Diff`` (the two diff
        columns hold ``{label: absolute difference}`` dicts).
    """
    def _gap(first, second, split):
        # Absolute count difference per label for one split ('train' / 'test').
        return {
            label: abs(first[split].get(label, 0) - second[split].get(label, 0))
            for label in labels
        }

    folds = list(label_counts)
    rows = []
    # Visit each unordered pair exactly once (left index < right index).
    for left in range(len(folds)):
        for right in range(left + 1, len(folds)):
            earlier, later = folds[left], folds[right]
            train_gap = _gap(earlier, later, 'train')
            test_gap = _gap(earlier, later, 'test')
            rows.append({
                'Fold_1': earlier['fold'],
                'Fold_2': later['fold'],
                'Train_Diff': train_gap,
                'Test_Diff': test_gap,
            })

    # A DataFrame is easier to display and to save than the raw list.
    return pd.DataFrame(rows)

In [None]:
def save_results_to_file(data, file_path, file_format='csv'):
    """Write ``data`` to ``file_path`` as CSV or JSON.

    Args:
        data: A DataFrame, or anything ``pd.DataFrame`` accepts (CSV), or any
            JSON-serialisable object (JSON). A DataFrame saved as JSON is
            written as a list of row records.
        file_path: Destination path.
        file_format: ``'csv'`` (default) or ``'json'``.

    Raises:
        ValueError: If ``file_format`` is neither ``'csv'`` nor ``'json'``.
    """
    # Validate first so an unsupported format never touches the filesystem.
    if file_format not in ('csv', 'json'):
        raise ValueError("Unsupported file format. Use 'csv' or 'json'.")

    if file_format == 'csv':
        frame = data if isinstance(data, pd.DataFrame) else pd.DataFrame(data)
        frame.to_csv(file_path, index=False)
        return

    if isinstance(data, pd.DataFrame):
        # json.dump cannot serialise a DataFrame; round-trip through pandas'
        # own JSON encoder to obtain plain lists/dicts.
        payload = json.loads(data.to_json(orient='records'))
    else:
        payload = data

    # Serialise before opening the file so a failure cannot leave a truncated
    # file behind.
    text = json.dumps(payload, indent=4)
    with open(file_path, 'w') as f:
        f.write(text)

In [None]:
def plot_label_distribution(label_counts, labels):
    """Plot, per label, its train and test counts across folds.

    Args:
        label_counts: Sequence of fold records indexed with ``'train'`` and
            ``'test'``, each mapping a label to its count.
        labels: Labels to draw; a label missing from a fold counts as 0.

    Train counts are drawn as solid lines and test counts as dashed lines; the
    figure is shown with ``plt.show()``.
    """
    # One series per label, with one value per fold.
    train_series = {
        name: [fold['train'].get(name, 0) for fold in label_counts]
        for name in labels
    }
    test_series = {
        name: [fold['test'].get(name, 0) for fold in label_counts]
        for name in labels
    }

    # Folds are numbered from 1 on the x axis.
    fold_numbers = range(1, len(label_counts) + 1)

    fig, ax = plt.subplots(figsize=(14, 8))
    for name in labels:
        ax.plot(fold_numbers, train_series[name], label=f'Train {name}')
        ax.plot(fold_numbers, test_series[name], label=f'Test {name}', linestyle='--')

    ax.set_xlabel('Fold Number')
    ax.set_ylabel('Label Count')
    ax.set_title('Label Distribution Across Folds')
    ax.legend()
    plt.show()

In [None]:
def load_dataset(path):
    """Read a CSV and derive the text and label-list columns used for training.

    Args:
        path: Path of a CSV file with a ``context`` column and a ``DA`` column
            of comma-separated labels.

    Returns:
        The DataFrame with ``context`` coerced to ``str`` (missing values become
        ``''``) and a new ``DA_list`` column holding each row's sorted, unique
        labels. Non-string ``DA`` cells (e.g. missing values) yield ``[]``.
    """
    def _split_labels(raw):
        if not isinstance(raw, str):
            return []
        # Spaces are removed everywhere, as before (so "thank you" -> "thankyou").
        tokens = raw.replace(" ", "").split(',')
        # Drop empty tokens: a trailing or doubled comma or a blank cell would
        # otherwise introduce a bogus '' label class.
        return sorted({token for token in tokens if token})

    dataframe = pd.read_csv(path)
    # Replace NaN with an empty string and make sure every entry is a str.
    dataframe['context'] = dataframe['context'].fillna('').astype(str)
    dataframe['DA_list'] = dataframe['DA'].apply(_split_labels)

    return dataframe

57207cb4b386d722e7bacf22c2584befb6c34e78

In [None]:
def main():
    """Fine-tune roberta-base on the train split and evaluate on the test split.

    Steps: load both CSVs from Google Drive, fit the label binarizer, save the
    binarizer to ``models/mlb.pkl`` (the file ``run()`` loads), load the
    pretrained model with one output per label, and train for 30 epochs.
    """
    drive_dir = "/content/drive/MyDrive/HCI Master courses/THESIS"
    # makedirs(exist_ok=True) neither fails on a missing parent nor on an
    # already existing directory.
    os.makedirs(os.path.join(drive_dir, "logs"), exist_ok=True)

    data_test = load_dataset(os.path.join(drive_dir, "optimal_test.csv"))
    data_train = load_dataset(os.path.join(drive_dir, "optimal_train.csv"))

    # Fit once on the labels of BOTH splits. Calling fit() twice keeps only the
    # classes of the last call, which would drop every label that occurs in the
    # training data but not in the test data.
    mlb = MultiLabelBinarizer()
    mlb.fit(pd.concat([data_train['DA_list'], data_test['DA_list']], ignore_index=True))

    # Persist the binarizer next to the checkpoints so inference can map model
    # outputs back to label names with exactly this class order.
    os.makedirs("models", exist_ok=True)
    with open("models/mlb.pkl", "wb") as f:
        pickle.dump(mlb, f)

    dataset_train = Dataset.from_pandas(data_train)
    dataset_test = Dataset.from_pandas(data_test)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Load tokenizer
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    # Load the pretrained weights. Building the model from a bare RobertaConfig
    # only copies the architecture and leaves all weights randomly initialised.
    model = RobertaForSequenceClassification.from_pretrained(
        'roberta-base', num_labels=len(mlb.classes_)
    )
    model.to(device)

    train_and_evaluate(model, tokenizer, mlb, dataset_train, dataset_test, 30)

    # Alternative: cross-validate instead of using the fixed split, e.g.
    # k_fold_cross_validation(dataframe, tokenizer, k=5, epochs=10)


if __name__ == '__main__':
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mdarya-zhookova[0m ([33mdarya-zhookova-bauhaus-university-weimar[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1
1,0.1252,0.11948,0.144123
2,0.0935,0.095599,0.304754
3,0.0877,0.089145,0.355106
4,0.0767,0.085087,0.369596
5,0.0639,0.08012,0.406435
6,0.0553,0.086763,0.3992
7,0.0562,0.081532,0.435444
8,0.0429,0.082527,0.483648
9,0.0431,0.087879,0.490273
10,0.0409,0.088811,0.52764


# Run_classifier

In [None]:
def run():
    """Load the saved binarizer and checkpoint and classify one example sentence.

    Returns:
        The decoded labels for the example as returned by
        ``mlb.inverse_transform`` (a list with one tuple of label names). The
        labels are also printed.
    """
    # load binarizer
    # NOTE: pickle.load executes arbitrary code from the file; only load
    # binarizer files produced by this project.
    with open("models/mlb.pkl", "rb") as f:
        mlb = pickle.load(f)

    # load models
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaForSequenceClassification.from_pretrained('models/checkpoint-7440')
    model.eval()

    example = "Thank you! Goodbye!"

    # example inference
    inputs = tokenizer(example, return_tensors="pt")
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(**inputs).logits
    # Round the per-label sigmoid, then hand scikit-learn a plain 0/1 integer
    # NumPy array rather than a float torch tensor.
    label_ids = torch.sigmoid(logits).round().int().numpy()
    labels = mlb.inverse_transform(label_ids)
    print(labels)
    # [('goodbye', 'thankyou')]
    return labels

# NOTE(review): when this code is executed as a notebook cell or a script,
# `__name__` is '__main__', so this guard never matches and run() is never
# called here. Confirm whether that is intentional (inference kept disabled)
# or a typo for '__main__'.
if __name__ == '__run__':
    run()