In [None]:
!pip install plotly transformers sklearn datasets

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

df=pd.read_csv("Data_Example/train.csv")
df2=pd.read_csv("Data_Example/dev.csv")
df = pd.concat([df, df2], ignore_index=True)
df.head()

Unnamed: 0,text,label
0,@Tom174_ @davidbest95 Meine Reaktion; |LBR| Ni...,0
1,"#Merkel rollt dem Emir von #Katar, der islamis...",0
2,„Merle ist kein junges unschuldiges Mädchen“ K...,0
3,@umweltundaktiv Asylantenflut bringt eben nur ...,1
4,@_StultaMundi Die Bibel enthält ebenfalls Gese...,0


In [3]:
import re
def remove_mentions(text):
    return re.sub(r'@\w+', '', text)

# Apply the function to your column
df['text'] = df['text'].apply(remove_mentions)


In [4]:
df.head()

Unnamed: 0,text,label
0,Meine Reaktion; |LBR| Nicht jeder Moslem ist...,0
1,"#Merkel rollt dem Emir von #Katar, der islamis...",0
2,„Merle ist kein junges unschuldiges Mädchen“ K...,0
3,Asylantenflut bringt eben nur negatives für D...,1
4,Die Bibel enthält ebenfalls Gesetze des Zivil...,0


In [5]:
def balance_dataset(df, target_col):
    """
    Balances a dataset by randomly removing instances from overrepresented classes.
    
    Parameters:
    - df: DataFrame containing the data.
    - target_col: The column name of the target variable.
    
    Returns:
    - A balanced DataFrame.
    """

    # Calculate the number of instances for each class
    class_counts = df[target_col].value_counts()
    
    # Find the smallest class size
    min_class_size = class_counts.min()
    
    # For each class, randomly sample instances to match the size of the smallest class
    balanced_dfs = [df[df[target_col] == class_label].sample(n=min_class_size, random_state=42) 
                   for class_label in class_counts.index]
    
    # Concatenate the dataframes
    balanced_df = pd.concat(balanced_dfs, ignore_index=True)
    
    return balanced_df


In [6]:
print(len(df[df['label']==1]))
print(len(df[df['label']==0]))



509
991


In [7]:
df = balance_dataset(df, 'label')

In [8]:
print(len(df[df['label']==1]))
print(len(df[df['label']==0]))

509
509


In [9]:

train_df, test_df = train_test_split(df, test_size=0.1, stratify=df['label'])
#train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'])
#val_dataset = Dataset.from_pandas(val_df)
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
#full_dataset = Dataset.from_pandas(df)
dataset = {'train': train_dataset,'test': test_dataset}

In [10]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-german-cased")
#tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased')

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = {}
for key in dataset:
    tokenized_datasets[key] = dataset[key].map(tokenize_function, batched=True)

Map:   0%|          | 0/916 [00:00<?, ? examples/s]

Map:   0%|          | 0/102 [00:00<?, ? examples/s]

In [11]:
from transformers import AutoModelForSequenceClassification, AutoConfig
from sklearn.metrics import f1_score
import tempfile

checkpoint = "distilbert-base-german-cased"
config = AutoConfig.from_pretrained(checkpoint)
config.dropout = 0.5          
config.attention_dropout = 0.5
with tempfile.TemporaryDirectory() as temp_dir:
    config.save_pretrained(temp_dir)
    
    # Also save the model weights to the temp directory
    model_weights = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    model_weights.save_pretrained(temp_dir)
    
    # Now load the model with the modified config from the temp directory
    model = AutoModelForSequenceClassification.from_pretrained(temp_dir, num_labels=2)


import numpy as np
from datasets import load_metric
metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    f1 = f1_score(labels, predictions)  # Calculate the F1 score first
    acc_metrics = metric.compute(predictions=predictions, references=labels)  # This should give you accuracy.
    
    return {"f1": f1, **acc_metrics}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-german-cased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.weight', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  metric = load_metric("accuracy")


In [12]:
from transformers import TrainingArguments, Trainer, AutoConfig, EarlyStoppingCallback
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import optuna

def objective(trial):
    # Define search space
    dropout = trial.suggest_float("dropout", 0.2, 0.4)
    attention_dropout = trial.suggest_float("attention_dropout", 0.1, 0.5)
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
    batch_size = trial.suggest_int("batch_size",8, 32, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)
    #max_grad_norm = trial.suggest_float("max_grad_norm", 0.5, 5.0) 

    config = AutoConfig.from_pretrained(checkpoint)
    config.dropout = dropout
    config.attention_dropout = attention_dropout
    config.num_labels = 2

    # Split dataset into training and validation subsets (80-20)
    
    labels = tokenized_datasets['train']['label']
    train_indices, val_indices = train_test_split(list(range(len(labels))), train_size=0.8, stratify=labels)

    train_dataset = tokenized_datasets["train"].select(train_indices)
    val_dataset = tokenized_datasets["train"].select(val_indices)

    model = AutoModelForSequenceClassification.from_config(config)

    args = TrainingArguments(
        output_dir="test_trainer",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        weight_decay=weight_decay,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        #max_grad_norm=max_grad_norm,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        load_best_model_at_end=True,
    )

    # Define the Trainer using the above args
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)],
    )

    trainer.train()
    metrics = trainer.evaluate()

    # Return the evaluation loss
    return metrics["eval_loss"]

# Run the study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

# Print the result
print(f"Best trial: {study.best_trial.params}")


[I 2023-09-05 23:58:34,709] A new study created in memory with name: no-name-f2d8b84d-2600-45b0-b638-479c25466e86


  0%|          | 0/62 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.6803334951400757, 'eval_f1': 0.518987341772152, 'eval_accuracy': 0.5869565217391305, 'eval_runtime': 0.9876, 'eval_samples_per_second': 186.305, 'eval_steps_per_second': 23.288, 'epoch': 1.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.679559051990509, 'eval_f1': 0.6567164179104478, 'eval_accuracy': 0.5, 'eval_runtime': 0.9778, 'eval_samples_per_second': 188.175, 'eval_steps_per_second': 23.522, 'epoch': 2.0}
{'train_runtime': 25.7785, 'train_samples_per_second': 56.792, 'train_steps_per_second': 2.405, 'train_loss': 0.6976509094238281, 'epoch': 2.0}


  0%|          | 0/23 [00:00<?, ?it/s]

[I 2023-09-05 23:59:04,767] Trial 0 finished with value: 0.679559051990509 and parameters: {'dropout': 0.28703961565591535, 'attention_dropout': 0.2311161128842185, 'learning_rate': 2.0959777622213153e-05, 'num_train_epochs': 2, 'batch_size': 24, 'weight_decay': 0.07079367968007336}. Best is trial 0 with value: 0.679559051990509.


  0%|          | 0/276 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.6742578744888306, 'eval_f1': 0.481203007518797, 'eval_accuracy': 0.625, 'eval_runtime': 1.0362, 'eval_samples_per_second': 177.564, 'eval_steps_per_second': 22.195, 'epoch': 1.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.704458475112915, 'eval_f1': 0.543046357615894, 'eval_accuracy': 0.625, 'eval_runtime': 0.9989, 'eval_samples_per_second': 184.203, 'eval_steps_per_second': 23.025, 'epoch': 2.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.7510197758674622, 'eval_f1': 0.6502463054187193, 'eval_accuracy': 0.6141304347826086, 'eval_runtime': 1.0167, 'eval_samples_per_second': 180.986, 'eval_steps_per_second': 22.623, 'epoch': 3.0}
{'train_runtime': 40.7434, 'train_samples_per_second': 53.898, 'train_steps_per_second': 6.774, 'train_loss': 0.6274708181187727, 'epoch': 3.0}


  0%|          | 0/23 [00:00<?, ?it/s]

[I 2023-09-05 23:59:48,457] Trial 1 finished with value: 0.6742578744888306 and parameters: {'dropout': 0.3997161149391317, 'attention_dropout': 0.11415730369043926, 'learning_rate': 5.304372528413789e-05, 'num_train_epochs': 3, 'batch_size': 8, 'weight_decay': 0.03643590292667439}. Best is trial 1 with value: 0.6742578744888306.


  0%|          | 0/335 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.690944254398346, 'eval_f1': 0.0, 'eval_accuracy': 0.5, 'eval_runtime': 0.9963, 'eval_samples_per_second': 184.682, 'eval_steps_per_second': 23.085, 'epoch': 1.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.6896563768386841, 'eval_f1': 0.02150537634408602, 'eval_accuracy': 0.5054347826086957, 'eval_runtime': 0.9929, 'eval_samples_per_second': 185.314, 'eval_steps_per_second': 23.164, 'epoch': 2.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.6876516938209534, 'eval_f1': 0.15841584158415842, 'eval_accuracy': 0.5380434782608695, 'eval_runtime': 0.9985, 'eval_samples_per_second': 184.279, 'eval_steps_per_second': 23.035, 'epoch': 3.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.687356173992157, 'eval_f1': 0.14, 'eval_accuracy': 0.532608695652174, 'eval_runtime': 0.9534, 'eval_samples_per_second': 193.003, 'eval_steps_per_second': 24.125, 'epoch': 4.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.686913013458252, 'eval_f1': 0.15841584158415842, 'eval_accuracy': 0.5380434782608695, 'eval_runtime': 0.9883, 'eval_samples_per_second': 186.181, 'eval_steps_per_second': 23.273, 'epoch': 5.0}
{'train_runtime': 65.2973, 'train_samples_per_second': 56.051, 'train_steps_per_second': 5.13, 'train_loss': 0.6973474758774487, 'epoch': 5.0}


  0%|          | 0/23 [00:00<?, ?it/s]

[I 2023-09-06 00:00:56,767] Trial 2 finished with value: 0.686913013458252 and parameters: {'dropout': 0.30415047584199445, 'attention_dropout': 0.32241238511475034, 'learning_rate': 1.341365888426662e-06, 'num_train_epochs': 5, 'batch_size': 11, 'weight_decay': 0.013133537587128753}. Best is trial 1 with value: 0.6742578744888306.


  0%|          | 0/184 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.6810665130615234, 'eval_f1': 0.17142857142857143, 'eval_accuracy': 0.5271739130434783, 'eval_runtime': 0.9846, 'eval_samples_per_second': 186.877, 'eval_steps_per_second': 23.36, 'epoch': 1.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.7620090842247009, 'eval_f1': 0.41600000000000004, 'eval_accuracy': 0.6032608695652174, 'eval_runtime': 0.9993, 'eval_samples_per_second': 184.123, 'eval_steps_per_second': 23.015, 'epoch': 2.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.7604450583457947, 'eval_f1': 0.6567164179104478, 'eval_accuracy': 0.625, 'eval_runtime': 0.9983, 'eval_samples_per_second': 184.312, 'eval_steps_per_second': 23.039, 'epoch': 3.0}
{'train_runtime': 40.9072, 'train_samples_per_second': 71.577, 'train_steps_per_second': 4.498, 'train_loss': 0.5971395520196445, 'epoch': 3.0}


  0%|          | 0/23 [00:00<?, ?it/s]

[I 2023-09-06 00:01:40,804] Trial 3 finished with value: 0.6810665130615234 and parameters: {'dropout': 0.33183013266753447, 'attention_dropout': 0.47271318011452257, 'learning_rate': 9.66513393281928e-05, 'num_train_epochs': 4, 'batch_size': 16, 'weight_decay': 0.016758511034193048}. Best is trial 1 with value: 0.6742578744888306.


  0%|          | 0/268 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.6939852237701416, 'eval_f1': 0.08333333333333333, 'eval_accuracy': 0.5217391304347826, 'eval_runtime': 0.9988, 'eval_samples_per_second': 184.22, 'eval_steps_per_second': 23.028, 'epoch': 1.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.6818825602531433, 'eval_f1': 0.4999999999999999, 'eval_accuracy': 0.6304347826086957, 'eval_runtime': 0.9963, 'eval_samples_per_second': 184.684, 'eval_steps_per_second': 23.086, 'epoch': 2.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.6653841137886047, 'eval_f1': 0.6666666666666666, 'eval_accuracy': 0.6739130434782609, 'eval_runtime': 0.9986, 'eval_samples_per_second': 184.258, 'eval_steps_per_second': 23.032, 'epoch': 3.0}


  0%|          | 0/23 [00:00<?, ?it/s]

{'eval_loss': 0.7727254033088684, 'eval_f1': 0.67, 'eval_accuracy': 0.6413043478260869, 'eval_runtime': 0.9856, 'eval_samples_per_second': 186.689, 'eval_steps_per_second': 23.336, 'epoch': 4.0}
{'train_runtime': 52.9327, 'train_samples_per_second': 55.316, 'train_steps_per_second': 5.063, 'train_loss': 0.5473685193417678, 'epoch': 4.0}


  0%|          | 0/23 [00:00<?, ?it/s]

[I 2023-09-06 00:02:36,976] Trial 4 finished with value: 0.6653841137886047 and parameters: {'dropout': 0.2137846880729603, 'attention_dropout': 0.4396165279427031, 'learning_rate': 4.41602670251735e-05, 'num_train_epochs': 4, 'batch_size': 11, 'weight_decay': 0.06928253033800241}. Best is trial 4 with value: 0.6653841137886047.


Best trial: {'dropout': 0.2137846880729603, 'attention_dropout': 0.4396165279427031, 'learning_rate': 4.41602670251735e-05, 'num_train_epochs': 4, 'batch_size': 11, 'weight_decay': 0.06928253033800241}


In [13]:
import optuna.visualization as vis

vis.plot_optimization_history(study)

In [18]:
vis.plot_param_importances(study)

In [19]:
vis.plot_slice(study)

In [20]:
vis.plot_parallel_coordinate(study)

In [14]:
"""

from transformers import TrainingArguments, Trainer, AutoConfig, EarlyStoppingCallback
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, KFold
import optuna


def objective(trial):
    # Define search space
    dropout = trial.suggest_float("dropout", 0.2, 0.4)
    attention_dropout = trial.suggest_float("attention_dropout", 0.1, 0.5)
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
    batch_size = trial.suggest_int("batch_size",8, 32, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)
    #max_grad_norm = trial.suggest_float("max_grad_norm", 0.5, 5.0) 

    config = AutoConfig.from_pretrained(checkpoint)
    config.dropout = dropout
    config.attention_dropout = attention_dropout
    config.num_labels = 2

    # Use KFold for cross-validation
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_losses = []

    labels = tokenized_datasets['train']['label']
    for train_idx, val_idx in kfold.split(tokenized_datasets["train"]["input_ids"], labels):
        train_subset = tokenized_datasets["train"].select(train_idx)
        val_subset = tokenized_datasets["train"].select(val_idx)

        model = AutoModelForSequenceClassification.from_config(config)

        args = TrainingArguments(
            output_dir="test_trainer",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            weight_decay=weight_decay,
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            #max_grad_norm=max_grad_norm,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            load_best_model_at_end=True,
        )

        # Define the Trainer using the above args for the current fold
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_subset,
            eval_dataset=val_subset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)],
        )

        trainer.train()
        metrics = trainer.evaluate()
        fold_losses.append(metrics["eval_loss"])

    # Return the average loss across the folds
    return sum(fold_losses) / len(fold_losses)

# Run the study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

# Print the result
print(f"Best trial: {study.best_trial.params}")

"""

'\n\nfrom transformers import TrainingArguments, Trainer, AutoConfig, EarlyStoppingCallback\nfrom sklearn.metrics import f1_score\nfrom sklearn.model_selection import StratifiedKFold, KFold\nimport optuna\n\n\ndef objective(trial):\n    # Define search space\n    dropout = trial.suggest_float("dropout", 0.2, 0.4)\n    attention_dropout = trial.suggest_float("attention_dropout", 0.1, 0.5)\n    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)\n    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)\n    batch_size = trial.suggest_int("batch_size",8, 32, log=True)\n    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)\n    #max_grad_norm = trial.suggest_float("max_grad_norm", 0.5, 5.0) \n\n    config = AutoConfig.from_pretrained(checkpoint)\n    config.dropout = dropout\n    config.attention_dropout = attention_dropout\n    config.num_labels = 2\n\n    # Use KFold for cross-validation\n    kfold = StratifiedKFold(n_splits=10, shuffle=True, r

In [15]:
best_params = study.best_params

In [16]:
config = AutoConfig.from_pretrained(checkpoint)
config.dropout = best_params['dropout']
config.attention_dropout = best_params['attention_dropout']
config.num_labels = 2 
learning_rate = best_params["learning_rate"]
batch_size = best_params["batch_size"]
num_train_epochs = best_params['num_train_epochs']
weight_decay = best_params['weight_decay']
#max_grad_norm=best_params['max_grad_norm']

best_model = AutoModelForSequenceClassification.from_config(config)

# Train on entire set

In [17]:
from datasets import concatenate_datasets
full_dataset = concatenate_datasets([tokenized_datasets["train"], tokenized_datasets["val"]])

training_args = TrainingArguments(
     output_dir="test_trainer",
        #evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        weight_decay=weight_decay,
        per_device_train_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        #max_grad_norm=max_grad_norm,
        metric_for_best_model="eval_loss",
        greater_is_better=True,
        load_best_model_at_end=False,
        evaluation_strategy='no', #uncomment if i want to train on train val split
)

trainer = Trainer(
    model=best_model,
    args=training_args,
    train_dataset=full_dataset,
    #train_dataset=tokenized_datasets["train"],
    #eval_dataset=tokenized_datasets["val"],  # or you could leave this out if you just want to train on the full dataset
    compute_metrics=compute_metrics
)

trainer.train()

KeyError: 'val'

In [None]:
results = trainer.evaluate(tokenized_datasets["test"])
print(results)

In [None]:
predictions_clip = trainer.predict(tokenized_datasets["test"])
predicted_labels = np.argmax(predictions_clip.predictions, axis=-1)

In [None]:
print(predictions_clip)