In [None]:
!pip install plotly transformers sklearn datasets

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Read the train and dev CSV splits and stack them into one frame with a fresh index.
train_part = pd.read_csv("Data_Example/train.csv")
df2 = pd.read_csv("Data_Example/dev.csv")
df = pd.concat((train_part, df2), ignore_index=True)
df.head()

In [None]:
import re
def remove_mentions(text):
    """Return ``text`` with every ``@`` + word-characters mention deleted.

    Only the mention itself is removed; surrounding whitespace is left as is.
    """
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', text)

# Scrub @-mentions from every entry of the text column
df['text'] = df['text'].map(remove_mentions)


In [None]:
# Preview the first rows of `df` (in notebook order this follows the @-mention removal).
df.head()

In [None]:
def balance_dataset(df, target_col, random_state=42):
    """
    Balances a dataset by randomly removing instances from overrepresented classes.

    Every class is down-sampled (without replacement) to the size of the
    smallest class. Rows whose target value is missing do not form a class
    (``value_counts`` skips NaN) and are therefore not part of the result.

    Parameters:
    - df: DataFrame containing the data.
    - target_col: The column name of the target variable.
    - random_state: Seed handed to ``DataFrame.sample`` for every class.
      Defaults to 42, the value that used to be hard-coded.

    Returns:
    - A balanced DataFrame with a fresh 0..n-1 index; classes appear in
      ``value_counts`` order. If ``df`` has no labelled rows, an empty
      DataFrame with the same columns is returned.
    """
    # Calculate the number of instances for each class
    class_counts = df[target_col].value_counts()

    # Nothing to balance: without this guard sample(n=NaN) / concat([]) would raise
    if class_counts.empty:
        return df.iloc[0:0].reset_index(drop=True)

    # Find the smallest class size
    min_class_size = int(class_counts.min())

    # For each class, randomly sample instances to match the size of the smallest class
    balanced_dfs = [
        df[df[target_col] == class_label].sample(n=min_class_size, random_state=random_state)
        for class_label in class_counts.index
    ]

    # Concatenate the per-class samples
    return pd.concat(balanced_dfs, ignore_index=True)


In [None]:
# Show how many rows carry each label (1 first, then 0) before balancing
for label_value in (1, 0):
    print((df['label'] == label_value).sum())



In [None]:
# Down-sample so that every label has as many rows as the rarest one.
# Note: this rebinds `df`, so re-running the cell works on the already balanced frame.
df = balance_dataset(df, target_col='label')

In [None]:
# Show the per-label row counts again (1 first, then 0), now for the balanced frame
for label_value in (1, 0):
    print((df['label'] == label_value).sum())

In [None]:

# Hold out 10% of the data as a stratified test set. The fixed seed keeps the split
# (and with it every downstream score) reproducible across kernel restarts.
train_df, test_df = train_test_split(df, test_size=0.1, stratify=df['label'], random_state=42)
#train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['label'])
#val_dataset = Dataset.from_pandas(val_df)
# preserve_index=False: do not carry the shuffled pandas index into the Dataset
# as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)
#full_dataset = Dataset.from_pandas(df)
dataset = {'train': train_dataset,'test': test_dataset}

In [None]:
from transformers import AutoTokenizer
# Load the tokenizer published under the "distilbert-base-german-cased" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-german-cased")
# Disabled alternative: tokenizer of the multilingual uncased BERT checkpoint.
#tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased')

def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the module-level ``tokenizer``.

    The tokenizer is called with ``truncation=True`` and ``padding="max_length"``;
    its return value is passed back unchanged.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize every split in batched mode, keeping the split names as dictionary keys
tokenized_datasets = {
    split_name: split_data.map(tokenize_function, batched=True)
    for split_name, split_data in dataset.items()
}

In [None]:
from transformers import AutoModelForSequenceClassification, AutoConfig
from sklearn.metrics import f1_score
import tempfile

checkpoint = "distilbert-base-german-cased"

# Start from the checkpoint's own configuration and raise both dropout rates.
config = AutoConfig.from_pretrained(checkpoint, num_labels=2)
config.dropout = 0.5
config.attention_dropout = 0.5

# Hand the modified config straight to from_pretrained. The earlier round trip through a
# temporary directory silently lost the dropout settings: model.save_pretrained() writes the
# model's own config.json and thereby overwrote the file saved from `config`.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)


import numpy as np
from datasets import load_metric
# Accuracy metric object, loaded once and reused by compute_metrics below.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into a metrics dictionary.

    The predicted class is the argmax over the last logits axis. The result
    holds the F1 score under ``"f1"`` plus whatever ``metric.compute`` returns
    for the same predictions (the metric was loaded as ``"accuracy"``).
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # F1 is computed first, then the loaded metric's entries are merged in
    scores = {"f1": f1_score(labels, predicted_classes)}
    scores.update(metric.compute(predictions=predicted_classes, references=labels))
    return scores

In [None]:
from transformers import TrainingArguments, Trainer, AutoConfig, EarlyStoppingCallback
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import optuna

def objective(trial):
    """Optuna objective: fine-tune the checkpoint with sampled hyper-parameters.

    Samples dropout, attention dropout, learning rate, epoch count, batch size
    and weight decay from ``trial``, fine-tunes the pretrained ``checkpoint``
    on 80% of ``tokenized_datasets["train"]`` and evaluates on the remaining 20%.

    Relies on the module-level names ``checkpoint``, ``tokenized_datasets`` and
    ``compute_metrics``.

    Parameters:
    - trial: the ``optuna`` trial used to draw the hyper-parameters.

    Returns:
    - The ``eval_loss`` reported by ``trainer.evaluate()`` on the validation
      subset (lower is better).
    """
    # Define search space
    dropout = trial.suggest_float("dropout", 0.2, 0.4)
    attention_dropout = trial.suggest_float("attention_dropout", 0.1, 0.5)
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
    batch_size = trial.suggest_int("batch_size",8, 32, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)
    #max_grad_norm = trial.suggest_float("max_grad_norm", 0.5, 5.0) 

    config = AutoConfig.from_pretrained(checkpoint)
    config.dropout = dropout
    config.attention_dropout = attention_dropout
    config.num_labels = 2

    # Split dataset into training and validation subsets (80-20).
    # The seed is fixed so that every trial is scored on the same validation rows;
    # otherwise differences between trials partly reflect a different split.
    labels = tokenized_datasets['train']['label']
    train_indices, val_indices = train_test_split(
        list(range(len(labels))), train_size=0.8, stratify=labels, random_state=42
    )

    train_split = tokenized_datasets["train"].select(train_indices)
    val_split = tokenized_datasets["train"].select(val_indices)

    # from_pretrained (not from_config): from_config only builds the architecture with
    # randomly initialised weights, so nothing of the pretrained checkpoint would be used.
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)

    args = TrainingArguments(
        output_dir="test_trainer",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        weight_decay=weight_decay,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        #max_grad_norm=max_grad_norm,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        load_best_model_at_end=True,
    )

    # Define the Trainer using the above args
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_split,
        eval_dataset=val_split,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)],
    )

    trainer.train()
    metrics = trainer.evaluate()

    # Return the evaluation loss
    return metrics["eval_loss"]

# Run the study
# direction="minimize" because objective() returns the validation loss (eval_loss).
study = optuna.create_study(direction="minimize")
# Five trials; each one runs a complete training inside objective().
study.optimize(objective, n_trials=5)

# Print the result
print(f"Best trial: {study.best_trial.params}")


In [None]:
import optuna.visualization as vis

# Optuna's optimization-history figure for the finished study.
vis.plot_optimization_history(study)

In [None]:
# Optuna's hyper-parameter importance figure for the study.
vis.plot_param_importances(study)

In [None]:
# Optuna's slice figure for the study.
vis.plot_slice(study)

In [None]:
# Optuna's parallel-coordinate figure for the study.
vis.plot_parallel_coordinate(study)

In [None]:
"""

from transformers import TrainingArguments, Trainer, AutoConfig, EarlyStoppingCallback
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, KFold
import optuna


def objective(trial):
    # Define search space
    dropout = trial.suggest_float("dropout", 0.2, 0.4)
    attention_dropout = trial.suggest_float("attention_dropout", 0.1, 0.5)
    learning_rate = trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 5)
    batch_size = trial.suggest_int("batch_size",8, 32, log=True)
    weight_decay = trial.suggest_float("weight_decay", 0.0, 0.1)
    #max_grad_norm = trial.suggest_float("max_grad_norm", 0.5, 5.0) 

    config = AutoConfig.from_pretrained(checkpoint)
    config.dropout = dropout
    config.attention_dropout = attention_dropout
    config.num_labels = 2

    # Use KFold for cross-validation
    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_losses = []

    labels = tokenized_datasets['train']['label']
    for train_idx, val_idx in kfold.split(tokenized_datasets["train"]["input_ids"], labels):
        train_subset = tokenized_datasets["train"].select(train_idx)
        val_subset = tokenized_datasets["train"].select(val_idx)

        model = AutoModelForSequenceClassification.from_config(config)

        args = TrainingArguments(
            output_dir="test_trainer",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            weight_decay=weight_decay,
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            num_train_epochs=num_train_epochs,
            #max_grad_norm=max_grad_norm,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            load_best_model_at_end=True,
        )

        # Define the Trainer using the above args for the current fold
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_subset,
            eval_dataset=val_subset,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0.0)],
        )

        trainer.train()
        metrics = trainer.evaluate()
        fold_losses.append(metrics["eval_loss"])

    # Return the average loss across the folds
    return sum(fold_losses) / len(fold_losses)

# Run the study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=5)

# Print the result
print(f"Best trial: {study.best_trial.params}")

"""

In [None]:
# Hyper-parameters of the study's best trial (the study was created with direction="minimize").
best_params = study.best_params

In [None]:
# Rebuild the configuration with the best hyper-parameters found by the study.
config = AutoConfig.from_pretrained(checkpoint)
config.dropout = best_params['dropout']
config.attention_dropout = best_params['attention_dropout']
config.num_labels = 2 
learning_rate = best_params["learning_rate"]
batch_size = best_params["batch_size"]
num_train_epochs = best_params['num_train_epochs']
weight_decay = best_params['weight_decay']
#max_grad_norm=best_params['max_grad_norm']

# from_pretrained (not from_config): from_config only builds the architecture with randomly
# initialised weights, so the final model would be trained from scratch instead of being
# fine-tuned from the pretrained checkpoint.
best_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config)

# Train on entire set

In [None]:
from datasets import concatenate_datasets
# tokenized_datasets only contains the "train" and "test" splits; there is no "val" key,
# so indexing it raised a KeyError. The validation subsets used during the search are
# carved out of "train" inside objective(), which means the complete "train" split
# already is the full train+validation data.
full_dataset = tokenized_datasets["train"]

training_args = TrainingArguments(
    output_dir="test_trainer",
    save_strategy="epoch",
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_train_epochs,
    #max_grad_norm=max_grad_norm,
    # No evaluation runs during this final fit, so there is no best-model bookkeeping:
    # metric_for_best_model / greater_is_better are left out (the former
    # greater_is_better=True contradicted the eval_loss metric anyway).
    load_best_model_at_end=False,
    evaluation_strategy='no',  # switch to "epoch" and pass an eval_dataset to train on a train/val split
)

trainer = Trainer(
    model=best_model,
    args=training_args,
    train_dataset=full_dataset,
    # compute_metrics is still used by the later evaluate()/predict() calls on the test split
    compute_metrics=compute_metrics
)

trainer.train()

In [None]:
# Score the trained model on the held-out test split and show the metric dictionary
results = trainer.evaluate(eval_dataset=tokenized_datasets["test"])
print(results)

In [None]:
# Raw prediction output for the test split
predictions_clip = trainer.predict(test_dataset=tokenized_datasets["test"])
# Predicted class = index of the largest value along the last axis
predicted_labels = np.argmax(predictions_clip.predictions, axis=-1)

In [None]:
# Dump the complete object returned by trainer.predict() for inspection.
print(predictions_clip)