In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# We use the simpletransformers library, which makes transformer-type models very simple to use
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# Preparing the data

In [None]:
# Load the training examples, indexed by their 'Id'. The 'gender' column is
# dropped so that only the text column remains as model input.
data = pd.read_json('data/train.json').set_index('Id')
data = data.drop('gender', axis=1)

# Load the labels. pandas aligns a column assignment on the index, and `data`
# is indexed by 'Id'. When the label file carries its own 'Id' column we index
# the labels by it, so each label is matched to its example by Id rather than
# by its row position in the CSV. Without an 'Id' column the positional index
# is used, as before.
target = pd.read_csv('data/train_label.csv')
if 'Id' in target.columns:
    labels_by_id = target.set_index('Id')['Category']
else:
    labels_by_id = target['Category']
data['label'] = labels_by_id

# Fail fast if the alignment left any example without a label, instead of
# silently handing NaN labels to the training step.
unlabelled_count = int(data['label'].isna().sum())
if unlabelled_count:
    raise ValueError(
        f"{unlabelled_count} training examples have no matching label; "
        "check that the Ids in data/train.json and data/train_label.csv agree"
    )

# Positional rename: the single remaining input column becomes 'text'.
data.columns = ['text', 'label']

# Training

In [None]:
# Settings for the final training run, applied in order onto a fresh
# ClassificationArgs instance.
# The search for the best parameters was done with Weights & Biases; see the
# "Best hyperparameter search" section.
final_run_settings = {
    "evaluate_during_training": False,
    "manual_seed": 2,
    "learning_rate": 1.850322830319812e-05,
    "num_train_epochs": 2,
    "max_seq_length": 256,
    "multiprocessing_chunksize": 5000,
    "no_cache": True,
    "no_save": True,
    "reprocess_input_data": True,
    "train_batch_size": 24,
    "gradient_accumulation_steps": 2,
    "train_custom_parameters_only": False,
}

model_args = ClassificationArgs()
for option_name, option_value in final_run_settings.items():
    setattr(model_args, option_name, option_value)

# The transformer model used is RoBERTa Large; running it requires a GPU with
# more than 16 GB of memory.
model = ClassificationModel(
    model_type="roberta",
    model_name="roberta-large",
    num_labels=28,
    args=model_args,
    use_cuda=True,
)

# Fine-tune on the prepared `data` frame (text / label columns).
model.train_model(train_df=data)

# Predictions

In [None]:
# Read the test set and run the trained model on its 'description' column.
test = pd.read_json('data/test.json')
descriptions = test['description'].values
# `predictions` keeps the full return value of predict(); its first element is
# what the submission cell uses.
predictions = model.predict(descriptions)

# Submission

In [None]:
import os

# Attach the first element returned by model.predict() to the test rows as the
# 'Category' column, then keep only the two columns written to the submission.
test["Category"] = predictions[0]
predictionsDf = test[["Id", "Category"]]

# Create the output directory first so that a missing folder does not lose the
# result of a long training run.
submission_path = "soumissions/simple_transformers_roberta_large_1.8_2epoch.csv"
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
predictionsDf.to_csv(submission_path, index=False)
predictionsDf

# Best hyperparameter search

In [None]:
import wandb

# Authenticate with Weights & Biases without writing the API key into the
# notebook. Called with no key, wandb.login() picks it up from the
# WANDB_API_KEY environment variable or previously stored credentials, and
# otherwise prompts for it.
wandb.login()

# Hold out part of the labelled data for evaluation during the sweep
# (scikit-learn's default split proportions, fixed seed for reproducibility).
# `test_df` here is this held-out slice of the training data, not the
# competition test set.
train_df, test_df = train_test_split(data, random_state=4)

In [None]:
# Base settings shared by every sweep trial, applied in order onto a fresh
# ClassificationArgs instance. This rebinds the notebook-level `model_args`
# used by the final training run above.
sweep_base_settings = {
    "eval_batch_size": 24,
    "evaluate_during_training": True,
    "evaluate_during_training_silent": False,
    "evaluate_during_training_steps": 2300,
    "manual_seed": 4,
    "max_seq_length": 256,
    "multiprocessing_chunksize": 5000,
    "no_cache": True,
    "no_save": True,
    "num_train_epochs": 2,
    # NOTE(review): absolute path at the filesystem root; confirm it is
    # writable in the environment where the sweep runs.
    "output_dir": '/save',
    "overwrite_output_dir": True,
    "reprocess_input_data": True,
    "train_batch_size": 24,
    "gradient_accumulation_steps": 2,
    "train_custom_parameters_only": False,
}
# NOTE(review): no `wandb_project` is set here; confirm that the installed
# simpletransformers version still logs evaluation metrics to the sweep.

model_args = ClassificationArgs()
for option_name, option_value in sweep_base_settings.items():
    setattr(model_args, option_name, option_value)

# Ranges explored by the sweep: number of epochs and learning rate.
search_space = {
    "num_train_epochs": {"min": 1, "max": 3},
    "learning_rate": {"min": 1e-05, "max": 2.5e-05},
}

# Bayesian search maximizing the "f1_score" metric, with hyperband early
# termination.
# NOTE(review): the sweep name says "batch-16" while train_batch_size is 24;
# confirm which one is intended.
sweep_config = dict(
    name="vanilla-sweep-batch-16",
    method="bayes",
    metric={"name": "f1_score", "goal": "maximize"},
    parameters=search_space,
    early_terminate={"type": "hyperband", "min_iter": 6},
)

# Register the sweep with W&B and keep its id for the agent.
sweep_id = wandb.sweep(sweep_config, project="DEFI IA - Hyperparameter Optimization")

def f1_multiclass(labels, preds):
    """Return the macro-averaged F1 score of ``preds`` against ``labels``.

    Thin wrapper around ``sklearn.metrics.f1_score`` with ``average='macro'``,
    passed to ``train_model`` as the extra ``f1_score`` metric.
    """
    macro_f1 = f1_score(labels, preds, average='macro')
    return macro_f1

def train():
    """Run one sweep trial: start a W&B run, build the model, train and evaluate it.

    Reads the notebook-level ``model_args``, ``train_df`` and ``test_df``, and
    takes the trial's hyperparameters from ``wandb.config``. Takes no
    arguments and returns nothing, as it is handed to ``wandb.agent``.
    """
    # Rebinds the notebook-level `model`, replacing any model created by an
    # earlier cell.
    global model

    # The run is used as a context manager so that it is always closed, and
    # reported as failed if training raises, instead of being left open for
    # the next trial. This replaces the trailing wandb.join() call, which is
    # deprecated in favor of finishing the run.
    with wandb.init():
        # Create the model with the shared base settings, overridden by the
        # sweep's values for this trial.
        model = ClassificationModel(
            "roberta",
            "roberta-large",
            num_labels=28,
            use_cuda=True,
            args=model_args,
            sweep_config=wandb.config,
        )

        # Train, evaluating on the held-out split with macro F1 reported under
        # the "f1_score" name that the sweep metric refers to.
        model.train_model(
            train_df,
            eval_df=test_df,
            f1_score=f1_multiclass,
        )

# Launch the sweep agent, which calls train() once per trial.
wandb.agent(sweep_id, function=train)

# This search could not be run to completion because it is very slow: a single
# epoch took about 1h20 on an RTX 6000.
# The results can be viewed in the PDF sent as a separate attachment to the email.