# Kaggle Competition

## Auteurs : Oscar Pastural, Clément Florval, Louis Gauthier

---
## 0. Installation des dépendances nécessaires

In [None]:
!pip install transformers datasets scikit-learn evaluate pandas



## 0.5 Import des librairies

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
import evaluate

## 1. Import des données

In [None]:
# Read the two competition CSV files (training set first, then test set).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("train_submission.csv", "test_without_labels.csv")
)

# Clean the text column: missing values become "" and every entry is cast to str.
for frame in (train_df, test_df):
    frame["Text"] = frame["Text"].fillna("").astype(str)

## 2. Préparation du Dataset

In [None]:
# Encodage des labels
le = LabelEncoder()
train_df["Label"] = le.fit_transform(train_df["Label"])

In [None]:
# Convert the pandas frames to Hugging Face Datasets. On the training set the
# "Label" column is renamed to "labels" for training.
train_dataset = Dataset.from_pandas(train_df).rename_column("Label", "labels")
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Tokenization setup: `model_checkpoint` names the pretrained checkpoint; the
# same name is used further down to load the classification model, so the
# tokenizer and the model come from the same checkpoint.
model_checkpoint = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
def tokenize_function(example):
    """Tokenize the "Text" field of `example` with the notebook-level `tokenizer`.

    The tokenizer is called with `padding="max_length"` and `truncation=True`.
    `example` is whatever `Dataset.map` passes in (a batch when `batched=True`).
    Returns the tokenizer's output unchanged.
    """
    texts = example["Text"]
    return tokenizer(texts, padding="max_length", truncation=True)

## 3. Tokenisation et division des jeux de données (train, validation, test)

In [None]:
# Apply `tokenize_function` to both datasets, batch by batch.
map_options = {"batched": True}
train_dataset = train_dataset.map(tokenize_function, **map_options)
test_dataset = test_dataset.map(tokenize_function, **map_options)

Map:   0%|          | 0/190599 [00:00<?, ? examples/s]

Map:   0%|          | 0/190567 [00:00<?, ? examples/s]

In [None]:
# Hold out 10% of the training data for evaluation; the fixed seed keeps the
# split reproducible across runs.
split_dataset = train_dataset.train_test_split(test_size=0.1, seed=42)
train_split, eval_split = split_dataset["train"], split_dataset["test"]

In [None]:
# Load the sequence-classification model from the same checkpoint as the
# tokenizer, with one output class per label seen by the encoder.
num_labels = len(le.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4. Training et Résultats

In [None]:
# Training arguments for the Trainer.
training_args = TrainingArguments(
    output_dir="results",
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy`; keep the spelling accepted by the installed version.
    evaluation_strategy="epoch",
    # Checkpoint once per epoch (same cadence as evaluation) rather than on the
    # default step-based schedule, and cap the number of checkpoints kept in
    # "results" so a long run does not fill the disk.
    save_strategy="epoch",
    save_total_limit=2,
    # At the end of training, reload the checkpoint with the best validation
    # accuracy (the "accuracy" key returned by `compute_metrics`) instead of
    # keeping whatever the last epoch produced.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_steps=50,
    report_to="none"  # Disables reporting to wandb, which caused problems on Colab
)



In [None]:
# Evaluation metric: accuracy, loaded through the `evaluate` library.
accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    `eval_pred` unpacks into `(logits, labels)`. The predicted class is the
    argmax of the logits over the last axis. Returns whatever
    `accuracy_metric.compute` returns for those predictions and references.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    return accuracy_metric.compute(predictions=predicted_ids, references=references)

In [None]:
# Build the Trainer: model and training arguments, the train/eval splits, and
# the metric function called at each evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

In [37]:
# Train the model, then save the trainer's model to the "modele_5_epochs" directory.
trainer.train()
trainer.save_model(output_dir="modele_5_epochs")

Epoch,Training Loss,Validation Loss,Accuracy
1,0.5861,0.559507,0.830693


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5861,0.559507,0.830693
2,0.3989,0.468659,0.859391
3,0.3142,0.459929,0.863116
4,0.2035,0.481374,0.870514
5,0.1743,0.499953,0.869307


In [38]:
# Predictions on the test set: run the trainer on the test set, take the argmax
# class index for each row, and map the indices back to the original label
# values with the fitted encoder.
predictions = trainer.predict(test_dataset)
test_logits = predictions.predictions
preds = np.argmax(test_logits, axis=-1)
preds_labels = le.inverse_transform(preds)

## 5. Export de la solution

In [39]:
# Write the submission file: a header line followed by one "ID,Label" row per
# prediction. IDs are 1-based positions in `preds_labels`.
# NOTE(review): assumes the expected IDs follow the row order of the test CSV — confirm.
submission_rows = [
    f"{row_id},{label}\n" for row_id, label in enumerate(preds_labels, start=1)
]
with open("pred_labels_5_epochs.csv", "w", encoding="utf-8") as f:
    f.write("ID,Label\n")
    f.writelines(submission_rows)