# Training / Fine tuning

> Execution :  Dependencies → Label Mapping

## Before training

### **Splitting the Data**

```python
from sklearn.model_selection import train_test_split
from datasets import Dataset

# Assuming `df1` is your initial dataset.
# Target proportions, expressed as fractions of the ORIGINAL dataset:
# 65% training / 15% validation / 20% testing.
TEST_FRACTION = 0.20
VAL_FRACTION = 0.15

# First hold out the test set (20% of everything).
train_df, test_df = train_test_split(df1, test_size=TEST_FRACTION, random_state=42)

# The second split only sees the remaining 80%, so the validation share has to
# be rescaled: 0.15 / 0.80 = 0.1875 of the remainder == 15% of the original.
# (Passing test_size=0.15 here would yield a 68% / 12% split instead.)
train_df, val_df = train_test_split(
    train_df,
    test_size=VAL_FRACTION / (1 - TEST_FRACTION),
    random_state=42,
)

# Create Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
print(f"Test set size: {len(test_dataset)}")
```

### Tokenization

```python
from transformers import AutoTokenizer

# Fetch the tokenizer that ships with the DarijaBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained("SI2M-Lab/DarijaBERT")


def tokenize_function(examples):
    """Tokenize the ``cleaned_text`` field of a batch.

    Every sequence is padded to, and truncated at, 512 tokens.
    """
    texts = examples["cleaned_text"]
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)


# Run the tokenizer over each split in batched mode
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)
```

### Normalizing and Mapping Labels

```python
# Normalize labels
# Lower-cased spellings (English and French) mapped to the canonical English
# label. Keys must be lower-case because lookups are lower-cased first.
_LABEL_ALIASES = {
    'negative': 'negative',
    'négatif': 'negative',
    'neutral': 'neutral',
    'neutre': 'neutral',
    'positive': 'positive',
    'positif': 'positive',
    'mixed': 'mixed',
}


def normalize_label(label):
    """Return the canonical sentiment label for *label*.

    Matching ignores case and surrounding whitespace, so ``' Positive '``
    and ``'positif'`` both become ``'positive'``.

    Args:
        label: Raw label value taken from the dataset.

    Returns:
        One of ``'negative'``, ``'neutral'``, ``'positive'`` or ``'mixed'``
        when the spelling is recognised; otherwise *label* is returned
        unchanged so the caller can detect and handle unknown values.
    """
    # Missing / non-text labels (e.g. None, NaN) cannot be normalised;
    # hand them back untouched instead of failing on `.lower()`.
    if not isinstance(label, str):
        return label
    return _LABEL_ALIASES.get(label.strip().lower(), label)

# Integer class id assigned to each canonical label
label_map = {
    name: class_id
    for class_id, name in enumerate(('negative', 'neutral', 'positive', 'mixed'))
}


def _with_canonical_label(example):
    """Overwrite ``label`` with its normalized spelling."""
    return {'label': normalize_label(example['label'])}


def _with_label_id(example):
    """Add a ``labels`` column holding the integer id of the normalized label."""
    return {'labels': label_map[example['label']]}


# Normalize first, then translate to ids, for every split
train_dataset, val_dataset, test_dataset = (
    split.map(_with_canonical_label).map(_with_label_id)
    for split in (train_dataset, val_dataset, test_dataset)
)
```

### Formatting Datasets

```python
# Return PyTorch tensors, restricted to the listed columns
model_columns = ['input_ids', 'attention_mask', 'labels']
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format(type='torch', columns=model_columns)
```

**Data Preparation**:

- Split into `train`, `val`, `test`.
- Tokenized the text and normalized/mapped labels.

## Native

```python
from transformers import Trainer, TrainingArguments, pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
from datasets import Dataset
import numpy as np
import evaluate
import matplotlib.pyplot as plt

# DarijaBERT checkpoint: tokenizer plus a 4-label sequence-classification model
# (as opposed to the masked-LM variant)
checkpoint = "SI2M-Lab/DarijaBERT"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=4)

# Make sure the train/validation splits yield torch tensors for the listed columns
for split in (train_dataset, val_dataset):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define compute_metrics function
# Metric objects are memoised here so that `evaluate.load` runs once per
# metric instead of on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called *name*, loading it on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision / recall / F1.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the class index is taken
            as the argmax of ``predictions`` over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)  # Get the predicted class index

    accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)
    # zero_division=1 avoids undefined-metric warnings for classes that are
    # never predicted / never present in the evaluation set.
    precision = _get_metric("precision").compute(
        predictions=predictions, references=labels, average="weighted", zero_division=1
    )
    recall = _get_metric("recall").compute(
        predictions=predictions, references=labels, average="weighted", zero_division=1
    )
    # NOTE(review): unlike precision/recall, the `f1` metric does not appear
    # to accept `zero_division` (the v2 snippet omits it too); forwarding an
    # unsupported argument would raise a TypeError, so it is not passed here.
    f1 = _get_metric("f1").compute(
        predictions=predictions, references=labels, average="weighted"
    )

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"],
    }

# Hyper-parameters and bookkeeping for the full fine-tuning run
training_args = TrainingArguments(
    # Output locations and logging
    output_dir='./model',
    logging_dir='./logs',
    logging_steps=10,
    report_to="none",  # Disable WandB logging
    # Optimisation schedule
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
    # best accuracy when training finishes
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Wire the model, data and metrics together
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Run training, then score the resulting model on the validation split
train_output = trainer.train()
eval_results = trainer.evaluate()

```

## PEFT / `LoRA`

### v1

```python
	from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSequenceClassification
	from peft import LoraConfig, get_peft_model, TaskType
	from sklearn.model_selection import train_test_split
	import torch
	import pandas as pd
	from datasets import Dataset
	import numpy as np
	import evaluate
	import matplotlib.pyplot as plt

	# Load DarijaBERT tokenizer and model for sequence classification
	tokenizer = AutoTokenizer.from_pretrained("SI2M-Lab/DarijaBERT")
	model = AutoModelForSequenceClassification.from_pretrained("SI2M-Lab/DarijaBERT", num_labels=4)

	# Configure LoRA.
	# LoraConfig, get_peft_model and TaskType are already imported at the top
	# of this snippet, so no second `from peft import ...` is needed here.
	lora_config = LoraConfig(
	    task_type=TaskType.SEQ_CLS,  # Sequence Classification
	    inference_mode=False,
	    r=8,  # Low-rank dimension
	    lora_alpha=32,  # Scaling factor
	    lora_dropout=0.1  # Dropout probability
	)

	# Wrap the model with LoRA
	model = get_peft_model(model, lora_config)
	print("LoRA Model Ready")

	# Ensure the dataset is in the right format
	train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
	val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

	# Define compute_metrics function
	# Metric objects are memoised here so that `evaluate.load` runs once per
	# metric instead of on every evaluation pass.
	_METRIC_CACHE = {}


	def _get_metric(name):
	    """Return the `evaluate` metric called *name*, loading it on first use."""
	    if name not in _METRIC_CACHE:
	        _METRIC_CACHE[name] = evaluate.load(name)
	    return _METRIC_CACHE[name]


	def compute_metrics(eval_pred):
	    """Compute accuracy and weighted precision / recall / F1.

	    Args:
	        eval_pred: Pair ``(predictions, labels)``; the class index is taken
	            as the argmax of ``predictions`` over the last axis.

	    Returns:
	        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
	    """
	    predictions, labels = eval_pred
	    predictions = np.argmax(predictions, axis=-1)  # Get the predicted class index

	    accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)
	    # zero_division=1 avoids undefined-metric warnings for classes that are
	    # never predicted / never present in the evaluation set.
	    precision = _get_metric("precision").compute(
	        predictions=predictions, references=labels, average="weighted", zero_division=1
	    )
	    recall = _get_metric("recall").compute(
	        predictions=predictions, references=labels, average="weighted", zero_division=1
	    )
	    # NOTE(review): unlike precision/recall, the `f1` metric does not appear
	    # to accept `zero_division` (the v2 snippet omits it too); forwarding an
	    # unsupported argument would raise a TypeError, so it is not passed here.
	    f1 = _get_metric("f1").compute(
	        predictions=predictions, references=labels, average="weighted"
	    )

	    return {
	        "accuracy": accuracy["accuracy"],
	        "precision": precision["precision"],
	        "recall": recall["recall"],
	        "f1": f1["f1"],
	    }

	# Hyper-parameters and bookkeeping for the LoRA fine-tuning run
	training_args = TrainingArguments(
	    # Output locations and logging
	    output_dir='./model',
	    logging_dir='./logs',
	    logging_steps=10,
	    report_to="none",  # Disable WandB logging
	    # Optimisation schedule
	    num_train_epochs=2,  # Set to 2 epochs for testing, adjust as needed
	    per_device_train_batch_size=8,
	    per_device_eval_batch_size=16,
	    warmup_steps=500,
	    weight_decay=0.01,
	    # Evaluate and checkpoint once per epoch; reload the checkpoint with the
	    # best accuracy when training finishes
	    eval_strategy="epoch",
	    save_strategy="epoch",
	    load_best_model_at_end=True,
	    metric_for_best_model="accuracy",
	)

	# Build the Trainer around the LoRA-wrapped model
	trainer = Trainer(
	    model=model,
	    args=training_args,
	    tokenizer=tokenizer,
	    compute_metrics=compute_metrics,
	    train_dataset=train_dataset,
	    eval_dataset=val_dataset,
	)

	# Run training, then score the resulting model on the validation split
	train_output = trainer.train()
	eval_results = trainer.evaluate()

	# Persist the LoRA fine-tuned model
	model.save_pretrained("lora_fine_tuned_darijaBERT")

	# Report the outcome
	print("Training Complete")
	print("Evaluation Results:", eval_results)
```

- **LoRA Configuration**: Configured with `r=8`, `lora_alpha=32`, and `lora_dropout=0.1` for efficient fine-tuning.
- **PEFT Library**: Integrated LoRA using the `peft` library and wrapped the `AutoModelForSequenceClassification` with LoRA adapters.
- **Checkpointing**: Saved the LoRA fine-tuned model at the end of training for future use.

### v2

```python
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
import evaluate
import numpy as np

# Base DarijaBERT model with a 4-label sequence-classification head
base_model = AutoModelForSequenceClassification.from_pretrained("SI2M-Lab/DarijaBERT", num_labels=4)

# LoRA adapter settings for the sequence-classification task
lora_config = LoraConfig(
    r=8,  # Low-rank dimension
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
    task_type=TaskType.SEQ_CLS,  # Sequence Classification
)

# Wrap the base model with the LoRA adapters
model = get_peft_model(base_model, lora_config)

# Define metrics
# Metric objects are memoised here so that `evaluate.load` runs once per
# metric instead of on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called *name*, loading it on first use."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1.

    Args:
        eval_pred: Pair ``(predictions, labels)``; the class index is taken
            as the argmax of ``predictions`` over the last axis.

    Returns:
        dict with the keys ``accuracy`` and ``f1``.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)

    accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)
    f1 = _get_metric("f1").compute(predictions=predictions, references=labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"]
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./lora_finetuned_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    # Same spelling as the other training snippets in this document
    # (`evaluation_strategy` is the older, deprecated name of this argument).
    eval_strategy="epoch",
    save_strategy="epoch",  # must match eval_strategy for load_best_model_at_end
    logging_dir="./logs",
    logging_steps=50,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",  # Disable wandb
)

# Trainer
# NOTE: `tokenizer` is not created in this snippet; it is expected to exist
# already from the Tokenization step.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained("lora_finetuned_darijaBERT")

# Evaluate on the held-out test set (passed explicitly, overriding eval_dataset)
test_results = trainer.evaluate(test_dataset)
print("Test Results:", test_results)
```

- **LoRA Fine-Tuning**:
    - Applied LoRA to optimize memory and training time.
    - Fine-tuned the model using the labeled dataset.
- **Evaluation**: Evaluated the model on the test set after fine-tuning.