In [None]:
pip install datasets evaluate

In [None]:
from transformers import Trainer, TrainingArguments, pipeline, AutoTokenizer, AutoModelForMaskedLM
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
from datasets import Dataset

In [None]:
# Task: classification of comments.
# Mount Google Drive at /content/drive; the dataset CSV is read from there and
# the trained model is saved back to it at the end of the notebook.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the cleaned Darija sentences CSV from the mounted Drive.
# Later cells read its `cleaned_text` column (tokenizer input) and its `label` column (class name).
df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP Project/NLP Dataset/sentences/darija _clean.csv')

In [None]:
# Independent copies of the loaded frame. df1 is the one fed to the
# train/validation/test split and to the label count.
# NOTE(review): df2 is not referenced by any other cell visible in this notebook — confirm it is still needed.
df1=df.copy()
df2=df.copy()

In [None]:
# Target proportions of the full dataset: 65% train / 15% validation / 20% test.

# Step 1: hold out 20% of all rows as the test set.
train_df, test_df = train_test_split(df1, test_size=0.2, random_state=42)

# Step 2: carve the validation set out of the remaining 80%.
# `test_size` is a fraction of the frame being split, not of the original data,
# so 15% of the whole dataset is 0.15 / 0.80 = 0.1875 of that remainder
# (passing 0.15 here would give only 12% validation and 68% train).
train_df, val_df = train_test_split(train_df, test_size=0.1875, random_state=42)

# Wrap each pandas split in a Hugging Face Dataset for tokenization and training.
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")
print(f"Test set size: {len(test_dataset)}")

In [None]:
# Load the DarijaBERT tokenizer and a DarijaBERT model with a sequence-classification head.
# A text-classification pipeline needs a classification head; a masked-LM head
# (AutoModelForMaskedLM) is not a supported model type for that task.
from transformers import AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("SI2M-Lab/DarijaBERT")
# num_labels=4 matches the four classes of label_map (negative, neutral, positive, mixed).
model = AutoModelForSequenceClassification.from_pretrained("SI2M-Lab/DarijaBERT", num_labels=4)

# NOTE: the classification head is freshly initialised at this point, so this
# pipeline's predictions are not meaningful until the model has been fine-tuned.
classification = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

In [None]:
# Tokenize function
def tokenize_function(examples):
    """Tokenize the `cleaned_text` entries of a batch with the notebook's tokenizer.

    Args:
        examples: Mapping that holds the texts to encode under "cleaned_text".

    Returns:
        Whatever the tokenizer returns for those texts, with every sequence
        padded to and truncated at 512 tokens.
    """
    texts = examples["cleaned_text"]
    encoding_options = {"padding": "max_length", "truncation": True, "max_length": 512}
    return tokenizer(texts, **encoding_options)

# Run the tokenizer over every split (train, validation, test), in batches.
train_dataset, val_dataset, test_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

In [None]:
# Inspect the training split after tokenization.
train_dataset

In [None]:
# Inspect the validation split after tokenization.
val_dataset

In [None]:
# Inspect the test split after tokenization.
test_dataset

In [None]:
# Count how many rows carry each raw value of the `label` column (before lowercasing and encoding).
df1["label"].value_counts()

In [None]:
# Class name -> integer id used as the training target.
label_map = {'negative': 0, 'neutral': 1, 'positive': 2, 'mixed': 3}


def _lowercase_label(example):
    """Return the row's `label` string converted to lowercase."""
    return {'label': example['label'].lower()}


def _encode_label(example):
    """Return the integer id of the row's (already lowercased) `label` as `labels`."""
    return {'labels': label_map[example['label']]}


# Pass 1: normalize the label strings to lowercase on every split.
train_dataset, val_dataset, test_dataset = [
    split.map(_lowercase_label)
    for split in (train_dataset, val_dataset, test_dataset)
]

# Pass 2: add the integer `labels` column looked up in label_map.
train_dataset, val_dataset, test_dataset = [
    split.map(_encode_label)
    for split in (train_dataset, val_dataset, test_dataset)
]


In [None]:
from transformers import AutoModelForSequenceClassification

# Load the pre-trained DarijaBERT model for classification.
# num_labels=4 matches the four entries of label_map (negative, neutral, positive, mixed).
# NOTE(review): a later cell assigns `model` again from the same checkpoint before
# training, so this load looks redundant — confirm before removing.
model = AutoModelForSequenceClassification.from_pretrained("SI2M-Lab/DarijaBERT", num_labels=4)

In [None]:
# Sanity check: collect the distinct encoded label ids present in each split.
train_label_ids = set(train_dataset['labels'])
val_label_ids = set(val_dataset['labels'])
test_label_ids = set(test_dataset['labels'])

print(f"Unique labels in the train dataset: {train_label_ids}")
print(f"Unique labels in the validation dataset: {val_label_ids}")
print(f"Unique labels in the test dataset: {test_label_ids}")

In [None]:
from transformers import Trainer, TrainingArguments, pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import torch
import pandas as pd
from datasets import Dataset
import numpy as np
import evaluate
import matplotlib.pyplot as plt

# DarijaBERT tokenizer plus the same checkpoint with a 4-class
# sequence-classification head (not the masked-LM head).
tokenizer = AutoTokenizer.from_pretrained("SI2M-Lab/DarijaBERT")
model = AutoModelForSequenceClassification.from_pretrained(
    "SI2M-Lab/DarijaBERT",
    num_labels=4,
)

# Expose only the model inputs and the target column, as torch tensors,
# on the two splits handed to the Trainer.
for split in (train_dataset, val_dataset):
    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Define compute_metrics function
# Metric objects are cached here so `evaluate.load` runs once per metric
# instead of four times on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called `name`, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: A `(predictions, labels)` pair. `predictions` holds one row
            of class scores per example; `labels` holds the reference class ids.

    Returns:
        dict: `accuracy`, `precision`, `recall` and `f1`. The last three are
        computed with `average="weighted"`.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)  # Get the predicted class index

    accuracy = _get_metric("accuracy").compute(predictions=predictions, references=labels)
    precision = _get_metric("precision").compute(predictions=predictions, references=labels, average="weighted")
    recall = _get_metric("recall").compute(predictions=predictions, references=labels, average="weighted")
    f1 = _get_metric("f1").compute(predictions=predictions, references=labels, average="weighted")

    return {
        "accuracy": accuracy["accuracy"],
        "precision": precision["precision"],
        "recall": recall["recall"],
        "f1": f1["f1"],
    }

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./darija_model',
    logging_dir='./logs',
    logging_steps=50,
    report_to="none",  # Disable WandB logging
    # Optimisation schedule
    num_train_epochs=10,
    warmup_steps=500,
    weight_decay=0.01,
    # Batching and data loading
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    dataloader_num_workers=4,
    # Evaluate and checkpoint once per epoch; the best checkpoint by
    # `accuracy` is reloaded when training finishes
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

# Build the Trainer: fine-tune on the training split, evaluate on the
# validation split, and score each evaluation with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
train_output = trainer.train()

# Snapshot the Trainer's log records for the inspection and plotting cells,
# which read `log_history`. The copy is taken before the extra evaluate() call
# below so the curves contain only the records logged during training.
log_history = list(trainer.state.log_history)

# Evaluate the model
eval_results = trainer.evaluate()

In [None]:
# Show the first 5 log records as an example
for record in log_history[:5]:
    print(record)

In [None]:
# Split the log records into two series: records that have a "loss" key are
# treated as training-loss points, records with "eval_loss" as validation points.
epochs = [log["epoch"] for log in log_history if "loss" in log]
train_losses = [log["loss"] for log in log_history if "loss" in log]
eval_losses = [log["eval_loss"] for log in log_history if "eval_loss" in log]
eval_epochs = [log["epoch"] for log in log_history if "eval_loss" in log]

# Check the sizes
print("Epochs:", len(epochs), "Train Losses:", len(train_losses))
print("Eval Epochs:", len(eval_epochs), "Eval Losses:", len(eval_losses))

# Plot Training vs Validation Loss
plt.figure(figsize=(10, 6))
plt.plot(epochs, train_losses, label="Training Loss", marker="o")
plt.plot(eval_epochs, eval_losses, label="Validation Loss", marker="o")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training vs Validation Loss")
plt.legend()
plt.grid()
plt.show()

# Tell the user when no validation loss was logged. The condition tests the
# validation-loss series, so the message names that series rather than accuracy.
if not eval_losses:
    print("Aucune métrique de loss de validation disponible à tracer.")

In [None]:

# Keep only the log records that report a validation accuracy.
accuracy_records = [log for log in log_history if "eval_accuracy" in log]
eval_accuracies = [log["eval_accuracy"] for log in accuracy_records]
eval_epochs_accuracy = [log["epoch"] for log in accuracy_records]

if not eval_accuracies:
    # Nothing to draw: no record carried an accuracy value.
    print("Aucune métrique de validation d'accuracy à tracer.")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(eval_epochs_accuracy, eval_accuracies, label="Validation Accuracy", marker="o")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Accuracy")
    ax.set_title("Validation Accuracy")
    ax.legend()
    ax.grid()
    plt.show()

In [None]:
# Evaluate the model on the test dataset
results = trainer.evaluate(test_dataset)

# Print the results
print(f"Test results: {results}")

In [None]:
# Destination folder on the mounted Drive for the fine-tuned artifacts.
save_path = "/content/drive/MyDrive/Colab Notebooks/NLP Project/SaveModel"

# Write the model first, then its tokenizer, into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_path)

print(f"Model and tokenizer saved to {save_path}")
