# This notebook is running in Google Colab with a GPU runtime. I have fine-tuned the model and tested it locally to ensure it works correctly.

In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
!pip install transformers datasets peft



In [1]:
import numpy as np
import pandas as pd
import torch
import os
from transformers import BertTokenizer, BertForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from peft import LoraConfig, get_peft_model, PeftModel
from torch.utils.data import DataLoader
from datasets import Dataset
pd.set_option('display.max_colwidth', None)


  from .autonotebook import tqdm as notebook_tqdm


# Data Preprocessing

In [4]:
data = pd.read_csv('/content/drive/MyDrive/ml_project/balanced_data.csv')
data.head()

Unnamed: 0,cleaned_text,rating
0,pretty famous hotel nashville right not nashville instead airport agree ton review sprawling word day come place let break little location outside town need car ride benefit airport minute away not benefit book day facility standard hotel ok well room stay definitely outdated room believe new wing well view outside fantastic view parking lot remind awful big box retail store suburbia interior room view massive atrium balcony shoot get place challenge size confusing layout think vegas hotel story tall spread area size kind like maze lot restaurant bar guess bad pool locate different part hotel impressive adequate gym include gym ok ok room hotel not massive resort room service ouch impression count opryland completely screw valet suck bellman basically quarantine luggage say cart touch seriously extremely poor long trip want room hold fall apart lazy valet gladly take tip move luggage cart foot impound detail absolutely infuriate wifey mad pretty calm understand people service resort well impression count know hotel nashville omni include right fee oh lord know come agree ashley h get rate pretty fair guess say bill come night resort fee crappy gym fail park super walmart like surface lot terrible valet day taxis taxis nickle dim crap suck no value atrium huge wing kind neat cruise way negative issue opryland business maybe look room downtown,Negative
1,happy service today go crappy place girl no idea wtf handle gel nail place call place booked wait time literally minute time drive happy ly nail cute sure care nail ask want coat gel etc well price basic gel color happy salon compare place reno thank ly,Positive
2,fyi charge certain item weight way sorta similar kiwi live literally block away time inside place walk past lot exactly motive try place mean bright orange mural giant chicken sorta walk get classic white dark meat chicken tender wishbone mac cheese pro cute homey decor give invite vibe meat chicken tender cook beautifully dry hard bite eat literally pull chicken tender apart hand put fight pretty nice variety sauce display new addition citrus honey butter honey butter literally gold know probably suppose biscuit eat chicken fall love combination legit take rest home get to mac cheese definitely star taste awesome bit heat end presentation use bit work noodle sorta fall apart enjoy con coating chicken taste like get lot flavor sauce dip get chipotle lime mayo pommery honey mustard rename chipotle mayo like chipotle mayo no creaminess absolutely no lime flavor honey mustard lack honey food wishbone definitely fare overpriced eat probably snack want spend butt ton money,Neutre
3,wait minute table sure reservation wait get small table corner far enjoy band get earl gray garden drink taste water enjoy dinner get ms moons bulgogi rice bowl thinly slice marinate ribeye steak spinach housemade kimchi add fried egg good portion size small scale lamb slider bit dry small scale,Neutre
4,grand opening chinese joint south broad street trenton postage stamp size dining area table eat chair wait away space clean kitchen shiny open little week menu extensive basically traditional chinese american emporium authentic want good chinois not park wheel broad place orange flavor chicken load styro container eat chicken fresh not deeply frozen sauce gloopy orange color taste citrus weak despite orange peel mix fried rice bright yellow dye bullion stock morning plenty roast pork veggie not egg scramble people fuzhounese province north old canton strait taiwan cuisine similar cantonese influence tend sweetness close allow success chinese war interested soon new clean owner interest get thing right remember deal not disappoint pro tip serve hot tea,Neutre


In [5]:
def categorize(rating):
    if rating == 'Negative':
        return 0
    elif rating == 'Neutre':
        return 1
    else:
        return 2
data['labels'] = data['rating'].apply(categorize)

In [6]:
data.head()

Unnamed: 0,cleaned_text,rating,labels
0,pretty famous hotel nashville right not nashville instead airport agree ton review sprawling word day come place let break little location outside town need car ride benefit airport minute away not benefit book day facility standard hotel ok well room stay definitely outdated room believe new wing well view outside fantastic view parking lot remind awful big box retail store suburbia interior room view massive atrium balcony shoot get place challenge size confusing layout think vegas hotel story tall spread area size kind like maze lot restaurant bar guess bad pool locate different part hotel impressive adequate gym include gym ok ok room hotel not massive resort room service ouch impression count opryland completely screw valet suck bellman basically quarantine luggage say cart touch seriously extremely poor long trip want room hold fall apart lazy valet gladly take tip move luggage cart foot impound detail absolutely infuriate wifey mad pretty calm understand people service resort well impression count know hotel nashville omni include right fee oh lord know come agree ashley h get rate pretty fair guess say bill come night resort fee crappy gym fail park super walmart like surface lot terrible valet day taxis taxis nickle dim crap suck no value atrium huge wing kind neat cruise way negative issue opryland business maybe look room downtown,Negative,0
1,happy service today go crappy place girl no idea wtf handle gel nail place call place booked wait time literally minute time drive happy ly nail cute sure care nail ask want coat gel etc well price basic gel color happy salon compare place reno thank ly,Positive,2
2,fyi charge certain item weight way sorta similar kiwi live literally block away time inside place walk past lot exactly motive try place mean bright orange mural giant chicken sorta walk get classic white dark meat chicken tender wishbone mac cheese pro cute homey decor give invite vibe meat chicken tender cook beautifully dry hard bite eat literally pull chicken tender apart hand put fight pretty nice variety sauce display new addition citrus honey butter honey butter literally gold know probably suppose biscuit eat chicken fall love combination legit take rest home get to mac cheese definitely star taste awesome bit heat end presentation use bit work noodle sorta fall apart enjoy con coating chicken taste like get lot flavor sauce dip get chipotle lime mayo pommery honey mustard rename chipotle mayo like chipotle mayo no creaminess absolutely no lime flavor honey mustard lack honey food wishbone definitely fare overpriced eat probably snack want spend butt ton money,Neutre,1
3,wait minute table sure reservation wait get small table corner far enjoy band get earl gray garden drink taste water enjoy dinner get ms moons bulgogi rice bowl thinly slice marinate ribeye steak spinach housemade kimchi add fried egg good portion size small scale lamb slider bit dry small scale,Neutre,1
4,grand opening chinese joint south broad street trenton postage stamp size dining area table eat chair wait away space clean kitchen shiny open little week menu extensive basically traditional chinese american emporium authentic want good chinois not park wheel broad place orange flavor chicken load styro container eat chicken fresh not deeply frozen sauce gloopy orange color taste citrus weak despite orange peel mix fried rice bright yellow dye bullion stock morning plenty roast pork veggie not egg scramble people fuzhounese province north old canton strait taiwan cuisine similar cantonese influence tend sweetness close allow success chinese war interested soon new clean owner interest get thing right remember deal not disappoint pro tip serve hot tea,Neutre,1


In [7]:
data['cleaned_text'].duplicated().sum()

np.int64(0)

In [8]:
data['cleaned_text'].isna().sum()

np.int64(1)

In [9]:
data.dropna(subset=['cleaned_text'], inplace=True)

In [10]:
data['cleaned_text'].isna().sum()

np.int64(0)

In [11]:
dataset = Dataset.from_pandas(data, preserve_index = False)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['cleaned_text', 'rating', 'labels'],
        num_rows: 119999
    })
    test: Dataset({
        features: ['cleaned_text', 'rating', 'labels'],
        num_rows: 30000
    })
})


## Data Tokenization

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_function(examples):
    return tokenizer(
        examples["cleaned_text"],
        padding="max_length",
        truncation = True,
        max_length=64
    )
tokenized_dataset = dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/119999 [00:00<?, ? examples/s]

Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

In [13]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['cleaned_text', 'rating', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 119999
    })
    test: Dataset({
        features: ['cleaned_text', 'rating', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 30000
    })
})


In [14]:
tokenized_dataset = tokenized_dataset.remove_columns(["cleaned_text", "rating"])


In [15]:
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 119999
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 30000
    })
})


## Lora Configuration

In [16]:
num_labels = len(data['rating'].unique())

model= BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

lora_config = LoraConfig(
    task_type='SEQ_CLS',
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.2707


In [None]:
train_dataset = tokenized_dataset["train"]
test_dataset = tokenized_dataset["test"]

'\ntrain_loader = DataLoader(train_dataset, bach_size=16, shuffle=True)\ntest_dataset = DataLoader(test_dataset,bach_size=16)'

## Train The Model

In [18]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results_bert",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    push_to_hub=False,
    logging_dir="./logs_bert",
    logging_steps=10,
    fp16=True,
    dataloader_pin_memory=True,
    optim="adamw_torch",
    report_to="none"  # ✅ désactive W&B et tout autre outil de reporting
)


In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # ou val_dataset si tu as une validation
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.5305,0.591657
2,0.6319,0.571758
3,0.6122,0.567811
4,0.5507,0.564988
5,0.4896,0.561348


TrainOutput(global_step=18750, training_loss=0.5914151452128092, metrics={'train_runtime': 1722.4852, 'train_samples_per_second': 348.331, 'train_steps_per_second': 10.885, 'total_flos': 1.980182057065344e+16, 'train_loss': 0.5914151452128092, 'epoch': 5.0})

## Evaluate The Model

In [20]:
results = trainer.evaluate()
print(results)
# Récupérer les prédictions finales
predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(-1)
y_true = predictions.label_ids

# Accuracy
acc = accuracy_score(y_true, y_pred)
print("Accuracy:", acc)

# Confusion Matrix
conf = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", conf)

# Classification Report
target_classes = ["classe1", "classe2", "classe3"]  # à adapter à ton dataset
class_rep = classification_report(y_true, y_pred, target_names=target_classes)
print("Classification Report:\n", class_rep)


{'eval_loss': 0.5613481998443604, 'eval_runtime': 38.8215, 'eval_samples_per_second': 772.768, 'eval_steps_per_second': 48.298, 'epoch': 5.0}
Accuracy: 0.7578666666666667
Confusion Matrix:
 [[7940 1748  254]
 [1706 6520 1747]
 [ 246 1563 8276]]
Classification Report:
               precision    recall  f1-score   support

     classe1       0.80      0.80      0.80      9942
     classe2       0.66      0.65      0.66      9973
     classe3       0.81      0.82      0.81     10085

    accuracy                           0.76     30000
   macro avg       0.76      0.76      0.76     30000
weighted avg       0.76      0.76      0.76     30000



## Save Model and Tokenizer

In [24]:
output_dir = "/content/drive/MyDrive/ml_project/BERT"
trainer.save_model(output_dir)      # sauvegarde le modèle
tokenizer.save_pretrained(output_dir)  # sauvegarde aussi le tokenizer


('/content/drive/MyDrive/ml_project/BERT/tokenizer_config.json',
 '/content/drive/MyDrive/ml_project/BERT/special_tokens_map.json',
 '/content/drive/MyDrive/ml_project/BERT/vocab.txt',
 '/content/drive/MyDrive/ml_project/BERT/added_tokens.json')

## Test The Model

In [2]:
from transformers import BertForSequenceClassification, BertTokenizer

model_path = "BERT_LORA"

# Charger le modèle fine-tuné avec le bon nombre de labels
num_labels = 3  # ⚠️ doit correspondre à ton entraînement
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=num_labels)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Exemple de test
example_text = ["The food was okay, nothing special but not bad either."]

inputs = tokenizer(example_text, padding="max_length", truncation=True, max_length=64, return_tensors="pt")

model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = logits.argmax(dim=-1).item()

id2label = {0: "Negative", 1: "Neutre", 2: "Positive"}
print("Predicted class:", id2label[predicted_class_id])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Predicted class: Neutre
