In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install numpy pandas torch scikit-learn matplotlib datasets transformers tqdm accelerate

In [8]:
# Imports
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import shutil
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from ast import literal_eval
from datasets import Dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import accelerate
import gc

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Run on device:", device)

def save_dataset(dataset: Dataset, path) -> None:
    dataset.save_to_disk(path)

def load_tokenizer(model_name):
    return AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

def load_model(model_name):
    return AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name, num_labels=7)

def load_dataset(tokenizer=None):
    dataset_path = "/content/drive/MyDrive/final_reviews.csv"
    df = pd.read_csv(dataset_path)
    df['labels'] = df['labels'].apply(literal_eval)

    dataset = Dataset.from_pandas(df=df)
    dataset = preprocessing_dataset(dataset, tokenizer=tokenizer)

    dataset.set_format("torch", columns=['review_tokens','review_attention_mask','labels'])
    dataset_loader = DataLoader(dataset=dataset, batch_size=2, shuffle=True)

    return dataset_loader

def preprocessing(sample, tokenizer, max_length=400):
    review_tokens = tokenizer(sample['review'], max_length=max_length, padding='max_length', truncation=True, return_tensors='pt', return_attention_mask=True)

    return {
        "review_tokens": review_tokens["input_ids"].flatten(),
        "review_attention_mask": review_tokens["attention_mask"].flatten(),
        "labels": sample['labels']
    }

def preprocessing_dataset(dataset: Dataset, tokenizer):
    return dataset.map(lambda sample: preprocessing(sample, tokenizer))

def compute_loss(predictions, review_labels):
    loss = nn.CrossEntropyLoss()(predictions, review_labels.to(torch.float32))
    return loss

def compute_accuracy(predictions, review_labels):
    predictions = torch.round(predictions).to(torch.int64)
    correct_predictions = (predictions == review_labels).sum().item()
    accuracy = correct_predictions / review_labels.size(0)
    return accuracy

import os
from transformers import WEIGHTS_NAME, CONFIG_NAME, AutoConfig, AutoTokenizer

def save_model(model, save_directory="/content/drive/MyDrive/EcoModel"):
    # Créer le répertoire s'il n'existe pas
    os.makedirs(save_directory, exist_ok=True)

    # Sauvegarder le modèle
    model.save_pretrained(save_directory)

    # Sauvegarder le tokenizer et la configuration
    tokenizer = AutoTokenizer.from_pretrained(model.config._name_or_path)
    tokenizer.save_pretrained(save_directory)

    # Sauvegarder la configuration
    model_config = AutoConfig.from_pretrained(model.config._name_or_path)
    model_config.save_pretrained(save_directory)

def train(epochs, model, tokenizer, training_dataloader, optimizer, scheduler, accelerator):
    for epoch in range(epochs):
        losses = []
        accuracies = []
        correct_predictions = 0
        total_samples = 0

        for batch in tqdm(training_dataloader):
            accelerator.free_memory()
            optimizer.zero_grad()

            review_tokens = batch['review_tokens'].to(device)
            review_attention_mask = batch['review_attention_mask'].to(device)
            review_labels = batch['labels'].to(device)
            # Forward pass
            output = model(input_ids=review_tokens, attention_mask=review_attention_mask)
            predictions = output[0]

            # Compute the loss
            loss = compute_loss(predictions, review_labels)
            losses.append(loss.item())

            # Backward pass
            accelerator.backward(loss)
            optimizer.step()
            scheduler.step()

            # Accuracy calculation
            accuracy = compute_accuracy(predictions, review_labels)
            accuracies.append(accuracy)


        save_model(model)
        print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {np.mean(losses)}, Accuracy: {np.mean(accuracies)}")

        # Clear memory
        del output
        del loss
        del review_tokens
        del review_attention_mask
        del review_labels

        gc.collect()

Run on device: cuda


In [9]:
model_name = 'bert-base-uncased'
tokenizer = load_tokenizer(model_name=model_name)

dataset_loader = load_dataset(tokenizer=tokenizer)

model = load_model(model_name=model_name).to(device)
model.train()

model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

accelerator = accelerate.Accelerator()
model, optimizer, dataset_loader, scheduler = accelerator.prepare(
    model, optimizer, dataset_loader, scheduler
)

n = 10
train(n, model, tokenizer, dataset_loader, optimizer, scheduler, accelerator)

Map:   0%|          | 0/1193 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 597/597 [03:47<00:00,  2.62it/s]


Epoch 1/10, Average Loss: -7.100556375792677, Accuracy: 0.15159128978224456


100%|██████████| 597/597 [03:49<00:00,  2.60it/s]


Epoch 2/10, Average Loss: -7.102503481622118, Accuracy: 0.152428810720268


100%|██████████| 597/597 [03:49<00:00,  2.60it/s]


Epoch 3/10, Average Loss: -7.114442256028129, Accuracy: 0.15326633165829145


100%|██████████| 597/597 [03:49<00:00,  2.60it/s]


Epoch 4/10, Average Loss: -7.122710316824154, Accuracy: 0.1423785594639866


100%|██████████| 597/597 [03:49<00:00,  2.60it/s]


Epoch 5/10, Average Loss: -7.092774093650294, Accuracy: 0.152428810720268


100%|██████████| 597/597 [03:49<00:00,  2.60it/s]


Epoch 6/10, Average Loss: -7.107466927724867, Accuracy: 0.15159128978224456


100%|██████████| 597/597 [03:49<00:00,  2.60it/s]


Epoch 7/10, Average Loss: -7.12902499942524, Accuracy: 0.14991624790619765


100%|██████████| 597/597 [03:49<00:00,  2.60it/s]


Epoch 8/10, Average Loss: -7.086641360567243, Accuracy: 0.15326633165829145


100%|██████████| 597/597 [03:49<00:00,  2.60it/s]


Epoch 9/10, Average Loss: -7.079664824895524, Accuracy: 0.15996649916247907


100%|██████████| 597/597 [03:50<00:00,  2.59it/s]


Epoch 10/10, Average Loss: -7.106210904305105, Accuracy: 0.1507537688442211
