### Importation des différentes bibliothèques

In [None]:
import pandas as pd
from transformers import AutoTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
import pandas as pd
from huggingface_hub import notebook_login

### Importation du dataset

In [2]:
df = pd.read_csv('/kaggle/input/allocine-french-movie-reviews/train.csv')

In [3]:
df

Unnamed: 0.1,Unnamed: 0,film-url,review,polarity
0,0,http://www.allocine.fr/film/fichefilm-135259/c...,Si vous cherchez du cinéma abrutissant à tous ...,0
1,1,http://www.allocine.fr/film/fichefilm-172430/c...,"Trash, re-trash et re-re-trash...! Une horreur...",0
2,2,http://www.allocine.fr/film/fichefilm-15105/cr...,"Et si, dans les 5 premières minutes du film, l...",0
3,3,http://www.allocine.fr/film/fichefilm-188629/c...,Mon dieu ! Quelle métaphore filée ! Je suis ab...,0
4,4,http://www.allocine.fr/film/fichefilm-23514/cr...,"Premier film de la saga Kozure Okami, ""Le Sabr...",1
...,...,...,...,...
159995,159995,http://www.allocine.fr/film/fichefilm-132387/c...,Un rythme bien trop lent et un Ashton Kutcher ...,0
159996,159996,http://www.allocine.fr/film/fichefilm-53313/cr...,Monsieur Duchovny vous êtes aussi piètre acteu...,0
159997,159997,http://www.allocine.fr/film/fichefilm-248258/c...,Complètement différent des films de la série C...,1
159998,159998,http://www.allocine.fr/film/fichefilm-268731/c...,Alors franchement pour le moment c'est le meil...,1


### Création de la classe qui va lire le dataset et faire la tokenization des textes

In [None]:
class Data(Dataset):
    def __init__(self, csv_file, tokenizer, max_len):
        self.df = pd.read_csv(csv_file)
        self.max_len = max_len
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.df['review'][index]
        label = self.df['polarity'][index]
        inputs =  self.tokenizer(text=text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt')
        return {
            'input_ids':inputs["input_ids"].squeeze(0),
            "attention_mask": inputs['attention_mask'].squeeze(0),
            'label': torch.tensor(label)
        }

### Conversion de notre dataset en dataLoader avec un batch et faire en même temps le shuffling

In [None]:
def dataloader(dataset, batch_size, shuffle):
    return  DataLoader(dataset=dataset, batch_size= batch_size, shuffle=shuffle)


### Fonction pour charger le model pré-entrainé

In [None]:
def model_maker(model_name):
    return BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

### Utilisation des différents classes et fonctions crées

In [None]:
tokenizer = AutoTokenizer.from_pretrained('camembert-base')
dataset = Data(csv_file="/kaggle/input/allocine-french-movie-reviews/train.csv", tokenizer, max_len=120)
data_loader = dataloader(dataset, 2, True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Après tout celà on va créer une fonction dans laquelle se fera l'entrainement de notre model

In [None]:

def main(num_epochs, learning_rate):
    model = model_maker("camembert-base")
    model.to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

            loss = outputs.loss

            loss.backward()
            optimizer.step()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')
    return model

### Utilisation de la fonction pour l'entrainement

In [5]:
model = main(5, 0.00001)

You are using a model of type camembert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


Downloading model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['encoder.layer.9.attention.output.LayerNorm.bias', 'encoder.layer.2.attention.self.query.weight', 'encoder.layer.4.output.LayerNorm.bias', 'encoder.layer.3.attention.output.dense.weight', 'encoder.layer.8.attention.output.dense.weight', 'encoder.layer.2.output.dense.bias', 'encoder.layer.6.output.LayerNorm.weight', 'encoder.layer.8.output.LayerNorm.weight', 'encoder.layer.2.attention.self.key.bias', 'encoder.layer.9.output.LayerNorm.bias', 'encoder.layer.3.attention.self.query.weight', 'encoder.layer.3.output.LayerNorm.weight', 'encoder.layer.3.output.LayerNorm.bias', 'encoder.layer.7.attention.self.value.weight', 'encoder.layer.9.attention.self.query.weight', 'encoder.layer.2.attention.output.dense.weight', 'encoder.layer.6.attention.self.query.bias', 'encoder.layer.5.attention.self.value.weight', 'encoder.layer.9.attention.self.value.bias', 'encod

Epoch [1/5], Loss: 0.6437475681304932
Epoch [2/5], Loss: 0.7277877330780029
Epoch [3/5], Loss: 0.6856991052627563
Epoch [4/5], Loss: 0.693649411201477
Epoch [5/5], Loss: 0.6931495666503906


In [6]:
from huggingface_hub import login
login("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")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:

tokenizer = AutoTokenizer.from_pretrained('camembert-base')

In [7]:
model.push_to_hub('Alwaly/french_sentiment_analysis')

pytorch_model.bin:   0%|          | 0.00/443M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Alwaly/french_sentiment_analysis/commit/fe0dd55317fe4cdff7e07911f2320b92c59ccd97', commit_message='Upload BertForSequenceClassification', commit_description='', oid='fe0dd55317fe4cdff7e07911f2320b92c59ccd97', pr_url=None, pr_revision=None, pr_num=None)

In [9]:
tokenizer.push_to_hub('Alwaly/french_sentiment_analysis')

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Alwaly/french_sentiment_analysis/commit/1cbd0046a95ef70a1ba2347fe302c68c8e2a470e', commit_message='Upload tokenizer', commit_description='', oid='1cbd0046a95ef70a1ba2347fe302c68c8e2a470e', pr_url=None, pr_revision=None, pr_num=None)

In [12]:
token = tokenizer(text="Comment allez-vous",
            truncation=True,
            padding='max_length',
            return_tensors='pt')

In [17]:
inputs = token.to(device)

In [19]:
predicted_class

1

In [20]:
test = pd.read_csv('/kaggle/input/allocine-french-movie-reviews/test.csv')

In [24]:
del test["film-url"]
del test["Unnamed: 0"]

In [30]:
inputss = test['review']
label = test['polarity']

In [33]:
pred =[]
for i in inputss:
    token = tokenizer(text=i,
            truncation=True,
            padding='max_length',
            return_tensors='pt')
    inputs = token.to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted probabilities and class labels
    logits = outputs.logits
    predicted_class = logits.argmax().item()
    pred.append(predicted_class)

In [34]:
pred

0.4796

In [35]:
logits

tensor([[-0.0106, -0.0084]], device='cuda:0')