<a href="https://colab.research.google.com/github/BIGMOUSSA/corona_sent_anal_transformers/blob/master/DL2_assign_3_bert_model_multi_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install -q datasets evaluate transformers[sentencepiece]

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/519.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/519.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m512.0/519.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m95.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m

In [3]:
!pip install -q wandb

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m218.8/218.8 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for pathtools (setup.py) ... [?25l[?25hdone


In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
label_mapping = {
    "Extremely Negative": 0,
    "Negative" : 1,
    "Neutral": 2,
    "Positive": 3,
    "Extremely Positive" : 4,
}

In [4]:
from torch.utils.data  import Dataset, DataLoader
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, PyTorchModelHubMixin
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch.nn as nn
from torch.optim import AdamW
import os
import wandb
import numpy as np

configs = {
    "model_name" : "roberta-base",
    "max_length" : 80,
    "hidden_state" : 768,
    "csv_fil" : "/content/drive/MyDrive/DIT_DL2/nlp_clean.csv",
    "batch_size" : 32,
    "learning_rate" : 2e-5,
    "n_epochs" : 3,
    "n_classes" : 5
}
class MyDataset(Dataset):
    def __init__(self, csv_file, tokenizer_name, max_length):
        self.df = pd.read_csv(csv_file, encoding='ISO-8859-1').iloc[:2000]
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length
    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.df["text"].to_list()
        label = self.df["labels"].to_list()
        inputs = self.tokenizer(
            text = text,
            max_length = self.max_length,
            padding = 'max_length',
            truncation = True,
            return_tensors = "pt"
        )


        return {
            "input_ids" : inputs["input_ids"][index],
            "attention_mask" : inputs["attention_mask"][index],
            "labels" : torch.tensor(label[index], dtype=torch.long)
        }

def dataloader(dataset, batch_size, shuffle):
    return DataLoader(dataset = dataset,
                      batch_size = batch_size,
                      shuffle = shuffle
                      )



class CustomModel(nn.Module, PyTorchModelHubMixin):
    def __init__(self, model_name, n_classes):
        super(CustomModel, self).__init__()
        self.config = config = AutoConfig.from_pretrained("roberta-base", num_labels=n_classes)
        self.pretrained_model = AutoModelForSequenceClassification.from_pretrained(model_name, config = self.config)

    def forward(self, input_ids, attention_mask):
        output = self.pretrained_model(input_ids = input_ids, attention_mask = attention_mask)

        return output





def train_step(model, train_loader, optimizer, loss_fn, device):
    model.train()

    total_loss  = 0

    for data in tqdm(train_loader, total=len(train_loader)):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        label = data['labels'].to(device)
        #print(label)
        optimizer.zero_grad()
        output = model(input_ids = input_ids, attention_mask = attention_mask)
        loss = loss_fn(output.logits, label)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(train_loader)


####
from sklearn.metrics import accuracy_score, classification_report

def validation_step(model, validation_loader, loss_fn, device):
    model.eval()
    predictions = []
    actual_labels = []
    valid_loss = []

    with torch.no_grad():
        for batch in validation_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)

            valid_loss.append(loss.item())

            _, preds = torch.max(outputs.logits, dim=1)

            predictions.extend(preds.cpu().tolist())

            actual_labels.extend(labels.cpu().tolist())


    accuracy = accuracy_score(actual_labels, predictions)


    return np.mean(valid_loss), accuracy

def save_checkpoint(model, checkpoint_filename):
    '''
        save the checkpoint after training
    '''

    state = {
        'classifier': model.classifier,
        'model_state_dict': model.state_dict(),
        'class_to_idx' : model.class_to_idx
    }

    torch.save(state, checkpoint_filename)

def main():
    wandb.init(project = "bert_corana_sent_anal")
    dataset = MyDataset(csv_file = configs["csv_fil"],
                        tokenizer_name= configs["model_name"],
                        max_length= configs["max_length"],
                        )
    train_dataset, test_dataset = train_test_split(dataset, test_size=0.3, shuffle=True)

    train_loader = dataloader(dataset=train_dataset,
                              batch_size= configs["batch_size"],
                              shuffle=True)

    valid_loader = dataloader(dataset=test_dataset, batch_size= configs["batch_size"], shuffle=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = CustomModel(model_name = configs["model_name"], n_classes= configs["n_classes"])

    model.to(device)

    loss_fn = nn.CrossEntropyLoss()

    optimizer = AdamW(model.parameters(), lr = configs["learning_rate"])
    for epoch in range(configs["n_epochs"]):
        loss_train = train_step(model, train_loader, optimizer, loss_fn, device)
        loss_valid, accuracy = validation_step(model = model, validation_loader = valid_loader, device=device, loss_fn=loss_fn)

        wandb.log({"loss_train":loss_train,
                  "loss_valid" : loss_valid,
                  "accuracy" : accuracy})


    #sauvegarder notre model
    labnames = list(label_mapping.keys())

    repo_name = "Peed911/Roberta_corona_class"
    tokenizer = tokenizer = AutoTokenizer.from_pretrained(configs['model_name'])
    tokenizer.push_to_hub(repo_id = repo_name)
    model.config.id2label = {str(i) : lbl for i, lbl in enumerate(labnames)}
    model.config.label2id = {lbl : str(i) for i, lbl in enumerate(labnames)}
    model.push_to_hub(repo_id = repo_name)
#if __name__ == "__main__":
    #main()

In [5]:
wandb.init(project = "bert_corana_sent_anal2")
dataset = MyDataset(csv_file = configs["csv_fil"],
                    tokenizer_name= configs["model_name"],
                    max_length= configs["max_length"],
                    )
train_dataset, test_dataset = train_test_split(dataset, test_size=0.3, shuffle=True)

train_loader = dataloader(dataset=train_dataset,
                          batch_size= configs["batch_size"],
                          shuffle=True)

valid_loader = dataloader(dataset=test_dataset, batch_size= configs["batch_size"], shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#model = CustomModel(model_name = configs["model_name"], n_classes= configs["n_classes"])

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
config = AutoConfig.from_pretrained("roberta-base", num_labels = configs["n_classes"])
model = AutoModelForSequenceClassification.from_pretrained("roberta-base", config=config)

model.to(device)

loss_fn = nn.CrossEntropyLoss()

optimizer = AdamW(model.parameters(), lr = configs["learning_rate"])
for epoch in range(configs["n_epochs"]):
    loss_train = train_step(model, train_loader, optimizer, loss_fn, device)
    loss_valid, accuracy = validation_step(model = model, validation_loader = valid_loader, device=device, loss_fn=loss_fn)

    wandb.log({"loss_train":loss_train,
              "loss_valid" : loss_valid,
              "accuracy" : accuracy})


labnames = list(label_mapping.keys())

# launch the model to the hub

repo_name = "Peed911/Roberta_corona_class"

tokenizer.push_to_hub(repo_id = repo_name)
model.config.id2label = {str(i) : lbl for i, lbl in enumerate(labnames)}
model.config.label2id = {lbl : str(i) for i, lbl in enumerate(labnames)}
model.push_to_hub(repo_id = repo_name)
#model.config.push_to_hub(repo_id = repo_name)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁▂█
loss_train,█▇▁
loss_valid,█▁▂

0,1
accuracy,0.34333
loss_train,1.34449
loss_valid,1.53383


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 44/44 [00:17<00:00,  2.48it/s]
100%|██████████| 44/44 [00:18<00:00,  2.43it/s]
100%|██████████| 44/44 [00:18<00:00,  2.39it/s]


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Peed911/Roberta_corona_class/commit/df8806feacc78d1dfaa2018225e89372ec3ea58e', commit_message='Upload RobertaForSequenceClassification', commit_description='', oid='df8806feacc78d1dfaa2018225e89372ec3ea58e', pr_url=None, pr_revision=None, pr_num=None)

## USE THE PRETRAINED MODEL WITH GRADIO FOR INFERENCE

In [6]:
!pip install -q gradio

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.1/20.1 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.2/298.2 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.4/75.4 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.5/59.5 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.9/129.9 kB[0m [31m12.6 MB/s[0

In [7]:
def predict(model, text):
  tokenizer = AutoTokenizer.from_pretrained("Peed911/Roberta_corona_class")
  toks = tokenizer(text)
  output = model(**toks)
  _, pred = torch.max(output, dim = 1)
  return pred

In [8]:
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and model
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("Peed911/Roberta_corona_class")
model = AutoModelForSequenceClassification.from_pretrained("Peed911/Roberta_corona_class")
label_mapping = {
    "Extremely Negative": 0,
    "Negative" : 1,
    "Neutral": 2,
    "Positive": 3,
    "Extremely Positive" : 4,
}
labels_name = [ "Extremely Negative", "Negative", "Neutral", "Positive", "Extremely Positive"]
# Define the prediction function
def classify_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    predicted_names = labels_name[predicted_class]
    return predicted_class,  predicted_names

# Create a Gradio interface
iface = gr.Interface(
    fn=classify_text,
    inputs=gr.inputs.Textbox(),
    outputs=[gr.outputs.Label(num_top_classes=5), gr.outputs.Textbox()],  # Adjust the number of top classes as needed
)

# Launch the Gradio interface
iface.launch()


Downloading (…)okenizer_config.json:   0%|          | 0.00/351 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/974 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

  inputs=gr.inputs.Textbox(),
  inputs=gr.inputs.Textbox(),
  inputs=gr.inputs.Textbox(),
  outputs=[gr.outputs.Label(num_top_classes=5), gr.outputs.Textbox()],  # Adjust the number of top classes as needed
  outputs=[gr.outputs.Label(num_top_classes=5), gr.outputs.Textbox()],  # Adjust the number of top classes as needed


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Note: opening Chrome Inspector may crash demo inside Colab notebooks.

To create a public link, set `share=True` in `launch()`.


<IPython.core.display.Javascript object>

