<a href="https://colab.research.google.com/github/BIGMOUSSA/corona_sent_anal_transformers/blob/master/DL2_assign_3_bert_model_multi_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!pip install -q datasets evaluate transformers[sentencepiece]

In [5]:
!pip install wandb



In [6]:
from huggingface_hub import PyTorchModelHubMixin

In [7]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [8]:
from torch.utils.data  import Dataset, DataLoader
from transformers import AutoTokenizer, BertModel
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import torch.nn as nn
#from transformers.optimization import AdamW
from torch.optim import AdamW
import os
import wandb
from transformers import AutoConfig

configs = {
    "model_name" : "bert-base-uncased",
    "max_length" : 80,
    "hidden_state" : 768,
    "csv_fil" : "/content/drive/MyDrive/DIT_DL2/nlp_clean.csv",
    "batch_size" : 16,
    "learning_rate" : 2e-5,
    "n_epochs" : 1
}
class MyDataset(Dataset):
    def __init__(self, csv_file, tokenizer_name, max_length):
        self.df = pd.read_csv(csv_file, encoding='ISO-8859-1').iloc[:20]
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length
    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        text = self.df["text"][index]
        # Convert numerical labels to one-hot encoded tensors
        num_classes = 5
        one_hot_labels = torch.zeros(len(self.df), num_classes)
        one_hot_labels.scatter_(1, torch.tensor(self.df["labels"]).unsqueeze(1), 1)
        label = one_hot_labels[index]
        inputs = self.tokenizer(
            text = text,
            max_length = self.max_length,
            padding = 'max_length',
            truncation = True,
            return_tensors = "pt"
        )

        return {
            "input_ids" : inputs["input_ids"],
            "attention_mask" : inputs["attention_mask"],
            "labels" : label
        }

def dataloader(dataset, batch_size, shuffle):
    return DataLoader(dataset = dataset,
                      batch_size = batch_size,
                      shuffle = shuffle
                      )


class CustomModel(nn.Module, PyTorchModelHubMixin):
    def __init__(self, model_name, n_classes):
        super(CustomModel, self).__init__()
        self.pretrained_model = BertModel.from_pretrained(model_name) #hidden_state 786 Bert_base
        self.classifier = nn.Linear(self.pretrained_model.config.hidden_size, n_classes)
        self.softmax = nn.Softmax(dim = 1)

    def forward(self, input_ids, attention_mask):
        output = self.pretrained_model(input_ids = input_ids, attention_mask = attention_mask)
        pooled_output = output.pooler_output
        #output = self.classifier(output.last_hidden_state)
        #output = self.softmax(output)
        output = self.classifier(pooled_output)
        return output




def train_step(model, train_loader, optimizer, loss_fn, device):
    model.train()

    total_loss  = 0

    for data in tqdm(train_loader, total=len(train_loader)):
        input_ids = data['input_ids'].squeeze(1).to(device)
        attention_mask = data['attention_mask'].to(device)
        label = data['labels'].to(device)
        #print(label)
        optimizer.zero_grad()
        output = model(input_ids = input_ids, attention_mask = attention_mask)
        loss = loss_fn(output, label)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    return total_loss / len(train_loader)


####
from sklearn.metrics import accuracy_score, classification_report

def validation_step(model, validation_loader, loss_fn, device):
    model.eval()
    predictions = []
    actual_labels = []
    valid_loss = []

    with torch.no_grad():
        for batch in validation_loader:
            input_ids = batch['input_ids'].squeeze(1).to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)  # Assuming labels are one-hot encoded

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs, labels)

            valid_loss.append(loss.item())

            _, preds = torch.max(outputs, dim=1)
            #print("pred : ", preds)
            predictions.extend(preds.cpu().tolist())

            # Convert one-hot encoded labels to class indices
            actual_indices = torch.argmax(labels, dim=1)
            #print("actual indice : ", actual_indices)
            actual_labels.extend(actual_indices.cpu().tolist())
            #print("actual label : ", actual_labels)

    accuracy = accuracy_score(actual_labels, predictions)
    #class_report = classification_report(actual_labels, predictions)

    return sum(valid_loss)/len(validation_loader), accuracy

def save_checkpoint(model, checkpoint_filename):
    '''
        save the checkpoint after training
    '''

    state = {
        'classifier': model.classifier,
        'model_state_dict': model.state_dict()
    }

    torch.save(state, checkpoint_filename)

def main():
    #csv_path = "data/cleaned/nlp_clean.csv"
    #csv_path = "/content/drive/MyDrive/DIT_DL2/nlp_clean.csv"
    wandb.init(project = "bert_corana_sent_anal")
    dataset = MyDataset(csv_file = configs["csv_fil"],
                        tokenizer_name= configs["model_name"],
                        max_length= configs["max_length"],
                        )
    train_dataset, test_dataset = train_test_split(dataset, test_size=0.3, shuffle=True)

    train_loader = dataloader(dataset=train_dataset,
                              batch_size= configs["batch_size"],
                              shuffle=True)

    valid_loader = dataloader(dataset=test_dataset, batch_size= configs["batch_size"], shuffle=True)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = CustomModel(model_name = configs["model_name"], n_classes=5)

    model.to(device)

    loss_fn = nn.CrossEntropyLoss()

    optimizer = AdamW(model.parameters(), lr = configs["learning_rate"])
    for epoch in range(configs["n_epochs"]):
        loss_train = train_step(model, train_loader, optimizer, loss_fn, device)
        loss_valid, accuracy = validation_step(model = model, validation_loader = valid_loader, device=device, loss_fn=loss_fn)
        wandb.log({"loss_train":loss_train,
                  "loss_valid" : loss_valid,
                  "accuracy" : accuracy})


    #sauvegarder notre model
    #save_checkpoint(model, "checkpoints.pt")
    labels_dict = {'Extremely Positive': 1, 'Positive': 4, 'Neutral': 3, 'Negative': 2, 'Extremely Negative' : 0}
    labnames = list(labels_dict.keys())
    #model.save_pretrained("bert-corana-sent_analysis")
    repo_name = "Peed911/bert_corona_tweet_sentiment_analysis"
    model.push_to_hub(repo_id=repo_name)
    tokenizer = AutoTokenizer.from_pretrained(configs["model_name"])
    tokenizer.push_to_hub(repo_id = repo_name)
    config = AutoConfig.from_pretrained(configs["model_name"], num_labels=5)
    config.id2label = {str(i) : lbl for i, lbl in enumerate(labnames)}
    config.label2id = {lbl : str(i) for i, lbl in enumerate(labnames)}
    config.push_to_hub(repo_id = repo_name)
if __name__ == "__main__":
    main()


[34m[1mwandb[0m: Currently logged in as: [33mdiallomous[0m. Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 1/1 [00:27<00:00, 27.65s/it]


pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [9]:
!pip install gradio

Collecting gradio
  Downloading gradio-3.41.2-py3-none-any.whl (20.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.1/20.1 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting fastapi (from gradio)
  Downloading fastapi-0.103.0-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.2/66.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ffmpy (from gradio)
  Downloading ffmpy-0.3.1.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.5.0 (from gradio)
  Downloading gradio_client-0.5.0-py3-none-any.whl (298 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.2/298.2 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx (from gradio)
  Downloading httpx-0.24.1-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [10]:
def predict(model, text):
  tokenizer = AutoTokenizer.from_pretrained("Peed911/bert_corona_tweet_sentiment_analysis")
  toks = tokenizer(text)
  output = model(**toks)
  _, pred = torch.max(output, dim = 1)
  return pred


In [11]:
model_pred = BertModel.from_pretrained("bert_corona_tweet_sentiment_analysis")
predict(model = model_pred, text = "this is insane")

OSError: ignored

In [None]:
import gradio as gr

def greet(name):
    return "Hello " + name + "!"

demo = gr.Interface(
    fn=greet,
    inputs=gr.Textbox(lines=2, placeholder="Name Here..."),
    outputs="text",
)
demo.launch()
