In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install datasets
!pip install transformers
!pip install torch
!pip install scikit-learn
! pip install -U accelerate
! pip install -U transformers

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
Successfully installed datasets-2.16.1 dill-0.3.7 multiprocess-0.70.15
Collecting accelerate
  Downloading accelerate-0.26.1-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected pack

In [3]:
import os
import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import torch
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Locations

# Datasets paths
DATASET_PATH = "/content/drive/MyDrive/Datasets/"
ORGANIC_DATASET_PATH = "/content/drive/MyDrive/Datasets/organic/"
CLIMATE_DATASET_PATH = "/content/drive/MyDrive/Datasets/climate/"
WATER_DATASET_PATH = "/content/drive/MyDrive/Datasets/water/"
SOCIAL_DATASET_PATH = "/content/drive/MyDrive/Datasets/social/"
GOVERNANCE_DATASET_PATH = "/content/drive/MyDrive/Datasets/governance/"
WASTE_DATASET_PATH = "/content/drive/MyDrive/Datasets/waste/"
ADVERSE_DATASET_PATH = "/content/drive/MyDrive/Datasets/adverse/"

# Models paths
MODELS_PATH = "/content/drive/MyDrive/EcoModels"

In [6]:
# Parameters
TEST_SIZE = 0.2
VAL_SIZE = 0.8
CATEGORIES = ['organic', 'climate', 'water', 'social', 'governance', 'waste', 'adverse']
LR = 1e-4
BATCH_SIZE = 1

In [7]:
class TrainerDataset(torch.utils.data.Dataset):
  def __init__(self, dataset: Dataset, tokenizer):
      self.reviews = dataset['review']
      self.rates = dataset['rate']
      self.input_ids = tokenizer(dataset['review'], padding='max_length', truncation=True, max_length=400, return_tensors='pt')['input_ids']
      self.labels = [rate + 1 for rate in self.rates]

  def __getitem__(self, idx):
      return {
          'input_ids': self.input_ids[idx],
          'labels': self.labels[idx]
      }

  def __len__(self):
      return len(self.reviews)

In [8]:
def load_tokenizer(model_name):
  return AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

def load_model(model_name):
  return AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name, num_labels=7)

def split_dataset(dataset: Dataset):
  train_dataset, temp_dataset = train_test_split(dataset, test_size=(TEST_SIZE), random_state=42)
  temp_dataset = Dataset.from_dict(temp_dataset)
  val_dataset, test_dataset = train_test_split(temp_dataset, test_size=(TEST_SIZE / (TEST_SIZE + VAL_SIZE)), random_state=42)
  return Dataset.from_dict(train_dataset), Dataset.from_dict(test_dataset), Dataset.from_dict(val_dataset)


def compute_metrics(eval_predictions):
  logits, labels = eval_predictions
  predictions = np.argmax(logits, axis=-1)
  f1 = f1_score(labels, predictions, average='weighted')
  return {"f1_score": f1}

In [11]:
model_name = 'bert-base-uncased'
tokenizer = load_tokenizer(model_name=model_name)

for category in CATEGORIES:
  model = load_model(model_name=model_name)
  dataset_path = DATASET_PATH + category + '/'
  dataset = Dataset.load_from_disk(dataset_path=dataset_path)

  # Splitting dataset in train test and validation datasets
  train_dataset, test_dataset, val_dataset = split_dataset(dataset=dataset)

  # Transforming datasets into torch Dataset objects
  train_dataset = TrainerDataset(dataset=train_dataset, tokenizer=tokenizer)
  test_dataset = TrainerDataset(dataset=test_dataset, tokenizer=tokenizer)
  val_dataset = TrainerDataset(dataset=val_dataset, tokenizer=tokenizer)

  # Training our model
  data_collator = DataCollatorWithPadding(tokenizer=tokenizer, max_length=400)

  trainer_args = TrainingArguments(output_dir=f'EynardM/sentiment-analysis-{category}',
                                    num_train_epochs=5,
                                    evaluation_strategy="epoch",
                                    save_strategy="epoch",
                                    learning_rate=LR,
                                    per_device_train_batch_size=5,
                                    load_best_model_at_end=True,
                                    push_to_hub=True)
  trainer = Trainer(
      model=model,
      args=trainer_args,
      train_dataset=train_dataset,
      eval_dataset=val_dataset,
      data_collator=data_collator,
      tokenizer=tokenizer,
      compute_metrics=compute_metrics
  )

  trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.98569,0.734372
2,No log,0.904594,0.734372
3,0.848500,0.894405,0.734372
4,0.848500,0.973965,0.734372
5,0.848500,0.451172,0.808799


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.842874,0.749006
2,No log,0.898398,0.749006
3,0.753700,0.94474,0.749006
4,0.753700,0.52483,0.808805
5,0.753700,0.515483,0.808805


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Score
1,No log,6.7e-05,1.0
2,No log,3.9e-05,1.0
3,0.010800,2.9e-05,1.0
4,0.010800,2.4e-05,1.0
5,0.010800,2.3e-05,1.0


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.686782,0.778488
2,No log,0.718848,0.778488
3,0.864600,0.547345,0.826485
4,0.864600,0.500915,0.827098
5,0.864600,0.44051,0.835929


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.692046,0.81573
2,No log,0.604793,0.81573
3,0.868100,0.710887,0.81573
4,0.868100,0.410567,0.861875
5,0.868100,0.377724,0.859538


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.716582,0.845825
2,No log,0.568472,0.845825
3,0.592100,0.636054,0.845825
4,0.592100,0.628126,0.845825
5,0.592100,0.626542,0.845825


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,F1 Score
1,No log,0.904262,0.690915
2,No log,0.920243,0.690915
3,0.958300,0.921221,0.690915
4,0.958300,0.920741,0.690915
5,0.958300,0.923563,0.690915


