# **Step 1: Import necessary libraries**

In [1]:
!pip install datasets
!pip install evaluate

from datasets import Dataset, DatasetDict
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig, Trainer, TrainingArguments
import torch
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import os
import matplotlib.pyplot as plt

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

# **Step 2: Load datasets from the data folder**

In [2]:
# Read each split from its CSV file under data/
train_df = pd.read_csv('data/train_data.csv')
test_df = pd.read_csv('data/test_data.csv')
validation_df = pd.read_csv('data/validation_data.csv')

# Wrap the three frames in a Hugging Face DatasetDict keyed by split name
# (the pandas index is not carried over: preserve_index=False)
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, preserve_index=False)
    for split_name, frame in (
        ('train', train_df),
        ('test', test_df),
        ('validation', validation_df),
    )
})


# **Step 3: Tokenization setup**

In [3]:
# Checkpoint name used to load the pre-trained BERT tokenizer
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize(batch):
    """Tokenize the 'ticket' column of a batch with padding and truncation enabled."""
    ticket_texts = batch['ticket']
    return tokenizer(ticket_texts, padding=True, truncation=True)


# Encode every split. With batched=True and batch_size=None each split is
# handed to `tokenize` as one batch (per the datasets `map` documentation).
emotion_encoded = dataset.map(tokenize, batched=True, batch_size=None)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/840 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

# **Step 4: Label mapping**

In [6]:
# Build the sentiment-name -> numeric-label mapping from the training split,
# pairing the 'sentiment' and 'label' columns row by row
train_split = dataset['train']
label2id = dict(zip(train_split['sentiment'], train_split['label']))

# Inverse mapping: numeric label -> sentiment name
id2label = {label_id: sentiment for sentiment, label_id in label2id.items()}


# **Step 5: Model setup**

In [7]:
# Number of distinct classes in the label mapping
num_labels = len(label2id)

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Attach the label mappings to the checkpoint's config, load the
# sequence-classification model with that config, then move it to the device
config = AutoConfig.from_pretrained(model_ckpt, id2label=id2label, label2id=label2id)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **Step 6: Training arguments setup**

In [8]:
# Hyperparameters and bookkeeping settings handed to the Trainer
batch_size = 64
training_dir = "bert_base_train_dir"

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir=training_dir,
    overwrite_output_dir=True,
    logging_dir='./logs',
    # Optimisation settings
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and save a checkpoint once per epoch
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Log on a step basis, at every single step
    logging_strategy='steps',
    logging_steps=1,
    # False leaves the Trainer's tqdm progress bar enabled
    disable_tqdm=False,
)



# **Step 7: Metrics computation**

In [9]:
# Accuracy metric from the `evaluate` library, used by compute_metrics_evaluate
accuracy = evaluate.load("accuracy")


def compute_metrics_evaluate(eval_pred):
    """Compute accuracy for an evaluation result.

    `eval_pred` is unpacked into (predictions, labels); the predicted class
    is the argmax of the predictions along axis 1.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

def compute_metrics(pred):
    """Return accuracy and weighted F1 for a prediction object.

    Reads `pred.label_ids` as the reference labels and takes the argmax over
    the last axis of `pred.predictions` as the predicted class ids.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

# **Step 8: Trainer setup and training**

In [10]:
# Bundle the model, training arguments, encoded splits and metric function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=emotion_encoded['train'],
    eval_dataset=emotion_encoded['validation'],  # validation split used for evaluation during training
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; as the cell's last expression the returned value is displayed
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5589,1.35136,0.533333,0.465215
2,0.9363,1.089012,0.65,0.571502
3,0.8154,0.853787,0.716667,0.693885
4,0.6993,0.682125,0.75,0.741676
5,0.518,0.581748,0.775,0.764141
6,0.2642,0.544,0.783333,0.773908
7,0.2162,0.462272,0.816667,0.814857
8,0.2846,0.45972,0.8,0.800192
9,0.1276,0.539304,0.783333,0.772808
10,0.102,0.474833,0.816667,0.815887


TrainOutput(global_step=280, training_loss=0.3503583537481193, metrics={'train_runtime': 630.2245, 'train_samples_per_second': 26.657, 'train_steps_per_second': 0.444, 'total_flos': 673417995868800.0, 'train_loss': 0.3503583537481193, 'epoch': 20.0})

# **Step 9: Save the trained model**


In [13]:
import os
from transformers import AutoModelForSequenceClassification, AutoTokenizer

def get_next_model_dir(base_dir):
    """
    Return the path of the next unused ``model_<N>`` directory inside ``base_dir``.

    Only sub-directories whose name is exactly ``model_`` followed by digits
    are counted, so entries such as ``model_final`` or ``model_`` are ignored
    instead of raising ``ValueError`` when converted with ``int()``.

    Args:
        base_dir: Existing directory that holds the numbered model folders.

    Returns:
        ``os.path.join(base_dir, "model_<N>")`` where N is one more than the
        highest existing number, or 1 when no numbered folder exists.
    """
    import re  # local import keeps this cell self-contained

    numbered_name = re.compile(r"model_(\d+)")

    highest = 0
    for entry in os.listdir(base_dir):
        match = numbered_name.fullmatch(entry)
        # Plain files that happen to be called model_<N> do not count
        if match and os.path.isdir(os.path.join(base_dir, entry)):
            highest = max(highest, int(match.group(1)))

    return os.path.join(base_dir, f"model_{highest + 1}")

def save_model_and_tokenizer(model, tokenizer, base_dir="models"):
    """
    Write the model and tokenizer into a fresh numbered folder under base_dir.

    The base directory is created when missing, the target folder is chosen
    by get_next_model_dir, and both objects are written with save_pretrained.
    """
    os.makedirs(base_dir, exist_ok=True)

    target_dir = get_next_model_dir(base_dir)

    # Model first, then tokenizer, both into the same folder
    for artifact in (model, tokenizer):
        artifact.save_pretrained(target_dir)

    print(f"Model and tokenizer saved in {target_dir}")


# Save the trained model and its tokenizer under the default "models" base directory
save_model_and_tokenizer(model, tokenizer)


Model and tokenizer saved in models/model_2


# **Step 10: Upload to Hugging Face**

In [14]:
!pip install huggingface_hub



In [16]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode the access token in the notebook: read it from the HF_TOKEN
# environment variable, or prompt for it without echoing when it is not set.
# NOTE(review): the token that was previously written in this cell is exposed
# to anyone who can read the notebook and must be revoked.
huggingface_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(token=huggingface_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [18]:
from huggingface_hub import HfApi

api = HfApi()

model_name = "5class_sentimentClassifier"  # Repository name on the Hugging Face Hub

# Create the repository; exist_ok=True lets this cell be re-run after the
# repository already exists instead of failing on the second run
api.create_repo(repo_id=model_name, exist_ok=True)

# Push the model, then the tokenizer, to that repository
model.push_to_hub(model_name)
tokenizer.push_to_hub(model_name)

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/BharathBOLT/5class_sentimentClassifier/commit/6606d3ec0af676697619caa300959a9341faabd0', commit_message='Upload tokenizer', commit_description='', oid='6606d3ec0af676697619caa300959a9341faabd0', pr_url=None, pr_revision=None, pr_num=None)