<a href="https://colab.research.google.com/github/Bareeraq/sentiment-analysis-tool/blob/main/Sentiment_analysis_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Sanity-check the GPU runtime before doing any work.
import torch
print(torch.version.cuda)  # CUDA version reported by this PyTorch build
print(torch.cuda.is_available())  # True when PyTorch can use a CUDA device
print(torch.cuda.device_count())  # Number of CUDA devices PyTorch reports

12.6
True
1


In [2]:
import os
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 is meant to make CUDA kernel launches
# synchronous so errors are reported at the call that caused them.
# NOTE(review): this is set after the previous cell already queried CUDA and it
# presumably slows GPU work — confirm it is still wanted for a full training run.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install datasets



In [4]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install transformers



In [5]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install huggingface_hub



In [6]:
# git-lfs is installed for the Hub upload step; -y answers apt's confirmation
# prompt so the cell cannot stall on a runtime where the package is missing.
!apt-get install -y git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


# **PREPROCESSING**

In [7]:
from datasets import load_dataset

# Fetch the sentiment dataset by its Hub identifier; the result is keyed by
# split name (train / validation / test).
ds = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset")

# Show the available splits with their columns and row counts.
print(ds)

README.md: 0.00B [00:00, ?B/s]

train_df.csv: 0.00B [00:00, ?B/s]

val_df.csv: 0.00B [00:00, ?B/s]

test_df.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/31232 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5205 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5206 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 31232
    })
    validation: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 5205
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'sentiment'],
        num_rows: 5206
    })
})


In [8]:
from datasets import DatasetDict
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Pull the three ready-made splits out of the DatasetDict in one step
train_dataset, validation_dataset, test_dataset = (
    ds[split_name] for split_name in ("train", "validation", "test")
)

# **TOKENIZE THE DATASET**

In [9]:
# Load the tokenizer that belongs to the distilbert-base-uncased checkpoint
# (AutoTokenizer is already imported in an earlier cell).
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [10]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of one example or of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry that is a single string,
            ``None``, or a list of values (one per row of the batch).

    Returns:
        The tokenizer output for the texts, truncated and padded to 512
        tokens. A lone string (or ``None``) is wrapped in a one-element list
        first, so its fields come back nested one level deep.
    """
    texts = examples["text"]
    if texts is None or isinstance(texts, str):
        # Un-batched call: wrap the lone value so the tokenizer gets a list.
        texts = [texts]

    # Replace missing / non-string entries with "" instead of dropping them.
    # Dropping would make the output shorter than the input batch, so the
    # tokens would no longer line up with the other columns (labels, ids).
    texts = [t if isinstance(t, str) else "" for t in texts]

    # NOTE(review): every row is padded to the full 512 tokens here, which
    # leaves the DataCollatorWithPadding created later nothing to do —
    # consider dropping `padding="max_length"` to shorten training.
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

# Tokenize every split. Train and validation are processed in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
validation_dataset = validation_dataset.map(tokenize_function, batched=True)
# NOTE(review): the test split is mapped WITHOUT batched=True, so
# tokenize_function receives one string per call and stores input_ids /
# attention_mask as one-element nested lists (unwrapped later by
# flatten_nested). batch_size and drop_last_batch appear to have no effect
# without batched=True — confirm against the datasets documentation.
# Do not simply add batched=True here: drop_last_batch=True would then discard
# the final partial batch of test rows, and flatten_nested expects the nested
# layout.
test_dataset = test_dataset.map(tokenize_function, batch_size=1000, drop_last_batch=True)

Map:   0%|          | 0/31232 [00:00<?, ? examples/s]

Map:   0%|          | 0/5205 [00:00<?, ? examples/s]

Map:   0%|          | 0/5206 [00:00<?, ? examples/s]

In [11]:
# Confirm that each split gained the tokenizer columns
for tokenized_split in (train_dataset, validation_dataset, test_dataset):
    print(tokenized_split.column_names)

['id', 'text', 'label', 'sentiment', 'input_ids', 'attention_mask']
['id', 'text', 'label', 'sentiment', 'input_ids', 'attention_mask']
['id', 'text', 'label', 'sentiment', 'input_ids', 'attention_mask']


In [12]:
from transformers import DataCollatorWithPadding
# Collator that assembles individual examples into padded batches using the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the pretrained DistilBERT checkpoint with a 3-way classification head.
# The head weights are newly initialised (see the warning below), so the model
# must be fine-tuned before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)  # Our dataset contains 3 classes

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# **TRAINING THE MODEL**

In [13]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


### COMPUTE METRICS FUNCTION

In [14]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install scikit-learn



In [15]:
# Record the scikit-learn version used for the metric computations below
import sklearn
print(sklearn.__version__)

1.6.1


In [16]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from evaluate import load

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each row
            is the argmax of ``logits`` over the last axis.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Accuracy is well defined even when the references contain a single
    # class, so it is always computed rather than reported as 0.0.
    accuracy = float(accuracy_score(labels, predictions))

    # Macro F1 is the unweighted mean of the per-class F1 scores.
    # zero_division=0 scores a class with no predicted/true samples as 0
    # without emitting a warning.
    try:
        f1 = float(f1_score(labels, predictions, average="macro", zero_division=0))
    except Exception as e:
        # Best effort: report the problem but keep the evaluation loop alive.
        print(f"F1 computation error: {e}")
        f1 = 0.0  # Fallback value

    return {"accuracy": accuracy, "f1": f1}

In [17]:
# %pip (not !pip) installs into the environment of the running kernel.
%pip install wandb



In [18]:
# Complete secure training setup
from google.colab import userdata
import os
import wandb

In [19]:
# Training configuration

# Local checkpoint directory; with push_to_hub=True this name is presumably
# also used for the Hub repository — verify before publishing.
repo_name = "Sentiment-analysis-tool"

training_args = TrainingArguments(
    # Run bookkeeping and logging
    run_name="sentiment_analysis",
    output_dir=repo_name,
    logging_dir="./logs",
    logging_steps=10,
    # Optimisation settings
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, limiting stored checkpoints to two
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    # Upload the results to the Hugging Face Hub
    push_to_hub=True,
)

In [20]:
# Initialise Weights & Biases logging; fall back to running without it.
try:
    # The API key comes from the Colab secret store and is never hard-coded.
    os.environ['WANDB_API_KEY'] = userdata.get('WANDB_API_KEY')
    wandb.login()
    wandb.init(
        project="sentiment-analysis-tool",
        name="sentiment-analysis",
        config={
            "model": "bareeraqrsh/Sentiment-analysis-tool",
            "batch_size": training_args.per_device_train_batch_size,
            "learning_rate": training_args.learning_rate,
            "epochs": training_args.num_train_epochs
        }
    )
except Exception as exc:
    # Catch Exception rather than using a bare `except:` so that interrupting
    # the cell is not swallowed. Only the exception type is printed, to keep
    # anything credential-related out of the saved cell output.
    print("WandB disabled - proceeding without logging")
    print(f"Reason: {type(exc).__name__}")
    os.environ['WANDB_DISABLED'] = 'true'

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Currently logged in as: [33mbareera-qureishi[0m ([33mbareera-qureishi-personal[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [21]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    # `tokenizer=` is deprecated in recent transformers releases (this call
    # emitted a warning); the tokenizer is now passed as `processing_class`.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [22]:
# Inspect the tokenized test split. The printed example shows that input_ids
# is wrapped in an extra list level for this split.
print(test_dataset.column_names)
print(test_dataset[0])  # Look at one example

['id', 'text', 'label', 'sentiment', 'input_ids', 'attention_mask']
{'id': 9235, 'text': 'getting cds ready for tour', 'label': 1, 'sentiment': 'neutral', 'input_ids': [[101, 2893, 14340, 3201, 2005, 2778, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [23]:
def flatten_nested(example):
    """Unwrap one level of nesting from a row's tokenizer fields.

    Rows tokenized one at a time carry ``input_ids`` and ``attention_mask``
    as a one-element list that holds the real sequence. This replaces each
    such field with the inner sequence.

    Args:
        example: Row mapping with ``"input_ids"`` and ``"attention_mask"``.

    Returns:
        The same mapping, with both fields flattened when they were nested.
        Fields that are already flat are left untouched, so applying the
        function more than once is harmless.
    """
    for key in ("input_ids", "attention_mask"):
        value = example[key]
        # Unwrap only a genuine [[...]] wrapper. Indexing [0] on an already
        # flat list would replace the whole sequence with its first token.
        if len(value) == 1 and isinstance(value[0], (list, tuple)):
            example[key] = value[0]
    return example

# Apply the unwrapping row by row so the test split's input_ids and
# attention_mask are no longer wrapped in an extra list level.
test_dataset = test_dataset.map(flatten_nested)

Map:   0%|          | 0/5206 [00:00<?, ? examples/s]

In [24]:
# Fine-tune the model; the validation split is evaluated at the end of every
# epoch (eval_strategy="epoch" in the training arguments).
print("Starting training...")
trainer.train()

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5476,0.579515,0.758117,0.760073
2,0.5589,0.584073,0.761191,0.764025
3,0.3174,0.635443,0.760615,0.763309


TrainOutput(global_step=5856, training_loss=0.5023157529003633, metrics={'train_runtime': 4531.951, 'train_samples_per_second': 20.675, 'train_steps_per_second': 1.292, 'total_flos': 1.2411886728904704e+16, 'train_loss': 0.5023157529003633, 'epoch': 3.0})

In [25]:
# Evaluate the fine-tuned model on the held-out test split and print the
# resulting metrics (loss, accuracy, F1 and runtime statistics).
results = trainer.evaluate(test_dataset)
print("Test results:", results)

Test results: {'eval_loss': 0.6251811385154724, 'eval_accuracy': 0.7685363042643104, 'eval_f1': 0.771479143347921, 'eval_runtime': 77.9535, 'eval_samples_per_second': 66.783, 'eval_steps_per_second': 4.182, 'epoch': 3.0}


In [26]:
# Close the WandB run unless logging was switched off earlier
wandb_enabled = os.environ.get('WANDB_DISABLED') is None
if wandb_enabled:
    wandb.finish()

print("Training completed!")

0,1
eval/accuracy,▁▃▃█
eval/f1,▁▃▃█
eval/loss,▁▂█▇
eval/runtime,▂▃▁█
eval/samples_per_second,▇▅█▁
eval/steps_per_second,▇▅█▁
train/epoch,▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇▇▇██
train/global_step,▁▁▁▁▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇██
train/grad_norm,▆▇▃▅▄▃▅▂▄▄▃▃▄▁▃▂▅▃▄▅▆▅▆▃▃▅▂▃▂▄▄█▃▅▆▄▂▄▆▂
train/learning_rate,██████▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▄▄▄▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▁

0,1
eval/accuracy,0.76854
eval/f1,0.77148
eval/loss,0.62518
eval/runtime,77.9535
eval/samples_per_second,66.783
eval/steps_per_second,4.182
total_flos,1.2411886728904704e+16
train/epoch,3
train/global_step,5856
train/grad_norm,8.3236


Training completed!
