In [4]:
# 📦 Install libraries (uncomment if needed)
# !pip install transformers scikit-learn pandas torch

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm import tqdm

# ✅ Step 1: Load your dataset
df = pd.read_csv("sentiment_analysis.csv")
print("🟢 Loaded dataset with columns:", df.columns)

# ✅ Step 2: Encode sentiment labels (e.g., positive → 0, negative → 1)
label_map = {label: i for i, label in enumerate(df['sentiment'].unique())}
df['label'] = df['sentiment'].map(label_map)

# ✅ Step 3: Split into train and validation
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

# ✅ Step 4: Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# ✅ Step 5: Dataset class
class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# ✅ Step 6: Create datasets and loaders
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# ✅ Step 7: Model initialization
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_map)
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# ✅ Step 8: Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 50
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# ✅ Step 9: Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}")

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.set_postfix(loss=loss.item())

    print(f"✅ Epoch {epoch+1} complete. Avg Loss: {total_loss / len(train_loader):.4f}")

# ✅ Step 10: Save model and tokenizer
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")
print("✅ Model training complete and saved to ./sentiment_model")


🟢 Loaded dataset with columns: Index(['Year', 'Month', 'Day', 'Time of Tweet', 'text', 'sentiment',
       'Platform'],
      dtype='object')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 25/25 [00:03<00:00,  6.52it/s, loss=0.829]


✅ Epoch 1 complete. Avg Loss: 0.9633


Epoch 2: 100%|██████████| 25/25 [00:03<00:00,  6.61it/s, loss=0.601]


✅ Epoch 2 complete. Avg Loss: 0.5591


Epoch 3: 100%|██████████| 25/25 [00:03<00:00,  6.56it/s, loss=0.0559]


✅ Epoch 3 complete. Avg Loss: 0.2166


Epoch 4: 100%|██████████| 25/25 [00:03<00:00,  6.53it/s, loss=0.0203]


✅ Epoch 4 complete. Avg Loss: 0.0754


Epoch 5: 100%|██████████| 25/25 [00:03<00:00,  6.51it/s, loss=0.0109]


✅ Epoch 5 complete. Avg Loss: 0.0322


Epoch 6: 100%|██████████| 25/25 [00:03<00:00,  6.44it/s, loss=0.00649]


✅ Epoch 6 complete. Avg Loss: 0.0176


Epoch 7: 100%|██████████| 25/25 [00:03<00:00,  6.42it/s, loss=0.00417]


✅ Epoch 7 complete. Avg Loss: 0.0111


Epoch 8: 100%|██████████| 25/25 [00:03<00:00,  6.39it/s, loss=0.097]


✅ Epoch 8 complete. Avg Loss: 0.0163


Epoch 9: 100%|██████████| 25/25 [00:03<00:00,  6.26it/s, loss=0.00315]


✅ Epoch 9 complete. Avg Loss: 0.0100


Epoch 10: 100%|██████████| 25/25 [00:04<00:00,  6.25it/s, loss=0.00238]


✅ Epoch 10 complete. Avg Loss: 0.0068


Epoch 11: 100%|██████████| 25/25 [00:04<00:00,  6.18it/s, loss=0.00193]


✅ Epoch 11 complete. Avg Loss: 0.0088


Epoch 12: 100%|██████████| 25/25 [00:04<00:00,  6.16it/s, loss=0.0302]


✅ Epoch 12 complete. Avg Loss: 0.0643


Epoch 13: 100%|██████████| 25/25 [00:04<00:00,  6.17it/s, loss=0.00317]


✅ Epoch 13 complete. Avg Loss: 0.0381


Epoch 14: 100%|██████████| 25/25 [00:04<00:00,  6.16it/s, loss=0.00268]


✅ Epoch 14 complete. Avg Loss: 0.0098


Epoch 15: 100%|██████████| 25/25 [00:04<00:00,  6.14it/s, loss=0.0028]


✅ Epoch 15 complete. Avg Loss: 0.0096


Epoch 16: 100%|██████████| 25/25 [00:04<00:00,  6.19it/s, loss=0.0021]


✅ Epoch 16 complete. Avg Loss: 0.0052


Epoch 17: 100%|██████████| 25/25 [00:04<00:00,  6.18it/s, loss=0.00155]


✅ Epoch 17 complete. Avg Loss: 0.0063


Epoch 18: 100%|██████████| 25/25 [00:04<00:00,  6.16it/s, loss=0.00258]


✅ Epoch 18 complete. Avg Loss: 0.0088


Epoch 19: 100%|██████████| 25/25 [00:04<00:00,  6.22it/s, loss=0.00188]


✅ Epoch 19 complete. Avg Loss: 0.0067


Epoch 20: 100%|██████████| 25/25 [00:03<00:00,  6.27it/s, loss=0.00133]


✅ Epoch 20 complete. Avg Loss: 0.0050


Epoch 21: 100%|██████████| 25/25 [00:03<00:00,  6.27it/s, loss=0.00113]


✅ Epoch 21 complete. Avg Loss: 0.0062


Epoch 22: 100%|██████████| 25/25 [00:03<00:00,  6.31it/s, loss=0.0468]


✅ Epoch 22 complete. Avg Loss: 0.0043


Epoch 23: 100%|██████████| 25/25 [00:03<00:00,  6.36it/s, loss=0.00112]


✅ Epoch 23 complete. Avg Loss: 0.0051


Epoch 24: 100%|██████████| 25/25 [00:03<00:00,  6.35it/s, loss=0.000991]


✅ Epoch 24 complete. Avg Loss: 0.0047


Epoch 25: 100%|██████████| 25/25 [00:03<00:00,  6.42it/s, loss=0.000924]


✅ Epoch 25 complete. Avg Loss: 0.0062


Epoch 26: 100%|██████████| 25/25 [00:03<00:00,  6.39it/s, loss=0.000899]


✅ Epoch 26 complete. Avg Loss: 0.0042


Epoch 27: 100%|██████████| 25/25 [00:03<00:00,  6.39it/s, loss=0.000722]


✅ Epoch 27 complete. Avg Loss: 0.0033


Epoch 28: 100%|██████████| 25/25 [00:03<00:00,  6.40it/s, loss=0.00073]


✅ Epoch 28 complete. Avg Loss: 0.0064


Epoch 29: 100%|██████████| 25/25 [00:03<00:00,  6.39it/s, loss=0.000794]


✅ Epoch 29 complete. Avg Loss: 0.0053


Epoch 30: 100%|██████████| 25/25 [00:03<00:00,  6.39it/s, loss=0.000926]


✅ Epoch 30 complete. Avg Loss: 0.0062


Epoch 31: 100%|██████████| 25/25 [00:03<00:00,  6.38it/s, loss=0.000833]


✅ Epoch 31 complete. Avg Loss: 0.0051


Epoch 32: 100%|██████████| 25/25 [00:03<00:00,  6.37it/s, loss=0.00443]


✅ Epoch 32 complete. Avg Loss: 0.0175


Epoch 33: 100%|██████████| 25/25 [00:03<00:00,  6.27it/s, loss=0.00365]


✅ Epoch 33 complete. Avg Loss: 0.0130


Epoch 34: 100%|██████████| 25/25 [00:03<00:00,  6.31it/s, loss=0.000755]


✅ Epoch 34 complete. Avg Loss: 0.0054


Epoch 35: 100%|██████████| 25/25 [00:03<00:00,  6.29it/s, loss=0.000727]


✅ Epoch 35 complete. Avg Loss: 0.0047


Epoch 36: 100%|██████████| 25/25 [00:03<00:00,  6.26it/s, loss=0.000952]


✅ Epoch 36 complete. Avg Loss: 0.0039


Epoch 37: 100%|██████████| 25/25 [00:03<00:00,  6.25it/s, loss=0.00071]


✅ Epoch 37 complete. Avg Loss: 0.0054


Epoch 38: 100%|██████████| 25/25 [00:04<00:00,  6.24it/s, loss=0.000613]


✅ Epoch 38 complete. Avg Loss: 0.0048


Epoch 39: 100%|██████████| 25/25 [00:04<00:00,  6.20it/s, loss=0.000775]


✅ Epoch 39 complete. Avg Loss: 0.0042


Epoch 40: 100%|██████████| 25/25 [00:03<00:00,  6.25it/s, loss=0.000706]


✅ Epoch 40 complete. Avg Loss: 0.0046


Epoch 41: 100%|██████████| 25/25 [00:04<00:00,  6.24it/s, loss=0.000591]


✅ Epoch 41 complete. Avg Loss: 0.0058


Epoch 42: 100%|██████████| 25/25 [00:04<00:00,  6.23it/s, loss=0.000539]


✅ Epoch 42 complete. Avg Loss: 0.0048


Epoch 43: 100%|██████████| 25/25 [00:03<00:00,  6.25it/s, loss=0.000641]


✅ Epoch 43 complete. Avg Loss: 0.0040


Epoch 44: 100%|██████████| 25/25 [00:03<00:00,  6.26it/s, loss=0.000666]


✅ Epoch 44 complete. Avg Loss: 0.0046


Epoch 45: 100%|██████████| 25/25 [00:04<00:00,  6.22it/s, loss=0.000611]


✅ Epoch 45 complete. Avg Loss: 0.0146


Epoch 46: 100%|██████████| 25/25 [00:03<00:00,  6.29it/s, loss=0.000711]


✅ Epoch 46 complete. Avg Loss: 0.0053


Epoch 47: 100%|██████████| 25/25 [00:03<00:00,  6.29it/s, loss=0.000848]


✅ Epoch 47 complete. Avg Loss: 0.0043


Epoch 48: 100%|██████████| 25/25 [00:03<00:00,  6.30it/s, loss=0.00062]


✅ Epoch 48 complete. Avg Loss: 0.0054


Epoch 49: 100%|██████████| 25/25 [00:03<00:00,  6.28it/s, loss=0.000571]


✅ Epoch 49 complete. Avg Loss: 0.0050


Epoch 50: 100%|██████████| 25/25 [00:03<00:00,  6.30it/s, loss=0.0518]


✅ Epoch 50 complete. Avg Loss: 0.0047
✅ Model training complete and saved to ./sentiment_model


In [8]:
# ✅ Step 10: Save model and tokenizer
model.save_pretrained("./sentiment_model")
tokenizer.save_pretrained("./sentiment_model")
print("✅ Model training complete and saved to ./sentiment_model")

✅ Model training complete and saved to ./sentiment_model


In [9]:
# ✅ Step 11: Load the trained model and tokenizer
loaded_model = BertForSequenceClassification.from_pretrained("./sentiment_model")
loaded_tokenizer = BertTokenizer.from_pretrained("./sentiment_model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)


print("✅ Model and tokenizer loaded successfully.")

# ✅ Step 14: Test with random text
def predict_sentiment(text):
    inputs = loaded_tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = loaded_model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

    # Get the sentiment label back from the mapped index
    # Reverse the label_map to get label name from index
    reverse_label_map = {i: label for label, i in label_map.items()}
    predicted_label = reverse_label_map[predictions.item()]
    return predicted_label

# Example usage:
test_text = "This is a great product! I love it."
sentiment = predict_sentiment(test_text)
print(f"The sentiment of the text '{test_text}' is: {sentiment}")

test_text_2 = "I am not happy with the service."
sentiment_2 = predict_sentiment(test_text_2)
print(f"The sentiment of the text '{test_text_2}' is: {sentiment_2}")

test_text_3 = "It is what it is."
sentiment_3 = predict_sentiment(test_text_3)
print(f"The sentiment of the text '{test_text_3}' is: {sentiment_3}")

✅ Model and tokenizer loaded successfully.
The sentiment of the text 'This is a great product! I love it.' is: positive
The sentiment of the text 'I am not happy with the service.' is: negative
The sentiment of the text 'It is what it is.' is: neutral


In [14]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ✅ Step 12: Evaluation loop
loaded_model.eval()
val_labels_list = []
val_preds_list = []

with torch.no_grad():
    for batch in val_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = loaded_model(**batch)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        val_labels_list.extend(batch['labels'].cpu().numpy())
        val_preds_list.extend(predictions.cpu().numpy())

# ✅ Step 13: Calculate and print metrics
accuracy = accuracy_score(val_labels_list, val_preds_list)
precision, recall, f1, _ = precision_recall_fscore_support(val_labels_list, val_preds_list, average='weighted')

print(f"✅ Validation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

✅ Validation Metrics:
Accuracy: 0.8600
Precision: 0.8645
Recall: 0.8600
F1 Score: 0.8614


In [12]:
from google.colab import drive
import shutil
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define the source and destination paths
source_dir = "/content/sentiment_model"
destination_dir = "/content/drive/MyDrive/model" # You can change the destination folder name

# Create the destination directory if it doesn't exist
if not os.path.exists(destination_dir):
    os.makedirs(destination_dir)

# Copy the model directory to Google Drive
try:
    shutil.copytree(source_dir, destination_dir)
    print(f"✅ Model saved successfully to {destination_dir}")
except FileExistsError:
    print(f"Directory '{destination_dir}' already exists. Model may have been saved previously.")
except Exception as e:
    print(f"❌ Error saving model: {e}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Directory '/content/drive/MyDrive/model' already exists. Model may have been saved previously.
