In [None]:
# Task 1: BERT-based Customer Feedback Sentiment Classification
# Install required libraries first (run in Kaggle notebook cell)
"""
!pip install transformers datasets torch scikit-learn matplotlib seaborn
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Set random seeds for reproducibility
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Args:
        seed: Integer seed handed to every generator. Defaults to 42.
    """
    # Function-local import so this helper does not depend on the file's
    # import block; code that draws from the stdlib ``random`` module is
    # then seeded together with NumPy and PyTorch.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Seed every visible CUDA device, not just the current one.
        torch.cuda.manual_seed_all(seed)

# Fix every random number generator before any data or model work happens
set_seed(42)

# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# ============================================
# STEP 1: Data Loading and Exploration
# ============================================

# Load the dataset
# For Kaggle: Add the dataset to your notebook first
# Path: /kaggle/input/customer-feedback-dataset/sentiment-analysis.csv
DATA_PATH = '/kaggle/input/customer-feedback-dataset/sentiment-analysis.csv'
df = pd.read_csv(DATA_PATH)

print("Dataset shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset info:")
# DataFrame.info() writes its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
print("\nColumn names:")
print(df.columns.tolist())

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# Display unique values in sentiment column (either capitalisation)
for sentiment_column in ('Sentiment', 'sentiment'):
    if sentiment_column in df.columns:
        print("\nSentiment distribution:")
        print(df[sentiment_column].value_counts())
        break

# ============================================
# STEP 2: Data Preprocessing
# ============================================

# Adjust column names based on actual dataset
# Common column names: ['text', 'sentiment'], ['feedback', 'sentiment'], ['review', 'label']
# Let's handle multiple possibilities

def preprocess_dataframe(df):
    """Return a two-column frame with standardized names ``text`` and ``sentiment``.

    Column names are compared case-insensitively and with surrounding
    whitespace removed. The first match from each candidate list below is
    used. The input frame is left untouched.

    Args:
        df: DataFrame containing one text-like and one sentiment-like column.

    Returns:
        A new DataFrame with exactly the columns ``text`` and ``sentiment``.

    Raises:
        ValueError: If no text column or no sentiment column can be found.
    """
    # rename() returns a new frame, so the caller's column names are not
    # modified. Non-string column names are passed through unchanged.
    normalized = df.rename(
        columns=lambda name: name.strip().lower() if isinstance(name, str) else name
    )

    # Candidate names, in priority order
    text_cols = ['text', 'feedback', 'review', 'comment', 'message', 'opinion']
    sentiment_cols = ['sentiment', 'label', 'rating', 'emotion']

    text_col = next((col for col in text_cols if col in normalized.columns), None)
    sentiment_col = next((col for col in sentiment_cols if col in normalized.columns), None)

    if text_col is None or sentiment_col is None:
        available = normalized.columns.tolist()
        print("Available columns:", available)
        raise ValueError(
            f"Could not identify text or sentiment columns (available: {available})"
        )

    # Create standardized dataframe
    return pd.DataFrame({
        'text': normalized[text_col],
        'sentiment': normalized[sentiment_col],
    })

# Standardize the columns, then discard repeated rows and any row that
# still has a missing value
df = (
    preprocess_dataframe(df)
    .drop_duplicates()
    .dropna()
)

# Clean text data
def clean_text(text):
    """Return ``text`` as a string with whitespace runs collapsed to single spaces.

    Non-string values are converted with ``str()`` first. Leading and
    trailing whitespace is removed.
    """
    as_string = text if isinstance(text, str) else str(text)
    # split() with no argument breaks on any whitespace run, so joining the
    # pieces with one space normalizes tabs, newlines and repeated blanks.
    tokens = as_string.strip().split()
    return ' '.join(tokens)

df['text'] = df['text'].apply(clean_text)

# Map sentiment labels to integers
unique_sentiments = df['sentiment'].unique()
print(f"\nUnique sentiments: {unique_sentiments}")

# Create label mapping.
# Every label value, string or numeric, gets an id from the contiguous range
# 0..num_labels-1. A classifier built with `num_labels` outputs can only be
# trained on targets in that range, so numeric labels such as 1..5 or -1/0/1
# must be remapped too instead of being used as-is.
labels_are_numeric = pd.api.types.is_numeric_dtype(df['sentiment'])

# .item() turns NumPy scalars into plain Python values so the mapping can
# later be written with json.dump.
raw_values = [value.item() if hasattr(value, 'item') else value for value in unique_sentiments]

if labels_are_numeric:
    ordered_values = sorted(raw_values)
    sentiment_names = [f"class_{value}" for value in ordered_values]
else:
    # key=str leaves the order of plain strings unchanged and avoids a
    # TypeError when the column mixes strings with other types.
    ordered_values = sorted(raw_values, key=str)
    sentiment_names = [str(value).lower() for value in ordered_values]

sentiment_mapping = {value: idx for idx, value in enumerate(ordered_values)}
df['label'] = df['sentiment'].map(sentiment_mapping).astype(int)

num_labels = len(sentiment_mapping)
print(f"\nNumber of classes: {num_labels}")
print(f"Label mapping: {sentiment_mapping}")

# Visualize class distribution
plt.figure(figsize=(10, 5))
df['label'].value_counts().plot(kind='bar')
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment Label')
plt.ylabel('Count')
plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig('sentiment_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nFinal dataset shape: {df.shape}")
print("\nClass distribution:")
print(df['label'].value_counts())
print("\nSample texts:")
print(df.head(10))

# Hand the result to the training cell: it looks for a DataFrame named
# `processed` and otherwise reads 'processed_sentiment.csv'.
processed = df[['text', 'label']].reset_index(drop=True)
processed.to_csv('processed_sentiment.csv', index=False)
print("\nSaved processed data to processed_sentiment.csv")

In [None]:
# === BERT training + evaluation cell ===
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

PRETRAINED = "bert-base-uncased"   # change if you prefer another model

# Load the tokenizer exactly once, through the AutoTokenizer class this cell
# imports. A failure here propagates with its full traceback.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED, use_fast=True)
print("Loaded tokenizer:", PRETRAINED)

from tqdm.auto import tqdm
import seaborn as sns

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device:", device)

# Load processed DataFrame if not in memory
try:
    processed
except NameError:
    # Prefer an in-memory `df` that already carries the needed columns;
    # only fall back to the CSV when no such frame exists.
    _in_memory = globals().get('df')
    if isinstance(_in_memory, pd.DataFrame) and {'text', 'label'}.issubset(_in_memory.columns):
        processed = _in_memory[['text', 'label']].reset_index(drop=True)
        print("Using in-memory DataFrame 'df' as processed data")
    else:
        processed = pd.read_csv('processed_sentiment.csv')
        print("Loaded processed_sentiment.csv")

# Ensure columns: text, label
# (explicit raise instead of assert, which is skipped under `python -O`)
missing_columns = [name for name in ('text', 'label') if name not in processed.columns]
if missing_columns:
    raise ValueError(
        f"processed must have 'text' and 'label' columns; missing: {missing_columns}"
    )

# --- Hyperparameters ---
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 3
LR = 2e-5
WEIGHT_DECAY = 0.01
SEED = 42

torch.manual_seed(SEED)
np.random.seed(SEED)

# The classifier is created with `num_labels` outputs, so the targets must be
# exactly the ids 0..num_labels-1. Fail early with a readable message.
num_labels = int(processed['label'].nunique())
observed_labels = set(processed['label'].unique().tolist())
if observed_labels != set(range(num_labels)):
    raise ValueError(
        f"'label' must contain exactly the ids 0..{num_labels - 1}; "
        f"found {sorted(observed_labels, key=str)}"
    )

# Train/Val split (stratify to keep class balance)
train_df, val_df = train_test_split(processed, test_size=0.15, stratify=processed['label'], random_state=SEED)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)
print(f"Train shape: {train_df.shape}, Val shape: {val_df.shape}, num_labels: {num_labels}")
# --- Dataset class ---
class FeedbackDataset(Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        texts: Texts to classify. Accepts anything with a ``tolist()`` method
            (pandas Series, NumPy array) as well as plain lists and tuples.
        labels: Class ids, one per text, in the same container kinds.
        tokenizer: Callable invoked with the text and the keyword arguments
            shown in ``__getitem__``. It must return a mapping of tensors.
        max_len: Length every sequence is truncated or padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = self._as_list(texts)
        self.labels = self._as_list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    @staticmethod
    def _as_list(values):
        """Return ``values`` as a list, using ``tolist()`` when it is available."""
        return values.tolist() if hasattr(values, 'tolist') else list(values)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text at ``idx`` plus its label under ``'labels'``."""
        text = str(self.texts[idx])
        label = int(self.labels[idx])
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        # Drop dimension 0 of each returned tensor when it has size 1, so the
        # DataLoader can add its own batch dimension when collating.
        item = {k: v.squeeze(0) for k, v in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Names used below that this cell does not import elsewhere; importing them
# here lets the cell run in a fresh kernel as well.
from torch.optim import AdamW
from transformers import BertForSequenceClassification, get_linear_schedule_with_warmup

# --- DataLoaders ---
train_dataset = FeedbackDataset(train_df['text'], train_df['label'], tokenizer, max_len=MAX_LEN)
val_dataset = FeedbackDataset(val_df['text'], val_df['label'], tokenizer, max_len=MAX_LEN)

# Pinned host memory only speeds up host-to-GPU copies, so request it only
# when training actually runs on CUDA.
use_pinned_memory = device.type == 'cuda'
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=use_pinned_memory)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=use_pinned_memory)

# --- Model ---
model = BertForSequenceClassification.from_pretrained(PRETRAINED, num_labels=num_labels)
model.to(device)

# --- Optimizer & Scheduler ---
optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# The scheduler is stepped once per batch, so the schedule length is the
# number of batches per epoch times the number of epochs.
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(0.06 * total_steps)  # small warmup
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

# --- Training loop with validation ---
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one training pass over ``dataloader``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``. Its output must expose ``.loss`` and ``.logits``.
        dataloader: Iterable of batches holding the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean loss, accuracy, macro F1)`` over all batches seen.
    """
    model.train()
    step_losses, predicted, expected = [], [], []
    progress = tqdm(dataloader, leave=False)
    for batch in progress:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = (
            batch[field].to(device)
            for field in ('input_ids', 'attention_mask', 'labels')
        )
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        outputs.loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        step_losses.append(outputs.loss.item())
        predicted.extend(outputs.logits.argmax(dim=1).detach().cpu().numpy().tolist())
        expected.extend(labels.detach().cpu().numpy().tolist())
        progress.set_description(f"Train loss: {np.mean(step_losses):.4f}")
    accuracy = accuracy_score(expected, predicted)
    macro_f1 = f1_score(expected, predicted, average='macro')
    return np.mean(step_losses), accuracy, macro_f1

def eval_epoch(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` with gradient tracking disabled.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``labels``. Its output must expose ``.loss`` and ``.logits``.
        dataloader: Iterable of batches holding the keys ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(mean loss, accuracy, macro F1, targets, predictions)``.
        The last two are plain lists covering every evaluated example.
    """
    model.eval()
    batch_losses, predicted, expected = [], [], []
    with torch.no_grad():
        progress = tqdm(dataloader, leave=False)
        for batch in progress:
            input_ids, attention_mask, labels = (
                batch[field].to(device)
                for field in ('input_ids', 'attention_mask', 'labels')
            )
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            batch_losses.append(outputs.loss.item())
            predicted.extend(outputs.logits.argmax(dim=1).detach().cpu().numpy().tolist())
            expected.extend(labels.detach().cpu().numpy().tolist())
            progress.set_description(f"Val loss: {np.mean(batch_losses):.4f}")
    accuracy = accuracy_score(expected, predicted)
    macro_f1 = f1_score(expected, predicted, average='macro')
    return np.mean(batch_losses), accuracy, macro_f1, expected, predicted

# One list per (split, metric) pair, filled with one value per epoch
history = {
    f"{split}_{metric}": []
    for split in ('train', 'val')
    for metric in ('loss', 'acc', 'f1')
}

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    train_loss, train_acc, train_f1 = train_epoch(model, train_loader, optimizer, scheduler, device)
    val_loss, val_acc, val_f1, val_targets, val_preds = eval_epoch(model, val_loader, device)
    print(f"Train loss: {train_loss:.4f} | acc: {train_acc:.4f} | f1: {train_f1:.4f}")
    print(f"Val   loss: {val_loss:.4f} | acc: {val_acc:.4f} | f1: {val_f1:.4f}")
    epoch_metrics = {
        'train_loss': train_loss,
        'train_acc': train_acc,
        'train_f1': train_f1,
        'val_loss': val_loss,
        'val_acc': val_acc,
        'val_f1': val_f1,
    }
    for metric_name, metric_value in epoch_metrics.items():
        history[metric_name].append(metric_value)

# --- Final evaluation & reporting ---
# val_targets / val_preds hold the results of the last completed epoch
print("\n=== Final Validation Report ===")
print(classification_report(val_targets, val_preds, digits=4))

cm = confusion_matrix(val_targets, val_preds)
plt.figure(figsize=(6,5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix (Validation)')
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=300)
plt.show()

# Plot training curves: loss on the left panel, macro F1 on the right
plt.figure(figsize=(10,4))
for panel_index, (panel_title, metric) in enumerate((('Loss', 'loss'), ('Macro F1', 'f1')), start=1):
    plt.subplot(1, 2, panel_index)
    for split in ('train', 'val'):
        series_name = f"{split}_{metric}"
        plt.plot(history[series_name], label=series_name)
    plt.title(panel_title)
    plt.legend()
plt.tight_layout()
plt.savefig('training_curves.png', dpi=300)
plt.show()

# Save model & tokenizer
out_dir = "bert_sentiment_model"
os.makedirs(out_dir, exist_ok=True)
model.save_pretrained(out_dir)
tokenizer.save_pretrained(out_dir)
print(f"Saved model and tokenizer to {out_dir}")

# Save label mapping (if you created textual mapping)
if 'sentiment_mapping' in globals():
    import json
    label_map_path = os.path.join(out_dir, 'label_map.json')
    with open(label_map_path, 'w') as f:
        json.dump(sentiment_mapping, f)
    print("Saved label mapping to label_map.json")

print("Done.")
