# Sentiment Analysis Model Training

**Objetivo:** Entrenar un modelo de análisis de sentimientos usando el dataset Sentiment140 (1.6M tweets)

**Workflow:**
1. Cargar dataset de Kaggle (Sentiment140)
2. Preprocesar y dividir datos
3. Entrenar modelo DistilBERT
4. Evaluar performance
5. Guardar modelo para uso local

**Modelo resultante:**
- Acepta uno o múltiples textos
- Funciona con tweets, reviews o cualquier texto corto
- Listo para inferencia rápida en producción

## Setup and Imports

In [None]:
# Check Internet Connection
import urllib.request

def check_internet(url='https://huggingface.co', timeout=3):
    """Report whether ``url`` can be opened within ``timeout`` seconds.

    Prints the outcome (plus instructions for enabling internet in Kaggle
    on failure) and returns it so the caller can decide how to continue.

    Args:
        url: Address to probe. Defaults to the Hugging Face site.
        timeout: Seconds to wait before giving up.

    Returns:
        True if the URL opened without raising, False otherwise.
    """
    try:
        # The context manager closes the response so the probe does not
        # leave a connection open.
        with urllib.request.urlopen(url, timeout=timeout):
            pass
    except Exception as exc:
        # Best-effort probe: any ordinary failure means "offline". Unlike a
        # bare ``except``, this lets KeyboardInterrupt/SystemExit propagate.
        print("Internet connection: FAILED")
        print(f"  Reason: {exc}")
        print("\nIMPORTANT: Enable Internet in Kaggle!")
        print("1. Settings (top right) -> Internet -> ON")
        print("2. Save & Run All")
        return False

    print("Internet connection: OK")
    return True

# Run the connectivity probe once and keep its result in has_internet.
has_internet = check_internet()

# The notebook is not aborted here; the user is only told what to do.
if not has_internet:
    for notice in (
        "\nCannot proceed without internet to download models.",
        "Please enable internet and restart the notebook.",
    ):
        print(notice)

In [None]:
# Download tokenizer files manually to avoid template search bug
import requests
import os
import json

print("Downloading tokenizer files manually...")

# Create local directory
tokenizer_dir = "/kaggle/working/distilbert_tokenizer"
os.makedirs(tokenizer_dir, exist_ok=True)

# Files to download from HuggingFace (file name -> download URL)
base_url = "https://huggingface.co/distilbert-base-uncased/resolve/main"
files = {
    name: f"{base_url}/{name}"
    for name in (
        "tokenizer_config.json",
        "vocab.txt",
        "tokenizer.json",
        "special_tokens_map.json",
        "config.json",
    )
}

# Names of files that could not be fetched or written, so the final message
# reflects what actually happened instead of always claiming success.
failed_files = []

for filename, url in files.items():
    filepath = os.path.join(tokenizer_dir, filename)
    try:
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print(f"[OK] {filename}")
        else:
            print(f"[ERROR] {filename} - Status {response.status_code}")
            failed_files.append(filename)
    except (requests.RequestException, OSError) as e:
        # Network failure or a problem writing the file: keep going with
        # the remaining files (the download stays best-effort).
        print(f"[ERROR] {filename} - Error: {e}")
        failed_files.append(filename)

if failed_files:
    print(f"\nWARNING: {len(failed_files)} file(s) were not downloaded: {', '.join(failed_files)}")
    print(f"Tokenizer directory may be incomplete: {tokenizer_dir}")
else:
    print(f"\nTokenizer files ready at: {tokenizer_dir}")

In [None]:
# Early import of torch to check GPU
import torch

# Print the installed PyTorch version and whether a CUDA device is visible.
cuda_ok = torch.cuda.is_available()

print("PyTorch check:")
print(f"  Version: {torch.__version__}")
print(f"  CUDA available: {cuda_ok}")
if not cuda_ok:
    print("  WARNING: No GPU detected!")
else:
    # Device 0 is the one named; the count covers all visible devices.
    print(f"  GPU: {torch.cuda.get_device_name(0)}")
    print(f"  GPU Count: {torch.cuda.device_count()}")

In [None]:
# Standard libraries
import os
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime

# ML & NLP
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    DistilBertTokenizerFast,  # Use specific tokenizer to avoid template search
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Plot configuration: the matplotlib style is applied first and the seaborn
# palette second, in that order.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Confirmation banner shown once the imports above have run.
for banner_line in (
    "All imports successful!",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
):
    print(banner_line)

## Configuration

In [None]:
# --- Model hyperparameters -------------------------------------------------
MODEL_NAME = "distilbert-base-uncased"
MODEL_VERSION = "v1.0"
MAX_LENGTH = 128        # tokens per text (tweets / short texts)
BATCH_SIZE = 64         # raised from 32 for faster training
LEARNING_RATE = 2e-5
EPOCHS = 3
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

# --- Data split fractions ---------------------------------------------------
TRAIN_SIZE = 0.8
VAL_SIZE = 0.1
TEST_SIZE = 0.1

# Number of rows to sample; None means the full 1.6M-row dataset is used.
SAMPLE_SIZE = 100000

# --- Output location --------------------------------------------------------
OUTPUT_DIR = "./shameless_sentiment_model"

# Build the summary first, then print it line by line.
_rule = "=" * 50
_summary = [
    _rule,
    "CONFIGURATION",
    _rule,
    f"Model: {MODEL_NAME}",
    f"Device: {DEVICE}",
]
if torch.cuda.is_available():
    _summary.append(f"GPU: {torch.cuda.get_device_name(0)}")
_summary.extend([
    f"Batch Size: {BATCH_SIZE} (optimized for speed)",
    f"Epochs: {EPOCHS}",
    f"Sample Size: {SAMPLE_SIZE if SAMPLE_SIZE else 'Full dataset'}",
    f"FP16: {torch.cuda.is_available()}",
    _rule,
])
for _line in _summary:
    print(_line)

## Step 1: Load Dataset

**Dataset:** Sentiment140 (1.6M tweets)
- URL: https://www.kaggle.com/datasets/kazanova/sentiment140
- Add this dataset to your notebook in Kaggle before running

Este dataset contiene 1.6 millones de tweets etiquetados automáticamente:
- 0 = Negative sentiment
- 4 = Positive sentiment (convertiremos a 1)

In [None]:
# Load Sentiment140 dataset from Kaggle
# Dataset: https://www.kaggle.com/datasets/kazanova/sentiment140

print("Loading Sentiment140 dataset...")

# Walk /kaggle/input once: list every file for the user and remember the
# CSVs so the fallback below does not need a second directory scan.
print("\nAvailable input files:")
available_csvs = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)
        if filename.endswith('.csv'):
            available_csvs.append(full_path)

# Try to find the correct path (checked in this order)
possible_paths = [
    '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',
    '/kaggle/input/sentiment140/testdata.manual.2009.06.14.csv',
    '/kaggle/input/sentiment140/sentiment140.csv',
]

csv_path = next((path for path in possible_paths if os.path.exists(path)), None)
if csv_path is not None:
    print(f"\nFound dataset at: {csv_path}")
elif available_csvs:
    # If not found in expected paths, use the first CSV found
    csv_path = available_csvs[0]
    print(f"\nUsing dataset: {csv_path}")
else:
    raise FileNotFoundError("No CSV file found in /kaggle/input/. Please add the Sentiment140 dataset.")

# Load the CSV (the file has no header row; column names are supplied here)
df = pd.read_csv(csv_path,
                 encoding='latin-1',
                 header=None,
                 names=['sentiment', 'id', 'date', 'query', 'user', 'text'])

# The classifier is built with two classes, so only labels that map to
# 0 or 1 are usable: 0 (negative), 4 (positive), or an already-binary 1.
# Any other value would be an invalid class index during training.
valid_rows = df['sentiment'].isin([0, 1, 4])
dropped_rows = int((~valid_rows).sum())
if dropped_rows:
    print(f"\nWARNING: dropping {dropped_rows:,} rows whose label is not 0, 1 or 4")

# Keep only text and sentiment
df = df.loc[valid_rows, ['text', 'sentiment']].reset_index(drop=True)
if df.empty:
    raise ValueError(f"No rows with a usable sentiment label (0/1/4) in {csv_path}")

# Convert sentiment: 0 (negative) stays 0, 4 (positive) becomes 1
df['sentiment'] = df['sentiment'].replace(4, 1)

# Sample for faster training (SAMPLE_SIZE of None/0 keeps the full dataset)
if SAMPLE_SIZE:
    df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42).reset_index(drop=True)
    print(f"\nUsing sample of {len(df):,} texts")
else:
    print(f"\nUsing full dataset: {len(df):,} texts")

print("\nSentiment distribution:")
print(df['sentiment'].value_counts())
print("\nSample texts:")
print(df.head())

## Step 2: Data Preprocessing

Crear un dataset PyTorch **agnóstico** que funcione con cualquier texto.

In [None]:
class SentimentDataset(Dataset):
    """Dataset that tokenizes one text per item for sentiment classification.

    Input-agnostic: works with any collection of raw texts (tweets, reviews,
    etc.) paired with integer class labels.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        """Store the raw data; tokenization happens lazily in ``__getitem__``.

        Args:
            texts: Sized, integer-indexable collection of texts (list, numpy
                array, or pandas Series/Index).
            labels: Collection of class labels, same length as ``texts``.
            tokenizer: Callable invoked as ``tokenizer(text, ...)`` that
                returns a mapping with ``'input_ids'`` and
                ``'attention_mask'`` tensors.
            max_length: Length every text is padded/truncated to.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        # pandas objects index by label, not by position, so ``series[idx]``
        # can raise KeyError or return the wrong row after shuffling or
        # splitting. Converting to numpy makes ``idx`` purely positional.
        if hasattr(texts, "to_numpy"):
            texts = texts.to_numpy()
        if hasattr(labels, "to_numpy"):
            labels = labels.to_numpy()

        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )

        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the example at position ``idx``.

        Returns:
            Dict with flattened ``input_ids`` and ``attention_mask`` tensors
            and a ``labels`` tensor of dtype ``torch.long``.
        """
        # str() guards against non-string entries (e.g. NaN from a CSV).
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer output is flattened so each item is a 1-D tensor.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

print("Dataset class created")

In [None]:
# Split the data into train / validation / test in two stratified cuts.
print("Splitting dataset...")

all_texts = df['text'].values
all_labels = df['sentiment'].values
holdout_fraction = VAL_SIZE + TEST_SIZE

# Cut 1: training set vs. held-out pool (validation + test together).
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=holdout_fraction,
    random_state=42,
    stratify=all_labels,
)

# Cut 2: divide the held-out pool; the test share is expressed relative to
# the pool, not to the whole dataset.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=TEST_SIZE / holdout_fraction,
    random_state=42,
    stratify=temp_labels,
)

print(f"Train: {len(train_texts)}, Val: {len(val_texts)}, Test: {len(test_texts)}")

In [None]:
# Load tokenizer from local directory (avoids template search)
import math

print("Loading tokenizer from local files...")

# Same directory the manual download step writes to; local_files_only keeps
# from_pretrained from contacting the hub.
tokenizer_dir = "/kaggle/working/distilbert_tokenizer"
tokenizer = DistilBertTokenizerFast.from_pretrained(
    tokenizer_dir,
    local_files_only=True
)

print("Tokenizer loaded successfully")

# Create datasets (texts are tokenized lazily, item by item)
print("\nCreating optimized datasets...")
train_dataset = SentimentDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
val_dataset = SentimentDataset(val_texts, val_labels, tokenizer, MAX_LENGTH)
test_dataset = SentimentDataset(test_texts, test_labels, tokenizer, MAX_LENGTH)

print(f"Datasets created: Train={len(train_dataset)}, Val={len(val_dataset)}, Test={len(test_dataset)}")

# Round up: a final partial batch still counts as a step, so floor division
# under-reports whenever the dataset size is not a multiple of BATCH_SIZE.
# NOTE(review): assumes a single device; with several GPUs the Trainer's
# effective batch is larger - confirm against training_args.
steps_per_epoch = math.ceil(len(train_dataset) / BATCH_SIZE)
print(f"Expected steps per epoch: {steps_per_epoch}")

## Step 3: Initialize Model

In [None]:
# Instantiate the pre-trained checkpoint with a 2-class classification head.
print(f"Loading model: {MODEL_NAME}")

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    problem_type="single_label_classification",
    num_labels=2,  # binary: positive / negative
)

# Place the weights on the configured device.
print(f"\nMoving model to {DEVICE}...")
model.to(DEVICE)

# When CUDA is available, show where the parameters live and how much
# memory is allocated on device 0.
if torch.cuda.is_available():
    first_param = next(model.parameters())
    allocated_mb = torch.cuda.memory_allocated(0) / 1024**2
    print(f"Model device: {first_param.device}")
    print(f"GPU Memory allocated: {allocated_mb:.2f} MB")

print(f"\nModel loaded successfully on {DEVICE}")

## Step 4: Training Configuration

In [None]:
# Training arguments for the Hugging Face Trainer (tuned for a Kaggle GPU).
use_fp16 = torch.cuda.is_available()   # half precision only when CUDA exists
log_dir = f"{OUTPUT_DIR}/logs"

training_args = TrainingArguments(
    # Where checkpoints go, and how many are kept on disk
    output_dir=OUTPUT_DIR,
    save_total_limit=2,

    # Optimisation schedule
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,      # no accumulation

    # Evaluate and checkpoint once per epoch, then reload the checkpoint
    # with the best validation accuracy.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",

    # NOTE: early stopping is not a TrainingArguments option. To enable it,
    # pass transformers.EarlyStoppingCallback(early_stopping_patience=1)
    # in the Trainer's `callbacks` list.

    # Logging: every 50 steps (plus the first step) to keep output readable
    logging_dir=log_dir,
    logging_steps=50,
    logging_first_step=True,

    # Throughput
    fp16=use_fp16,
    dataloader_num_workers=2,

    # No external experiment tracker (wandb / tensorboard)
    report_to="none",
)

# Metrics callback handed to the Trainer
def compute_metrics(pred):
    """Compute accuracy and weighted F1 from a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and
            ``predictions`` (per-class scores; the argmax over the last
            axis is taken as the predicted class).

    Returns:
        Dict with keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Report the values actually stored on training_args rather than hard-coded
# text, so the summary stays correct on a CPU-only run (fp16 off) or after
# the arguments are edited.
fp16_status = "Enabled" if training_args.fp16 else "Disabled"

print("Trainer configured with optimizations:")
print(f"  - Batch size: {BATCH_SIZE} (2x faster)")
print(f"  - FP16: {fp16_status}")
print(f"  - Logging every: {training_args.logging_steps} steps")
print(f"  - Data workers: {training_args.dataloader_num_workers}")
print("  - Early stopping: Disabled (can enable)")

In [None]:
## Step 5: Train Model

In [None]:
# Train the model with progress tracking
import math
import sys
import traceback

# Estimate the step counts from the batch size the Trainer really uses:
# train_batch_size is the per-device size multiplied by the number of GPUs,
# and the final partial batch of each epoch still counts (round up).
effective_batch_size = training_args.train_batch_size
batches_per_epoch = math.ceil(len(train_dataset) / effective_batch_size)
steps_per_epoch = math.ceil(batches_per_epoch / training_args.gradient_accumulation_steps)

print("="*60)
print("STARTING TRAINING")
print("="*60)
print(f"Total training samples: {len(train_dataset)}")
print(f"Batch size: {BATCH_SIZE}")
if effective_batch_size != BATCH_SIZE:
    print(f"Effective batch size (all devices): {effective_batch_size}")
print(f"Steps per epoch: {steps_per_epoch}")
print(f"Total steps: {steps_per_epoch * EPOCHS}")
print(f"Total epochs: {EPOCHS}")
print(f"Device: {DEVICE}")
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
print("="*60)
print("\nNOTE: In Kaggle, progress bars may not show in real-time.")
print("You will see loss/metrics printed every 50 steps.")
print("Expected time: ~10-15 minutes with GPU (batch_size=64)")
print("="*60 + "\n")

# Force output flush so the banner appears before training output
sys.stdout.flush()

# Start training
start_time = datetime.now()
print(f"Training started at: {start_time.strftime('%H:%M:%S')}\n")

# Only the training call sits inside the try: a failure while printing the
# summary below should not be reported as a training error.
try:
    train_result = trainer.train()
except Exception as e:
    print(f"\nERROR during training: {e}")
    traceback.print_exc()
    raise

end_time = datetime.now()
training_time = (end_time - start_time).total_seconds()

print(f"\n{'='*60}")
print("TRAINING COMPLETED SUCCESSFULLY!")
print(f"{'='*60}")
print(f"Training time: {training_time:.2f} seconds ({training_time/60:.2f} minutes)")
print("\nFinal training metrics:")
for key, value in train_result.metrics.items():
    # Only floats get fixed-precision formatting; other values print as-is.
    print(f"  {key}: {value:.4f}" if isinstance(value, float) else f"  {key}: {value}")

## Step 6: Evaluate Model

In [None]:
# Score the trained model on the held-out test split.
print("Evaluating on test set...")
test_results = trainer.evaluate(test_dataset)

print(f"\nTest Results:")
for metric, value in test_results.items():
    print(f"  {metric}: {value:.4f}")

# A second pass collects raw predictions for the per-class breakdown.
print("Getting predictions for detailed analysis...")

predictions = trainer.predict(test_dataset)
true_labels = predictions.label_ids
pred_labels = predictions.predictions.argmax(-1)   # highest-scoring class

# Per-class precision / recall / F1
class_names = ['Negative', 'Positive']
print("\nClassification Report:")
report_text = classification_report(true_labels, pred_labels, target_names=class_names)
print(report_text)

In [None]:
# Confusion matrix of test predictions, drawn as an annotated heatmap
cm = confusion_matrix(true_labels, pred_labels)
tick_names = ['Negative', 'Positive']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,      # write the count inside every cell
    fmt='d',         # counts are integers
    cmap='Blues',
    xticklabels=tick_names,
    yticklabels=tick_names,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

print("Evaluation complete!")

In [None]:
## Step 7: Save Model

Guardar el modelo entrenado para crear un Kaggle Dataset.

In [None]:
# Versioned output folder: <OUTPUT_DIR>/<MODEL_VERSION>
output_path = Path(OUTPUT_DIR) / MODEL_VERSION
output_path.mkdir(parents=True, exist_ok=True)

print(f"Saving model to: {output_path}")

# Model weights and tokenizer files go into their own sub-folders
model_dir = output_path / "model"
tokenizer_dir_out = output_path / "tokenizer"
model.save_pretrained(model_dir)
tokenizer.save_pretrained(tokenizer_dir_out)

# Run configuration, stored next to the model
config = {
    "model_name": MODEL_NAME,
    "model_version": MODEL_VERSION,
    "max_length": MAX_LENGTH,
    "num_labels": 2,
    "label_mapping": {"0": "negative", "1": "positive"},
    "training_date": datetime.now().isoformat(),
    "training_time_seconds": training_time,
    "device_used": DEVICE,
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": LEARNING_RATE,
    "dataset_size": len(df)
}

config_file = output_path / "config.json"
with open(config_file, "w") as f:
    json.dump(config, f, indent=2)


def _metric(source, key):
    """Return ``source[key]`` as a float, or 0.0 when the key is absent."""
    return float(source.get(key, 0))


# Test-set metrics plus the final training loss and per-class report
metrics_data = {
    "test_accuracy": _metric(test_results, "eval_accuracy"),
    "test_f1": _metric(test_results, "eval_f1"),
    "test_loss": _metric(test_results, "eval_loss"),
    "train_loss": _metric(train_result.metrics, "train_loss"),
    "classification_report": classification_report(
        true_labels,
        pred_labels,
        target_names=['Negative', 'Positive'],
        output_dict=True,
    ),
}

metrics_file = output_path / "metrics.json"
with open(metrics_file, "w") as f:
    json.dump(metrics_data, f, indent=2)

# Summary of what was written
print("Model saved successfully!")
print(f"\nModel files:")
for entry in ("model/", "tokenizer/", "config.json", "metrics.json"):
    print(f"  - {entry}")
print(f"\nTest Accuracy: {metrics_data['test_accuracy']:.4f}")
print(f"Test F1 Score: {metrics_data['test_f1']:.4f}")

## Next Steps: Create Kaggle Dataset

Para usar este modelo localmente:

1. **Guardar como Kaggle Dataset:**
   - En este notebook, ve a "Save Version"
   - El modelo se guarda en `/kaggle/working/shameless_sentiment_model/v1.0/`
   - Crea un nuevo Dataset desde este output

2. **Descargar localmente:**
   ```bash
   kaggle datasets download YOUR_USERNAME/shameless-sentiment-model
   unzip shameless-sentiment-model.zip -d data/models/v1.0/
   ```

3. **Usar en tu aplicación:**
   ```python
   from sentiment_analyser.models import SentimentAnalyzer
   analyzer = SentimentAnalyzer(use_kaggle_model=True, kaggle_model_version="v1.0")
   result = analyzer.analyze("I love this!")
   ```

**Training Complete!**