# Sentiment Analysis Model Training for Kaggle

**Objetivo:** Entrenar un modelo de análisis de sentimientos que sea **agnóstico a la fuente de datos** y pueda analizar 1..n textos de entrada.

**Workflow:**
1. ⚙️ **Training en Kaggle** → GPU gratuita + datasets públicos
2. 💾 **Export modelo** → Guardar en Kaggle Dataset
3. 🏠 **Deploy local** → Descargar via Kaggle API y usar en producción

**Características del modelo:**
- ✅ Acepta 1 o múltiples textos
- ✅ Independiente de la fuente (Twitter, texto genérico, etc.)
- ✅ Optimizado para inferencia rápida
- ✅ Fácil de exportar e importar

## 📦 Setup and Imports

In [None]:
# Standard libraries
import os
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime

# ML & NLP
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: matplotlib style sheet first, then the seaborn palette
# (same order as before, since both change global plotting state).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Environment report, one line per fact
print(
    "✅ All imports successful!",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

## ⚙️ Configuration

In [None]:
# --- Model / optimisation settings ---
MODEL_NAME = "distilbert-base-uncased"  # base checkpoint to fine-tune
MODEL_VERSION = "v1.0"
MAX_LENGTH = 128  # max_length handed to the tokenizer (tweets/short texts)
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
EPOCHS = 3

# Prefer the GPU whenever PyTorch can see one
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# --- Data settings ---
DATASET_NAME = "sentiment140"  # Change to your dataset in Kaggle
TRAIN_SIZE = 0.8
VAL_SIZE = 0.1
TEST_SIZE = 0.1

# --- Output settings ---
OUTPUT_DIR = "./shameless_sentiment_model"
SAVE_TO_KAGGLE = True  # Set to True to save as Kaggle Dataset

# Summary of the run configuration
print(
    f"🤖 Model: {MODEL_NAME}",
    f"🎯 Device: {DEVICE}",
    f"📊 Batch Size: {BATCH_SIZE}",
    f"🔄 Epochs: {EPOCHS}",
    sep="\n",
)

## 📊 Step 1: Load Dataset

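**Nota:** En Kaggle, usa datasets públicos como Sentiment140, Twitter Sentiment Analysis, etc.
Este código es **agnóstico**: funciona con cualquier dataset que tenga una columna 'text' y una columna 'sentiment' con etiquetas binarias (0 = negativo, 1 = positivo). Si tu dataset usa otros nombres de columna u otros códigos de etiqueta (por ejemplo, Sentiment140 no trae cabecera y marca los positivos con 4), renómbralos y conviértelos antes de continuar.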

In [None]:
# Example: Loading Sentiment140 dataset (modify path for Kaggle)
# In Kaggle, use: /kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv

def load_sentiment140_sample():
    """
    Build the small in-memory demo dataset used by this notebook.

    Returns:
        A pandas DataFrame with a 'text' column and a 'sentiment' column
        (1 = positive, 0 = negative), one row per example.

    In Kaggle, replace the body with a real read, e.g.
    pd.read_csv('/kaggle/input/sentiment140/...', encoding='latin-1').
    """
    # (text, sentiment) pairs kept side by side so each label sits next to its text
    labelled_examples = [
        ("I love this product! It's amazing!", 1),
        ("This is terrible, worst experience ever", 0),
        ("Not sure how I feel about this", 1),
        ("Absolutely fantastic, highly recommend", 1),
        ("Disappointed with the quality", 0),
        ("Pretty decent, nothing special", 1),
        ("Outstanding service and support!", 1),
        ("Waste of money, do not buy", 0),
        ("It's okay, works as expected", 1),
        ("Best purchase I've ever made!", 1),
    ]

    return pd.DataFrame(labelled_examples, columns=['text', 'sentiment'])

# Load data
print("📥 Loading dataset...")
df = load_sentiment140_sample()

# Quick inspection: size, first rows, class balance
print(f"✅ Dataset loaded: {df.shape[0]} samples")
print("\nDataset structure:", df.head(), sep="\n")
print("\nSentiment distribution:", df['sentiment'].value_counts(), sep="\n")

In [None]:
## 🔄 Step 2: Data Preprocessing

Crear un dataset PyTorch **agnóstico** que funcione con cualquier texto.

class SentimentDataset(Dataset):
    """
    Source-agnostic PyTorch dataset for sentiment classification.

    Each item is one text (tweet, review, ...) tokenized on access, together
    with its integer class label.

    Args:
        texts: Sequence of input texts (list, numpy array, pandas Series, ...).
        labels: Sequence of integer class labels, same length as `texts`.
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=True,
            max_length=..., padding='max_length', truncation=True,
            return_tensors='pt')``; must return 'input_ids' and 'attention_mask'.
        max_length: Length passed to the tokenizer for padding/truncation.

    Raises:
        ValueError: If `texts` and `labels` have different lengths.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialize as lists so __getitem__ always indexes by position.
        # A pandas Series indexes by label, which raises KeyError once the
        # rows have been shuffled or filtered.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(self.texts)} and {len(self.labels)})"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at position `idx` as a dict of tensors."""
        # str() keeps non-string entries (e.g. NaN, numbers) from breaking the tokenizer call
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Tokenize
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # return_tensors='pt' was requested for a single text; flatten() drops
        # the extra dimension so each sample is one flat tensor.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

print("✅ Dataset class created")

In [None]:
# Split dataset into train / validation / test
print("📊 Splitting dataset...")


def _stratify_labels(labels, holdout_fraction):
    """
    Return `labels` when a stratified split is feasible, otherwise None.

    train_test_split(stratify=...) raises ValueError unless every class has at
    least 2 members and both sides of the split can hold at least one sample
    per class. On tiny data (such as the 10-row demo sample, whose hold-out
    part has only 2 rows) this falls back to a plain random split instead of
    crashing. When stratification is feasible the split is unchanged.

    Args:
        labels: Array-like of class labels for the data about to be split.
        holdout_fraction: The float `test_size` that will be passed to
            train_test_split.
    """
    labels = np.asarray(labels)
    if labels.size == 0:
        return None
    _, class_counts = np.unique(labels, return_counts=True)
    # Same rounding train_test_split applies to a float test_size
    n_holdout = int(np.ceil(holdout_fraction * labels.size))
    n_kept = labels.size - n_holdout
    feasible = (
        class_counts.min() >= 2
        and min(n_holdout, n_kept) >= class_counts.size
    )
    return labels if feasible else None


# First cut: train vs. (validation + test)
holdout_fraction = VAL_SIZE + TEST_SIZE
all_texts = df['text'].values
all_labels = df['sentiment'].values

train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=holdout_fraction,
    random_state=42,
    stratify=_stratify_labels(all_labels, holdout_fraction)
)

# Second cut: divide the hold-out part into validation and test
test_fraction = TEST_SIZE / holdout_fraction

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=test_fraction,
    random_state=42,
    stratify=_stratify_labels(temp_labels, test_fraction)
)

print(f"✅ Train: {len(train_texts)}, Val: {len(val_texts)}, Test: {len(test_texts)}")

In [None]:
# Tokenizer that matches the base checkpoint
print(f"🔤 Loading tokenizer: {MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# One SentimentDataset per split, all sharing the tokenizer and MAX_LENGTH
train_dataset, val_dataset, test_dataset = (
    SentimentDataset(split_texts, split_labels, tokenizer, MAX_LENGTH)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
        (test_texts, test_labels),
    )
)

print("✅ Datasets created successfully")

## 🤖 Step 3: Initialize Model

In [None]:
# Load pre-trained model
print(f"🤖 Loading model: {MODEL_NAME}")

# Load the MODEL_NAME checkpoint configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,  # Binary classification: positive/negative
    problem_type="single_label_classification"
)

# The return value is discarded, so this relies on .to() moving the module in place.
model.to(DEVICE)
print(f"✅ Model loaded on {DEVICE}")

## 🎯 Step 4: Training Configuration

In [None]:
# Define training arguments
import inspect

# transformers renamed `evaluation_strategy` to `eval_strategy`; releases that
# dropped the old name raise TypeError on it, and older releases do not know the
# new one. Use whichever keyword the installed TrainingArguments accepts.
_EVAL_STRATEGY_KWARG = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    # Evaluate and checkpoint on the same schedule; load_best_model_at_end
    # needs a checkpoint for every evaluation it compares.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_dir=f"{OUTPUT_DIR}/logs",
    logging_steps=10,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # Mixed precision if GPU available
    **{_EVAL_STRATEGY_KWARG: "epoch"},
)

# Define metrics
def compute_metrics(pred):
    """
    Score one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the predicted class is the argmax over the last axis).

    Returns:
        Dict with 'accuracy' and weighted-average 'f1'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

# Initialize Trainer
# Wires together the model, the training arguments, the train/validation
# datasets and compute_metrics (which supplies the 'accuracy' value named in
# metric_for_best_model).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

print("✅ Trainer configured")

In [None]:
## 🏋️ Step 5: Train Model

In [None]:
# Train the model
print("🏋️ Starting training...")
# Timestamps are taken immediately around trainer.train(), so the measured
# duration covers the whole call.
start_time = datetime.now()

train_result = trainer.train()

end_time = datetime.now()
# Elapsed wall-clock seconds as a float; also stored later in the saved config.
training_time = (end_time - start_time).total_seconds()

print(f"\n✅ Training completed!")
print(f"⏱️ Training time: {training_time:.2f} seconds ({training_time/60:.2f} minutes)")
print(f"📊 Final metrics: {train_result.metrics}")

## 📈 Step 6: Evaluate Model

In [None]:
# Evaluate on test set
print("📊 Evaluating on test set...")
test_results = trainer.evaluate(test_dataset)

print(f"\n🎯 Test Results:")
for metric, value in test_results.items():
    print(f"  {metric}: {value:.4f}")

# Get predictions for detailed analysis
print("🔍 Getting predictions for detailed analysis...")

predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(-1)
true_labels = predictions.label_ids

# Classification report
# labels=[0, 1] pins both classes: without it, a test set that happens to
# contain a single class makes classification_report raise ValueError because
# the two target_names no longer match the classes found in the data.
# zero_division=0 reports 0 for a class with no samples instead of warning.
print("\n📋 Classification Report:")
print(classification_report(
    true_labels,
    pred_labels,
    labels=[0, 1],
    target_names=['Negative', 'Positive'],
    zero_division=0
))

In [None]:
# Confusion Matrix
# labels=[0, 1] forces a 2x2 matrix even when one class is absent from the test
# set; otherwise the matrix shrinks to 1x1 and no longer lines up with the two
# tick labels below.
cm = confusion_matrix(true_labels, pred_labels, labels=[0, 1])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Negative', 'Positive'],
            yticklabels=['Negative', 'Positive'])
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

print("✅ Evaluation complete!")

In [None]:
## 🧪 Step 7: Test with Sample Texts (Data-Agnostic)

Probar el modelo con 1..n textos de cualquier fuente.

In [None]:
# Wrap the fine-tuned model and tokenizer in an inference pipeline
print("🔮 Creating inference pipeline...")
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=-1 if DEVICE != "cuda" else 0
)

# Case 1: a single text (the pipeline returns a list, so take its only element)
single_text = "I absolutely love this! Best thing ever!"
result = sentiment_pipeline(single_text)[0]
print(
    "\n✅ Single text prediction:",
    f"Text: {single_text}",
    f"Result: {result}",
    sep="\n",
)

# Case 2: a list of texts (1..n texts) in one call
multiple_texts = [
    "This is amazing, highly recommend!",
    "Terrible experience, very disappointed",
    "Not bad, could be better",
    "Absolutely fantastic product!",
    "Waste of time and money"
]

print(f"\n✅ Multiple texts prediction ({len(multiple_texts)} texts):")
results = sentiment_pipeline(multiple_texts)
for text, result in zip(multiple_texts, results):
    # Three lines per text: truncated text, prediction, separator rule
    print(
        f"Text: {text[:50]}...",
        f"Result: {result['label']} (confidence: {result['score']:.4f})",
        "-" * 80,
        sep="\n",
    )

## 💾 Step 8: Save Model for Production

Guardar el modelo entrenado para ser usado localmente.

In [None]:
# Create model directory
output_path = Path(OUTPUT_DIR) / MODEL_VERSION
output_path.mkdir(parents=True, exist_ok=True)

print(f"💾 Saving model to: {output_path}")

# Save model and tokenizer
model.save_pretrained(output_path / "model")
tokenizer.save_pretrained(output_path / "tokenizer")

# Build the label mapping keyed by the strings a consumer will actually see.
# json.dump turns int keys into "0"/"1", while an inference pipeline reports
# the model's own label names (model.config.id2label, e.g. "LABEL_1"), so a
# lookup by pipeline label never matched the old int-keyed mapping.
# Both key forms are written: the class id as a string (what was serialized
# before) and the model's label name.
class_names = ("negative", "positive")
label_mapping = {}
for class_id, class_name in enumerate(class_names):
    label_mapping[str(class_id)] = class_name
    label_mapping[str(model.config.id2label[class_id])] = class_name

# Save configuration and metrics
config = {
    "model_name": MODEL_NAME,
    "model_version": MODEL_VERSION,
    "max_length": MAX_LENGTH,
    "num_labels": 2,
    "label_mapping": label_mapping,
    "training_date": datetime.now().isoformat(),
    "training_time_seconds": training_time,
    "device_used": DEVICE,
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "learning_rate": LEARNING_RATE
}

with open(output_path / "config.json", "w") as f:
    json.dump(config, f, indent=2)

# Save metrics
# test_results comes from trainer.evaluate(), hence the "eval_" key prefix;
# missing keys fall back to 0.
metrics = {
    "test_accuracy": float(test_results.get("eval_accuracy", 0)),
    "test_f1": float(test_results.get("eval_f1", 0)),
    "test_loss": float(test_results.get("eval_loss", 0)),
    "train_loss": float(train_result.metrics.get("train_loss", 0)),
    # labels=[0, 1] keeps the report aligned with the two target_names even if
    # the test set contains a single class (otherwise ValueError is raised);
    # zero_division=0 records 0 for a class with no samples instead of warning.
    "classification_report": classification_report(
        true_labels, pred_labels,
        labels=[0, 1],
        target_names=['Negative', 'Positive'],
        zero_division=0,
        output_dict=True
    )
}

with open(output_path / "metrics.json", "w") as f:
    json.dump(metrics, f, indent=2)

print(f"✅ Model saved successfully!")
print(f"\nSaved files:")
print(f"  - {output_path / 'model'}")
print(f"  - {output_path / 'tokenizer'}")
print(f"  - {output_path / 'config.json'}")
print(f"  - {output_path / 'metrics.json'}")

## 📦 Step 9: Package for Kaggle Dataset (Optional)

Si estás en Kaggle, crea un dataset con el modelo entrenado.

In [None]:
# Create README for the model
# The README is an f-string: model name, version, date, test metrics and
# training settings are interpolated when this cell runs. Doubled braces
# ({{ }}) in the embedded Python sample render as literal braces.
readme_content = f"""# Shameless Sentiment Model {MODEL_VERSION}

## Model Information
- **Base Model**: {MODEL_NAME}
- **Version**: {MODEL_VERSION}
- **Training Date**: {datetime.now().strftime('%Y-%m-%d')}
- **Task**: Binary Sentiment Classification (Positive/Negative)

## Performance
- **Test Accuracy**: {metrics['test_accuracy']:.4f}
- **Test F1 Score**: {metrics['test_f1']:.4f}
- **Test Loss**: {metrics['test_loss']:.4f}

## Usage

### In Kaggle
1. Save this notebook's output as a Kaggle Dataset
2. Make it public or private
3. Note the dataset name (e.g., `username/shameless-sentiment-models`)

### Download to Local
```bash
# Install Kaggle API
pip install kaggle

# Download dataset (replace with your dataset name)
kaggle datasets download username/shameless-sentiment-models
unzip shameless-sentiment-models.zip -d data/models/
```

### Load Model Locally
```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Load model
model = AutoModelForSequenceClassification.from_pretrained("data/models/{MODEL_VERSION}/model")
tokenizer = AutoTokenizer.from_pretrained("data/models/{MODEL_VERSION}/tokenizer")

# Create pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Predict single text
result = sentiment_analyzer("I love this product!")
print(result)  # [{{'label': 'LABEL_1', 'score': 0.99}}]

# Predict multiple texts (data-agnostic: works with 1..n texts)
texts = ["Great product!", "Terrible experience", "Not bad"]
results = sentiment_analyzer(texts)
print(results)
```

## Label Mapping
- LABEL_0 = Negative
- LABEL_1 = Positive

## Training Configuration
- Epochs: {EPOCHS}
- Batch Size: {BATCH_SIZE}
- Learning Rate: {LEARNING_RATE}
- Max Length: {MAX_LENGTH} tokens
- Device: {DEVICE}

## Data-Agnostic Design
This model is designed to be **source-agnostic**:
- ✅ Works with tweets
- ✅ Works with reviews
- ✅ Works with any short text
- ✅ Accepts 1 or multiple texts
- ✅ No preprocessing required (model handles it internally)
"""

# The README text contains non-ASCII characters (e.g. "✅"), so the encoding is
# pinned to UTF-8 rather than left to the platform default, which cannot
# encode them on every system.
with open(output_path / "README.md", "w", encoding="utf-8") as f:
    f.write(readme_content)

print("✅ README.md created")
print("\n" + "="*80)
print("📦 MODEL PACKAGE READY FOR KAGGLE DATASET")
print("="*80)
print(f"\nTo create Kaggle Dataset:")
print(f"1. In Kaggle, go to 'Data' → 'New Dataset'")
print(f"2. Upload the folder: {output_path}")
print(f"3. Name it: 'shameless-sentiment-models'")
print(f"4. Set visibility (public/private)")
print(f"5. Click 'Create'")
print("\n" + "="*80)

## 🏠 Step 10: How to Use Model Locally

Instrucciones para descargar y usar el modelo en tu aplicación local.

In [None]:
# Example: How to integrate with local application
# The text below is sample code that is only printed, never executed here.
# Two defects in the sample's SentimentAnalyzer are fixed:
#   * with use_kaggle_model=False, label_mapping was never set, so analyze()
#     raised AttributeError;
#   * labels were looked up only by the full pipeline label, so a mapping keyed
#     by class id ("0"/"1") never matched; _to_sentiment now tries both forms.
integration_example = """
# ============================================================================
# INTEGRATION WITH LOCAL APPLICATION (Shameless)
# ============================================================================

# 1. Download model from Kaggle Dataset
# --------------------------------------
# Command line:
#   kaggle datasets download username/shameless-sentiment-models -p data/models/
#   unzip data/models/shameless-sentiment-models.zip -d data/models/

# 2. Create Model Loader (models/model_loader.py)
# ------------------------------------------------
from pathlib import Path
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import json

class KaggleModelLoader:
    def __init__(self, models_dir="data/models"):
        self.models_dir = Path(models_dir)
    
    def load_model(self, version="v1.0"):
        model_path = self.models_dir / version / "model"
        tokenizer_path = self.models_dir / version / "tokenizer"
        config_path = self.models_dir / version / "config.json"
        
        # Load configuration
        with open(config_path, 'r') as f:
            config = json.load(f)
        
        # Load model and tokenizer
        model = AutoModelForSequenceClassification.from_pretrained(model_path)
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        
        # Create pipeline
        sentiment_pipeline = pipeline(
            "sentiment-analysis",
            model=model,
            tokenizer=tokenizer,
            device=-1  # CPU, use 0 for GPU
        )
        
        return sentiment_pipeline, config

# 3. Use in SentimentAnalyzer (models/inference/sentiment_analyzer.py)
# ---------------------------------------------------------------------
class SentimentAnalyzer:
    def __init__(self, model_version="v1.0", use_kaggle_model=True):
        if use_kaggle_model:
            loader = KaggleModelLoader()
            self.pipeline, self.config = loader.load_model(model_version)
            self.label_mapping = self.config.get("label_mapping", {})
        else:
            # Use HuggingFace model
            self.pipeline = pipeline("sentiment-analysis")
            # No config.json in this mode: keep both attributes defined
            self.config = {}
            self.label_mapping = {}
    
    def _to_sentiment(self, label):
        # Pipeline labels look like 'LABEL_1'; the mapping may be keyed by the
        # full label ('LABEL_1') or by the class id ('1'). Unknown labels are
        # returned unchanged.
        key = str(label)
        class_id = key.rsplit("_", 1)[-1]
        return self.label_mapping.get(key, self.label_mapping.get(class_id, label))
    
    def analyze(self, text):
        result = self.pipeline(text)[0]
        return {
            'sentiment': self._to_sentiment(result['label']),
            'score': result['score']
        }
    
    def analyze_batch(self, texts, batch_size=32):
        # Data-agnostic: accepts 1..n texts
        results = self.pipeline(texts, batch_size=batch_size)
        return [
            {
                'sentiment': self._to_sentiment(r['label']),
                'score': r['score']
            }
            for r in results
        ]

# 4. Use in application
# ---------------------
from sentiment_analyser.models import SentimentAnalyzer

# Initialize with Kaggle model
analyzer = SentimentAnalyzer(use_kaggle_model=True, model_version="v1.0")

# Analyze single text
result = analyzer.analyze("I love this product!")
print(result)  # {'sentiment': 'positive', 'score': 0.99}

# Analyze multiple texts (data-agnostic)
tweets = ["Great service!", "Terrible experience", "Not bad at all"]
results = analyzer.analyze_batch(tweets)
for tweet, result in zip(tweets, results):
    print(f"{tweet}: {result['sentiment']} ({result['score']:.2f})")

# Works with ANY text source - tweets, reviews, comments, etc.
"""

print(integration_example)

# Closing banner: a single print call emitting the same lines in the same order
print(
    "\n✅ Integration example ready!",
    "\n" + "=" * 80,
    "🎉 NOTEBOOK COMPLETE!",
    "=" * 80,
    "\nNext Steps:",
    "1. ✅ Train model in Kaggle (this notebook)",
    "2. 📦 Create Kaggle Dataset with trained model",
    "3. 🏠 Download to local application",
    "4. 🚀 Use in production with ANY text source",
    "\n" + "=" * 80,
    sep="\n",
)