# Train Hybrid RF Model

## Files to Upload
1. `training_data.csv` (exported from the app)

In [1]:
# Install dependencies
!pip install -q sentence-transformers scikit-learn pandas numpy

In [None]:
# Download feature_engineering.py automatically
import urllib.request
print("Downloading feature_engineering.py...")
urllib.request.urlretrieve(
    'https://static.philippdubach.com/apps/feature_engineering.py',
    'feature_engineering.py'
)
print("✓ Downloaded feature_engineering.py")

In [2]:
# Check GPU
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

CUDA available: True
GPU: NVIDIA A100-SXM4-40GB


In [None]:
# Upload training data
from google.colab import files
print("Upload: training_data.csv (exported from app Settings > Export)")
uploaded = files.upload()

In [4]:
import pandas as pd
import numpy as np
import pickle
from datetime import datetime
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, label_binarize
import warnings
warnings.filterwarnings('ignore')

from feature_engineering import FeaturePipeline

## 1. Load Data

In [5]:
df = pd.read_csv('training_data.csv')
print(f"Loaded {len(df)} samples")
print(f"\nVote distribution:")
print(df['vote'].value_counts(normalize=True))

Loaded 2357 samples

Vote distribution:
vote
dislike    0.491303
like       0.359779
neutral    0.148918
Name: proportion, dtype: float64


## 2. Generate MPNet Embeddings

In [6]:
# Load MPNet
print("Loading MPNet...")
mpnet = SentenceTransformer('all-mpnet-base-v2')
print(f"Embedding dim: {mpnet.get_sentence_embedding_dimension()}")

Loading MPNet...
Embedding dim: 768


In [7]:
# Generate embeddings
texts = (df['title'].fillna('') + ' ' + df['description'].fillna('')).tolist()

print(f"Generating embeddings for {len(texts)} articles...")
embeddings = mpnet.encode(
    texts,
    batch_size=32,
    show_progress_bar=True,
    convert_to_numpy=True
)
print(f"Embeddings shape: {embeddings.shape}")

Generating embeddings for 2357 articles...


Batches:   0%|          | 0/74 [00:00<?, ?it/s]

Embeddings shape: (2357, 768)


## 3. Extract Engineered Features

In [8]:
# Prepare labels
vote_map = {'dislike': 0, 'neutral': 1, 'like': 2}
y = df['vote'].map(vote_map).values

# Fit feature pipeline
print("Fitting feature pipeline...")
feature_pipeline = FeaturePipeline()
feature_pipeline.fit(df, y)

# Transform
engineered_features = feature_pipeline.transform(df)
print(f"Engineered features shape: {engineered_features.shape}")

Fitting feature pipeline...
Engineered features shape: (2357, 639)


In [9]:
# Combine embeddings + engineered features
X = np.hstack([embeddings, engineered_features])
print(f"Combined features: {X.shape}")
print(f"  - Embeddings: 768")
print(f"  - Engineered: {engineered_features.shape[1]}")

# Scale
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Combined features: (2357, 1407)
  - Embeddings: 768
  - Engineered: 639


## 4. Train with Cross-Validation

In [10]:
# Cross-validation
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

fold_results = []

for fold, (train_idx, val_idx) in enumerate(cv.split(X_scaled, y), 1):
    X_train, X_val = X_scaled[train_idx], X_scaled[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Train Random Forest
    model = RandomForestClassifier(
        n_estimators=200,
        max_depth=5,
        min_samples_split=5,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)

    # Evaluate
    y_pred = model.predict(X_val)
    y_proba = model.predict_proba(X_val)

    # ROC-AUC
    y_val_bin = label_binarize(y_val, classes=[0, 1, 2])
    roc_auc = roc_auc_score(y_val_bin, y_proba, average='macro', multi_class='ovr')
    accuracy = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average='macro')

    fold_results.append({'fold': fold, 'roc_auc': roc_auc, 'accuracy': accuracy, 'f1': f1})
    print(f"Fold {fold}: ROC-AUC={roc_auc:.4f}, Accuracy={accuracy:.4f}, F1={f1:.4f}")

# Summary
mean_auc = np.mean([r['roc_auc'] for r in fold_results])
std_auc = np.std([r['roc_auc'] for r in fold_results])
print(f"\n{'='*50}")
print(f"ROC-AUC: {mean_auc:.4f} (+/- {std_auc:.4f})")
print(f"{'='*50}")

Fold 1: ROC-AUC=0.7793, Accuracy=0.6525, F1=0.5511
Fold 2: ROC-AUC=0.7445, Accuracy=0.5784, F1=0.4836
Fold 3: ROC-AUC=0.7446, Accuracy=0.5902, F1=0.4759
Fold 4: ROC-AUC=0.7725, Accuracy=0.6200, F1=0.5080
Fold 5: ROC-AUC=0.7279, Accuracy=0.5435, F1=0.4357

ROC-AUC: 0.7537 (+/- 0.0192)


## 5. Train Final Model

In [11]:
# Train on all data
print("Training final model on all data...")
final_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
final_model.fit(X_scaled, y)
print("Done!")

Training final model on all data...
Done!


## 6. Save Model

In [12]:
# Save model package
model_data = {
    'model': final_model,
    'feature_pipeline': feature_pipeline,
    'scaler': scaler,
    'config': {
        'model_type': 'random_forest',
        'n_estimators': 200,
        'max_depth': 5,
        'random_state': 42,
        'embedding_model': 'sentence-transformers/all-mpnet-base-v2',
        'embedding_dim': 768
    },
    'results': {
        'mean_roc_auc': mean_auc,
        'std_roc_auc': std_auc,
        'n_samples': len(y),
        'n_features': X.shape[1]
    },
    'saved_at': datetime.now().isoformat()
}

with open('hybrid_rf.pkl', 'wb') as f:
    pickle.dump(model_data, f)

print("Saved to hybrid_rf.pkl")

Saved to hybrid_rf.pkl


In [13]:
# Download
files.download('hybrid_rf.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 7. Summary

In [14]:
print(f"\n{'='*50}")
print("TRAINING SUMMARY")
print(f"{'='*50}")
print(f"Samples: {len(y)}")
print(f"Features: {X.shape[1]} (768 embeddings + {X.shape[1]-768} engineered)")
print(f"ROC-AUC: {mean_auc:.4f} (+/- {std_auc:.4f})")

print(f"\nCopy hybrid_rf.pkl to: ml/models/hybrid_rf.pkl")


TRAINING SUMMARY
Samples: 2357
Features: 1407 (768 embeddings + 639 engineered)
ROC-AUC: 0.7537 (+/- 0.0192)

Copy hybrid_rf.pkl to: ml/models/hybrid_rf.pkl
