In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e12/sample_submission.csv
/kaggle/input/playground-series-s5e12/train.csv
/kaggle/input/playground-series-s5e12/test.csv


## 1. Setup and Data Loading

In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import time
from datetime import datetime

In [3]:
start_time = time.time()
print("=" * 80)
print(" COMPLETE 8-MODEL EVALUATION PIPELINE")
print(f"Started: {datetime.now().strftime('%H:%M:%S')}")
print("=" * 80)

print("\n1. LOADING DATA...")
train_df = pd.read_csv('/kaggle/input/playground-series-s5e12/train.csv')
test_df = pd.read_csv('/kaggle/input/playground-series-s5e12/test.csv')
print(" Loaded from Kaggle")

# Use 150K samples for evaluation
SAMPLE_SIZE = 150000
if len(train_df) > SAMPLE_SIZE:
    train_df = train_df.sample(SAMPLE_SIZE, random_state=42)
    print(f" Using {SAMPLE_SIZE:,} samples for evaluation")

print(f"Train: {train_df.shape}, Test: {test_df.shape}")
print(f"Time: {time.time()-start_time:.1f}s")

 COMPLETE 8-MODEL EVALUATION PIPELINE
Started: 04:35:51

1. LOADING DATA...
 Loaded from Kaggle
 Using 150,000 samples for evaluation
Train: (150000, 26), Test: (300000, 25)
Time: 3.6s


## 2. Data Analysis

In [4]:
print("\n" + "=" * 80)
print("2. DATA ANALYSIS")
print("=" * 80)

print("Target distribution:")
diabetes_rate = train_df['diagnosed_diabetes'].mean()
print(f"  Diabetes: {diabetes_rate:.2%}")
print(f"  Non-diabetes: {1-diabetes_rate:.2%}")
print(f"  Ratio: {(1-diabetes_rate)/diabetes_rate:.1f}:1")

print("\nFeature types:")
num_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = train_df.select_dtypes(include=['object']).columns.tolist()
num_cols = [c for c in num_cols if c not in ['id', 'diagnosed_diabetes']]

print(f"  Numerical: {len(num_cols)}")
print(f"  Categorical: {len(cat_cols)}")
print(f"Time: {time.time()-start_time:.1f}s")


2. DATA ANALYSIS
Target distribution:
  Diabetes: 62.40%
  Non-diabetes: 37.60%
  Ratio: 0.6:1

Feature types:
  Numerical: 18
  Categorical: 6
Time: 3.7s


## 3. Feature Engineering

In [5]:
print("\n" + "=" * 80)
print("3. FEATURE ENGINEERING")
print("=" * 80)

def create_features(df):
    df = df.copy()
    # Health risk features
    df['bmi_age'] = df['bmi'] * df['age'] / 100
    df['waist_bmi'] = df['waist_to_hip_ratio'] * df['bmi']
    
    # Blood pressure
    df['pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
    
    # Cholesterol ratios
    df['chol_hdl_ratio'] = df['cholesterol_total'] / (df['hdl_cholesterol'] + 1)
    df['trig_hdl_ratio'] = df['triglycerides'] / (df['hdl_cholesterol'] + 1)
    
    # Lifestyle score
    df['health_score'] = (
        df['diet_score'] * 0.3 +
        np.log1p(df['physical_activity_minutes_per_week']) * 0.3 +
        (8 - df['sleep_hours_per_day']).clip(0, 4) * 0.2 +
        (6 - df['screen_time_hours_per_day']).clip(0, 4) * 0.2
    )
    
    # Risk flags
    df['is_senior'] = (df['age'] >= 60).astype(int)
    df['is_obese'] = (df['bmi'] >= 30).astype(int)
    
    return df

train_df = create_features(train_df)
test_df = create_features(test_df)
print(f"Added 9 engineered features")
print(f"Total features: {train_df.shape[1] - 2}")
print(f"Time: {time.time()-start_time:.1f}s")


3. FEATURE ENGINEERING
Added 9 engineered features
Total features: 32
Time: 3.9s


## 4. Data Preparation

In [6]:
print("\n" + "=" * 80)
print("4. DATA PREPARATION")
print("=" * 80)

X = train_df.drop(['id', 'diagnosed_diabetes'], axis=1)
y = train_df['diagnosed_diabetes']
X_test = test_df.drop('id', axis=1)
test_ids = test_df['id']

# Update column lists
cat_cols = X.select_dtypes(include=['object']).columns.tolist()
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

print(f"Training features: {X.shape}")
print(f"Test features: {X_test.shape}")
print(f"Target: {y.shape}")
print(f"Numerical: {len(num_cols)}, Categorical: {len(cat_cols)}")
print(f"Time: {time.time()-start_time:.1f}s")


4. DATA PREPARATION
Training features: (150000, 32)
Test features: (300000, 32)
Target: (150000,)
Numerical: 26, Categorical: 6
Time: 4.1s


## 5. Preprocessing Pipeline

In [7]:
print("\n" + "=" * 80)
print("5. PREPROCESSING PIPELINE")
print("=" * 80)

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Simple preprocessing
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
])

print("Preprocessor configured")
print(f"Will process {len(num_cols)} numerical features")
print(f"Will process {len(cat_cols)} categorical features")
print(f"Time: {time.time()-start_time:.1f}s")


5. PREPROCESSING PIPELINE
Preprocessor configured
Will process 26 numerical features
Will process 6 categorical features
Time: 6.5s


## 6. 8-Model Evaluation

In [8]:
print("\n" + "=" * 80)
print("6. 8-MODEL EVALUATION")
print("=" * 80)


from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold


# Define 5 different models
models = {
    'LogisticRegression': LogisticRegression(
        random_state=42, max_iter=1000, class_weight='balanced',
        C=0.1, solver='liblinear', n_jobs=1
    ),
    
    'RandomForest': RandomForestClassifier(
        random_state=42, n_estimators=150, class_weight='balanced',
        max_depth=12, min_samples_split=10, n_jobs=-1
    ),
    
    'ExtraTrees': ExtraTreesClassifier(
        random_state=42, n_estimators=150, class_weight='balanced',
        max_depth=12, min_samples_split=10, n_jobs=-1
    ),
    
    'GradientBoosting': GradientBoostingClassifier(
        random_state=42, n_estimators=150,
        learning_rate=0.05, max_depth=6, subsample=0.8
    ),
    
    'XGBoost': XGBClassifier(
        random_state=42, n_estimators=150,
        learning_rate=0.05, max_depth=8,
        subsample=0.8, colsample_bytree=0.8,
        eval_metric='logloss', use_label_encoder=False,
        verbosity=0, n_jobs=-1
    ),
    
    'LightGBM': LGBMClassifier(
        random_state=42, n_estimators=150,
        learning_rate=0.05, num_leaves=31, max_depth=8,
        subsample=0.8, colsample_bytree=0.8,
        verbosity=-1, n_jobs=-1
    ),
    
    'KNeighbors': KNeighborsClassifier(
        n_neighbors=50, weights='distance',
        n_jobs=-1
    ),
    
    'SGDClassifier': SGDClassifier(
        random_state=42, max_iter=1000,
        loss='log_loss', class_weight='balanced',
        n_jobs=-1, early_stopping=True
    )
}



6. 8-MODEL EVALUATION


In [9]:
print("Evaluating 8 models with 3-fold cross-validation...")
print("-" * 60)

best_score = 0
best_model = None
best_model_name = ""
results = {}

for i, (name, model) in enumerate(models.items(), 1):
    model_start = time.time()
    
    print(f"\n[{i}/8] {name}...")
    
    try:
        # Create pipeline
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
        
        # 3-fold CV
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        scores = cross_val_score(pipeline, X, y, cv=cv, 
                               scoring='roc_auc', n_jobs=-1)
        
        elapsed = time.time() - model_start
        
        results[name] = {
            'auc': scores.mean(),
            'std': scores.std(),
            'time': elapsed,
            'scores': scores
        }
        
        print(f"  AUC: {scores.mean():.6f} (±{scores.std():.6f})")
        print(f"  Time: {elapsed:.1f}s")
        
        if scores.mean() > best_score:
            best_score = scores.mean()
            best_model = model
            best_model_name = name
            print(f"  New best!")
            
    except Exception as e:
        print(f"  Error: {str(e)[:80]}")

print("\n" + "=" * 60)
print(f"BEST MODEL: {best_model_name}")
print(f"Best AUC: {best_score:.6f}")
print("=" * 60)
print(f"Total evaluation time: {time.time()-start_time:.1f}s")

Evaluating 8 models with 3-fold cross-validation...
------------------------------------------------------------

[1/8] LogisticRegression...
  AUC: 0.694917 (±0.001055)
  Time: 6.5s
  New best!

[2/8] RandomForest...
  AUC: 0.696339 (±0.001574)
  Time: 46.1s
  New best!

[3/8] ExtraTrees...
  AUC: 0.685132 (±0.000679)
  Time: 34.8s

[4/8] GradientBoosting...
  AUC: 0.711093 (±0.000573)
  Time: 210.7s
  New best!

[5/8] XGBoost...
  AUC: 0.713700 (±0.001436)
  Time: 8.2s
  New best!

[6/8] LightGBM...




  AUC: 0.714791 (±0.000811)
  Time: 157.3s
  New best!

[7/8] KNeighbors...
  AUC: 0.667917 (±0.001690)
  Time: 63.7s

[8/8] SGDClassifier...
  AUC: 0.672362 (±0.001770)
  Time: 2.5s

BEST MODEL: LightGBM
Best AUC: 0.714791
Total evaluation time: 546.5s


## 7. Results Comparison

In [10]:
print("\n" + "=" * 80)
print("7. RESULTS ANALYSIS")
print("=" * 80)

# Create ranking
ranking = []
for name, res in results.items():
    ranking.append({
        'Model': name,
        'AUC': f"{res['auc']:.6f}",
        'Std': f"{res['std']:.6f}",
        'Time_s': f"{res['time']:.1f}",
        'Diff': f"{res['auc'] - best_score:+.6f}"
    })

ranking_df = pd.DataFrame(ranking)
ranking_df = ranking_df.sort_values('AUC', ascending=False)

print("\nMODEL RANKING:")
print(ranking_df.to_string(index=False))


7. RESULTS ANALYSIS

MODEL RANKING:
             Model      AUC      Std Time_s      Diff
          LightGBM 0.714791 0.000811  157.3 +0.000000
           XGBoost 0.713700 0.001436    8.2 -0.001091
  GradientBoosting 0.711093 0.000573  210.7 -0.003699
      RandomForest 0.696339 0.001574   46.1 -0.018452
LogisticRegression 0.694917 0.001055    6.5 -0.019874
        ExtraTrees 0.685132 0.000679   34.8 -0.029659
     SGDClassifier 0.672362 0.001770    2.5 -0.042429
        KNeighbors 0.667917 0.001690   63.7 -0.046875


In [11]:
# Insights
print(f"\nINSIGHTS:")
print(f"1. Best model: {best_model_name} (AUC: {best_score:.6f})")
print(f"2. Number of models >0.71 AUC: {sum(float(r['AUC']) > 0.71 for r in ranking)}")
print(f"3. Fastest model: {ranking_df.iloc[ranking_df['Time_s'].astype(float).argmin()]['Model']}")
print(f"4. Total evaluation time: {sum(float(r['Time_s']) for r in ranking):.1f}s")

print(f"\nTime: {time.time()-start_time:.1f}s")


INSIGHTS:
1. Best model: LightGBM (AUC: 0.714791)
2. Number of models >0.71 AUC: 3
3. Fastest model: SGDClassifier
4. Total evaluation time: 529.8s

Time: 546.6s


## 8. Load Full Dataset

In [12]:
print("\n" + "=" * 80)
print("8. LOADING FULL DATASET")
print("=" * 80)

print("Loading complete dataset for final training...")

full_train = pd.read_csv('/kaggle/input/playground-series-s5e12/train.csv')
full_test = pd.read_csv('/kaggle/input/playground-series-s5e12/test.csv')

# Apply features
full_train = create_features(full_train)
full_test = create_features(full_test)

X_full = full_train.drop(['id', 'diagnosed_diabetes'], axis=1)
y_full = full_train['diagnosed_diabetes']
X_test_full = full_test.drop('id', axis=1)

print(f"Full dataset loaded")
print(f"Training: {X_full.shape}, Test: {X_test_full.shape}")
print(f"Time: {time.time()-start_time:.1f}s")


8. LOADING FULL DATASET
Loading complete dataset for final training...
Full dataset loaded
Training: (700000, 32), Test: (300000, 32)
Time: 550.2s


## 9. Train Best Model

In [13]:
print("\n" + "=" * 80)
print(f"9. TRAINING {best_model_name} ON FULL DATA")
print("=" * 80)

# Create final pipeline
final_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', best_model)
])

print(f"Training on {len(X_full):,} samples...")
train_start = time.time()
final_pipeline.fit(X_full, y_full)
train_time = time.time() - train_start

print(f"Training completed in {train_time:.1f}s")
print(f"Total time: {time.time()-start_time:.1f}s")


9. TRAINING LightGBM ON FULL DATA
Training on 700,000 samples...
Training completed in 12.0s
Total time: 562.3s


## 10. Model Saving (Optional):

In [14]:
# print("\n" + "=" * 80)
# print("SAVING MODEL...")
# print("=" * 80)

# import joblib
# import json
# import os

# # Save the trained pipeline
# model_filename = f'/kaggle/working/diabetes_model_{best_model_name}.pkl'
# joblib.dump(final_pipeline, model_filename, compress=3)
# print(f"Model saved: {model_filename}")

# # Save feature names
# feature_names = list(X_full.columns)
# with open('/kaggle/working/feature_names.json', 'w') as f:
#     json.dump(feature_names, f)
# print(f"Feature names saved: /kaggle/working/feature_names.json")

# # Save model metadata
# metadata = {
#     'model_name': best_model_name,
#     'model_type': type(best_model).__name__,
#     'cv_score': best_score,
#     'validation_auc': None,  # Will update after validation
#     'training_samples': len(X_full),
#     'training_features': len(feature_names),
#     'training_time': train_time,
#     'features': feature_names,
#     'date_saved': time.strftime("%Y-%m-%d %H:%M:%S")
# }

# joblib.dump(metadata, '/kaggle/working/model_metadata.pkl')
# print(f"Metadata saved: /kaggle/working/model_metadata.pkl")

# print(f"\nModel size: {os.path.getsize(model_filename)/1024:.1f} KB")
# print(f"Features used: {len(feature_names)}")
# print(f"Best model: {best_model_name} (AUC: {best_score:.6f})")

# print("\n" + "=" * 80)
# print("MODEL SAVED!")
# print("=" * 80)

## 11. Validation Check

In [15]:
print("\n" + "=" * 80)
print("10. VALIDATION CHECK")
print("=" * 80)

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

# Quick validation
X_train_val, X_val, y_train_val, y_val = train_test_split(
    X_full, y_full, test_size=0.1, random_state=42, stratify=y_full
)

val_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', best_model)
])

val_pipeline.fit(X_train_val, y_train_val)
val_preds = val_pipeline.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_preds)

print(f"Validation AUC: {val_auc:.6f}")
print(f"CV AUC was: {best_score:.6f}")
print(f"Difference: {val_auc - best_score:+.6f}")

if abs(val_auc - best_score) < 0.005:
    print("Results consistent")
else:
    print("Some variation")

print(f"\nTime: {time.time()-start_time:.1f}s")


10. VALIDATION CHECK
Validation AUC: 0.720114
CV AUC was: 0.714791
Difference: +0.005323
Some variation

Time: 574.1s


## 12. Make Predictions

In [16]:
print("\n" + "=" * 80)
print("11. MAKING PREDICTIONS")
print("=" * 80)

print(f"Predicting {len(X_test_full):,} test samples...")
pred_start = time.time()
test_predictions = final_pipeline.predict_proba(X_test_full)[:, 1]
pred_time = time.time() - pred_start

print(f"Predictions completed in {pred_time:.1f}s")

print(f"\nPrediction stats:")
print(f"  Mean: {test_predictions.mean():.6f}")
print(f"  Std:  {test_predictions.std():.6f}")
print(f"  Min:  {test_predictions.min():.6f}")
print(f"  Max:  {test_predictions.max():.6f}")
print(f"  <0.1: {np.mean(test_predictions < 0.1):.1%}")
print(f"  >0.9: {np.mean(test_predictions > 0.9):.1%}")

print(f"\nTime: {time.time()-start_time:.1f}s")


11. MAKING PREDICTIONS
Predicting 300,000 test samples...
Predictions completed in 2.0s

Prediction stats:
  Mean: 0.603075
  Std:  0.180347
  Min:  0.098513
  Max:  0.969827
  <0.1: 0.0%
  >0.9: 4.8%

Time: 576.2s


## 13. Create Submission

In [17]:
print("\n" + "=" * 80)
print("12. CREATE SUBMISSION FILE")
print("=" * 80)

submission = pd.DataFrame({
    'id': full_test['id'],
    'diagnosed_diabetes': test_predictions
})

# Ensure valid probabilities
submission['diagnosed_diabetes'] = submission['diagnosed_diabetes'].clip(0.00001, 0.99999)

submission_file = '/kaggle/working/submission.csv'
submission.to_csv(submission_file, index=False)

print(f"Submission saved: {submission_file}")
print(f"Size: {submission.shape}")
print(f"Memory: {submission.memory_usage(deep=True).sum()/1024:.1f} KB")

print("\nSample predictions:")
print(submission.head().to_string(index=False))
print(f"\nTime: {time.time()-start_time:.1f}s")


12. CREATE SUBMISSION FILE
Submission saved: /kaggle/working/submission.csv
Size: (300000, 2)
Memory: 4687.6 KB

Sample predictions:
    id  diagnosed_diabetes
700000            0.521200
700001            0.617628
700002            0.760267
700003            0.450236
700004            0.878922

Time: 576.9s


## 14. Final Summary

In [18]:
print("\n" + "=" * 80)
print("13. FINAL SUMMARY")
print("=" * 80)

total_time = time.time() - start_time
minutes = total_time / 60

print(f"8-MODEL PIPELINE COMPLETED IN {minutes:.1f} MINUTES")
print("-" * 50)

print(f"\nBEST MODEL: {best_model_name}")
print(f"   CV AUC: {best_score:.6f}")
print(f"   Validation AUC: {val_auc:.6f}")

print(f"\nTOP 3 MODELS:")
sorted_models = sorted(results.items(), key=lambda x: x[1]['auc'], reverse=True)
for i, (name, res) in enumerate(sorted_models[:3], 1):
    star = " ★" if i == 1 else ""
    print(f"   {i}. {name:20s}: {res['auc']:.6f} ({res['time']:.1f}s){star}")

print(f"\nSUBMISSION:")
print(f"   File: {submission_file}")
print(f"   Predictions: {len(test_predictions):,}")
print(f"   Mean probability: {test_predictions.mean():.4f}")

print(f"\nTIMING:")
print(f"   Model evaluation: {sum(r['time'] for r in results.values()):.1f}s")
print(f"   Full training: {train_time:.1f}s")
print(f"   Predictions: {pred_time:.1f}s")
print(f"   Total: {total_time:.1f}s ({minutes:.1f} minutes)")

print(f"\nEXPECTED KAGGLE SCORE:")
expected_range = f"{best_score-0.005:.4f} - {best_score+0.005:.4f}"
print(f"   AUC likely between: {expected_range}")

print("\n" + "=" * 80)


print(f"\nTotal time: {minutes:.1f} minutes")
print(f"Finished at: {datetime.now().strftime('%H:%M:%S')}")


13. FINAL SUMMARY
8-MODEL PIPELINE COMPLETED IN 9.6 MINUTES
--------------------------------------------------

BEST MODEL: LightGBM
   CV AUC: 0.714791
   Validation AUC: 0.720114

TOP 3 MODELS:
   1. LightGBM            : 0.714791 (157.3s) ★
   2. XGBoost             : 0.713700 (8.2s)
   3. GradientBoosting    : 0.711093 (210.7s)

SUBMISSION:
   File: /kaggle/working/submission.csv
   Predictions: 300,000
   Mean probability: 0.6031

TIMING:
   Model evaluation: 529.8s
   Full training: 12.0s
   Predictions: 2.0s
   Total: 576.9s (9.6 minutes)

EXPECTED KAGGLE SCORE:
   AUC likely between: 0.7098 - 0.7198


Total time: 9.6 minutes
Finished at: 04:45:28


## 15. Check and download output files (optional)

In [19]:
import os

print("CHECKING FILES IN /kaggle/working:")
print("=" * 50)

files = os.listdir('/kaggle/working')
if not files:
    print("Folder is EMPTY!")
    print("This means the saving code didn't work properly.")
else:
    for file in files:
        path = f'/kaggle/working/{file}'
        size_kb = os.path.getsize(path) / 1024 if os.path.exists(path) else 0
        print(f"{file} ({size_kb:.1f} KB)")

CHECKING FILES IN /kaggle/working:
__notebook__.ipynb (56.5 KB)
submission.csv (7627.6 KB)


In [20]:
# import os
# import shutil
# from IPython.display import HTML, display

# print(" CREATING DOWNLOAD LINKS...")
# print("=" * 60)

# # Create a simple HTML page with download links
# files = ['diabetes_model_LightGBM.pkl', 'feature_names.json', 'model_metadata.pkl', 'submission_file.csv']

# html_content = "<h3> Download Your Files:</h3><ul>"

# for file in files:
#     path = f'/kaggle/working/{file}'
#     if os.path.exists(path):
#         # Create a data URL for direct download
#         import base64
#         with open(path, 'rb') as f:
#             data = f.read()
#             b64 = base64.b64encode(data).decode()
        
#         html_content += f"""
#         <li>
#             <a href="data:application/octet-stream;base64,{b64}" download="{file}">
#                 {file} ({len(data)/1024:.1f} KB)
#             </a>
#         </li>
#         """
#     else:
#         html_content += f"<li> {file} not found</li>"

# html_content += "</ul>"
# display(HTML(html_content))