In [None]:
"""
Titanic Kaggle Challenge - Auto-Gluon

For detailed explanations please refer to the Typst documentation file in the GitHub repository.
"""

# Summary:
# The file should have exactly 2 columns:
# PassengerId (sorted in any order)
# Survived (contains your binary predictions: 1 for survived, 0 for deceased)

############################ INSTALLATION ############################

# !pip install autogluon  # If you don't have AutoGluon installed (even on Google Colab)
import pandas as pd
from autogluon.tabular import TabularPredictor

# Labelled training set, read from 'train.csv' relative to the current working directory.
# It is passed to fit() below together with TARGET_COLUMN as the label.
train_data = pd.read_csv('train.csv')


############################ CONFIG & TRAINING ############################

# Name of the label column handed to TabularPredictor(label=...).
TARGET_COLUMN = 'Survived'

'''
This is a basic configuration for quickly testing AutoGluon: the 'medium_quality' preset.
About the metric used here: for binary classification (Survived: 1, otherwise: 0), roc_auc is the best metric.
---> See the notes in the Typst file.

NB:
The training log also says:
If 'binary' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init
(You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])
'''

# Step 1 - declare the predictor (the "model" object, comparable to e.g. a
# RandomForestRegressor instance). Nothing is trained at this point.
predictor = TabularPredictor(
    label=TARGET_COLUMN,
    eval_metric='roc_auc',
    path='./autogluon_models',  # directory in which AutoGluon saves its files
)

# Step 2 - train. The object returned by fit() is what the rest of the
# notebook uses as `predictor`, so the name is rebound to it.
predictor = predictor.fit(
    train_data=train_data,
    time_limit=600,             # training budget in seconds (600 s = 10 min)
    presets='medium_quality',   # other presets: 'best_quality', 'high_quality', 'good_quality'
)

'''
About path:
When AutoGluon trains your models, it creates a folder containing:

./autogluon_models/
├── models/
│   ├── LightGBM/             # The trained LightGBM model
│   ├── CatBoost/             # The trained CatBoost model
│   ├── NeuralNetTorch/       # The trained neural network
│   ├── WeightedEnsemble_L2/  # The ensemble model (combination)
│   └── ...                   # All other models
├── utils/
│   └── feature_generator.pkl # Data preprocessing transformations
├── predictor.pkl             # Predictor configuration
└── SummaryOfModels.html      # HTML summary report

With:
path='./autogluon_models'
     ││  │
     ││  └─ Folder name (you can change it)
     │└─── Current directory (where the code runs)
     └──── Relative path
'''


############################ RESULTS & ANALYSIS FOR MEDIUM QUALITY ############################

def _print_banner(title):
    """Print `title` framed by two 60-character '=' rules, preceded by a blank line."""
    print("\n" + "="*60)
    print(title)
    print("="*60)


# Leaderboard: a DataFrame with one row per trained model (score, training time, etc.).
# This is a built-in AutoGluon function.
# No dataset is passed on purpose: scoring the models on `train_data` (the rows
# they were fitted on) yields in-sample, over-optimistic numbers. Without a
# dataset, the leaderboard reports the validation scores recorded during fit().
leaderboard = predictor.leaderboard(silent=True)

_print_banner("MODEL LEADERBOARD")
print(leaderboard)

# Feature importance
# ---> Computes a score for each column (Age, Sex, Pclass...) showing its importance for predictions.
# The higher the score, the more important the feature.
# NOTE(review): these importances are computed on the training rows, so they are
# probably optimistic as well; AutoGluon's documentation recommends held-out data
# for this call. No held-out split exists in this notebook - TODO: create one
# before fit() and pass it here instead of `train_data`.
feature_importance = predictor.feature_importance(train_data)
_print_banner("FEATURE IMPORTANCE")
print(feature_importance.head(10))

# Information about the best model
# Example of output: BEST MODEL: WeightedEnsemble_L2
_print_banner(f"BEST MODEL: {predictor.model_best}")

# Training summary: print whatever fit_summary() returns.
print("\nTraining Summary:")
print(predictor.fit_summary())

"""
fit_summary() returns a complete summary of the training process.
It contains: number of models, total time, best score, etc.
Displays a Python dictionary with all training information. ( JSON :) )
"""


In [None]:
############################ PREDICTIONS ############################

# Unlabelled passengers to score, read from 'test.csv' in the working directory.
test_data = pd.read_csv('test.csv')

# One predicted label per passenger (0 or 1).
predictions = predictor.predict(test_data)
print("\nPredictions:", predictions.head())

# Assemble the two-column submission table: passenger id + predicted label.
submission_columns = {
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions,
}
submission = pd.DataFrame(submission_columns)

# Write the submission to 'submission.csv' in the working directory, without the
# DataFrame index so that the file contains only the two columns above.
submission.to_csv('submission.csv', index=False)


In [None]:
############################ ADVANCED AUTOGLUON ############################

'''
# For deeper training :

predictor_advanced = TabularPredictor(
    label=TARGET_COLUMN,
    eval_metric='roc_auc',  # Optimal for binary classification
    path='./autogluon_models_advanced'
).fit(
    train_data=train_data,
    time_limit=3600,  # 1h: AutoGluon will optimize many parameters for maximum quality
    presets='best_quality',

    # Optional custom hyperparameters
    hyperparameters={
        'GBM': {},        # LightGBM: Gradient Boosting Machine
        'CAT': {},        # CatBoost: good for categorical data
        'XGB': {},        # XGBoost
        'RF': {},         # Random Forest
        'NN_TORCH': {},   # PyTorch neural network
    },
    # IMPORTANT -----> This tells AutoGluon to train these 5 model types with their default hyperparameters.
    # -----> It does NOT explore all possible hyperparameter combinations (contrary to what I thought).

    # To automatically explore different combinations, enable hyperparameter tuning, e.g.:
    # predictor.fit(train_data, hyperparameter_tune_kwargs='auto',)
)

# That enables automatic hyperparameter search (classic AutoML).


## Cross-validation for more robustness
# -----> See notes about bagging, stacking, etc.
# These are additional keyword arguments for the .fit(...) call above
# (add them before its closing parenthesis):

    num_bag_folds=5,     # 5-fold bagging
    num_bag_sets=1,      # Usually left as 1
    num_stack_levels=1,  # Model stacking

    # Other useful options
    auto_stack=True,
    excluded_model_types=['KNN'],  # Exclude specific models

"""
We could also do:

hyperparameters={
    'GBM': {'num_leaves': 128, 'learning_rate': 0.05},
    'CAT': {'iterations': 1000},
}

Why exclude KNN?

- Very slow on large datasets
- Often performs worse than the other models
- High memory consumption
We could also exclude the linear model (model key 'LR').
"""
'''