In [1]:
import pandas as pd
import joblib
import os
import sys
from sklearn.metrics import accuracy_score, precision_score, classification_report

# 1. Make the project root importable so the `src` package can be found.
#    The root is taken to be the directory one level above the kernel's
#    working directory; the membership guard keeps re-runs from adding
#    the same entry twice.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

from src.model_selector import AutoModelSelector

# 2. Read the training CSV from <root>/datasets/titanic
train_data_path = os.path.join(parent_dir, 'datasets', 'titanic', 'train.csv')
df_train = pd.read_csv(train_data_path)

# 'Survived' is the label column; every other column is handed over as a feature
y_train = df_train['Survived']
X_train = df_train.drop(columns=['Survived'])

print(f"üìÇ Loaded training data: {len(X_train)} passengers.")

# 3. Build the selector (20 trials, task left on 'auto') and fit it on the
#    complete training frame
print("üöÄ Starting AutoML Training...")
automl = AutoModelSelector(task='auto', n_trials=20)
automl.fit(X_train, y_train)

# 4. Persist the fitted selector object under <root>/models,
#    creating that folder first if it does not exist yet
model_path = os.path.join(parent_dir, 'models', 'titanic_model.pkl')
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(automl, model_path)

print(f"Model successfully trained and saved to: {model_path}")

  from .autonotebook import tqdm as notebook_tqdm
[32m[I 2026-02-19 23:51:10,880][0m A new study created in memory with name: no-name-597c9e91-972f-4e27-b92c-221f03381668[0m


üìÇ Loaded training data: 891 passengers.
üöÄ Starting AutoML Training...
Auto-configured for CLASSIFICATION
Starting AutoML (CLASSIFICATION) | Scoring: accuracy | Trials: 20


[32m[I 2026-02-19 23:51:11,509][0m Trial 0 finished with value: 0.7384960718294051 and parameters: {'num_strategy': 'mean', 'use_scaler': False, 'use_log': False, 'use_poly': True, 'poly_degree': 2, 'use_pca': False, 'model_type': 'rf', 'rf_n_estimators': 194, 'rf_max_depth': 4}. Best is trial 0 with value: 0.7384960718294051.[0m
[32m[I 2026-02-19 23:51:11,989][0m Trial 1 finished with value: 0.7575757575757577 and parameters: {'num_strategy': 'mean', 'use_scaler': True, 'use_log': False, 'use_poly': True, 'poly_degree': 2, 'use_pca': False, 'model_type': 'xgb', 'xgb_n_estimators': 61, 'xgb_max_depth': 9, 'xgb_lr': 0.19991295387721655}. Best is trial 1 with value: 0.7575757575757577.[0m
[32m[I 2026-02-19 23:51:12,851][0m Trial 2 finished with value: 0.7800224466891134 and parameters: {'num_strategy': 'median', 'use_scaler': True, 'use_log': False, 'use_poly': True, 'poly_degree': 2, 'use_pca': False, 'model_type': 'svm', 'svm_C': 0.4810303371642078}. Best is trial 2 with value:

Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.


[32m[I 2026-02-19 23:51:18,988][0m Trial 6 finished with value: 0.6992143658810326 and parameters: {'num_strategy': 'median', 'use_scaler': True, 'use_log': True, 'use_poly': False, 'use_pca': False, 'model_type': 'xgb', 'xgb_n_estimators': 421, 'xgb_max_depth': 7, 'xgb_lr': 0.1752559991263364}. Best is trial 4 with value: 0.8249158249158249.[0m
[32m[I 2026-02-19 23:51:20,430][0m Trial 7 finished with value: 0.7429854096520763 and parameters: {'num_strategy': 'median', 'use_scaler': True, 'use_log': False, 'use_poly': False, 'use_pca': False, 'model_type': 'xgb', 'xgb_n_estimators': 286, 'xgb_max_depth': 9, 'xgb_lr': 0.015596964598482225}. Best is trial 4 with value: 0.8249158249158249.[0m


Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.


[32m[I 2026-02-19 23:51:21,282][0m Trial 8 finished with value: 0.8136924803591471 and parameters: {'num_strategy': 'median', 'use_scaler': True, 'use_log': True, 'use_poly': False, 'use_pca': True, 'pca_components': 0.9391046148479433, 'model_type': 'svm', 'svm_C': 0.7373835211270326}. Best is trial 4 with value: 0.8249158249158249.[0m


Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.


[32m[I 2026-02-19 23:51:21,988][0m Trial 9 finished with value: 0.7991021324354658 and parameters: {'num_strategy': 'median', 'use_scaler': False, 'use_log': True, 'use_poly': False, 'use_pca': True, 'pca_components': 0.8586067529376473, 'model_type': 'svm', 'svm_C': 0.11821074730960675}. Best is trial 4 with value: 0.8249158249158249.[0m


Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.


[32m[I 2026-02-19 23:51:23,305][0m Trial 10 finished with value: 0.8215488215488215 and parameters: {'num_strategy': 'mean', 'use_scaler': True, 'use_log': True, 'use_poly': True, 'poly_degree': 1, 'use_pca': True, 'pca_components': 0.8123099526395844, 'model_type': 'rf', 'rf_n_estimators': 300, 'rf_max_depth': 7}. Best is trial 4 with value: 0.8249158249158249.[0m


Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.


[32m[I 2026-02-19 23:51:24,680][0m Trial 11 finished with value: 0.819304152637486 and parameters: {'num_strategy': 'mean', 'use_scaler': True, 'use_log': True, 'use_poly': True, 'poly_degree': 1, 'use_pca': True, 'pca_components': 0.8046182188901987, 'model_type': 'rf', 'rf_n_estimators': 299, 'rf_max_depth': 7}. Best is trial 4 with value: 0.8249158249158249.[0m


Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.


[32m[I 2026-02-19 23:51:25,984][0m Trial 12 finished with value: 0.8260381593714926 and parameters: {'num_strategy': 'mean', 'use_scaler': True, 'use_log': True, 'use_poly': True, 'poly_degree': 1, 'use_pca': True, 'pca_components': 0.8191444722457162, 'model_type': 'rf', 'rf_n_estimators': 290, 'rf_max_depth': 12}. Best is trial 12 with value: 0.8260381593714926.[0m


Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.


[32m[I 2026-02-19 23:51:26,996][0m Trial 13 finished with value: 0.8204264870931537 and parameters: {'num_strategy': 'mean', 'use_scaler': True, 'use_log': True, 'use_poly': True, 'poly_degree': 1, 'use_pca': True, 'pca_components': 0.8522737597790285, 'model_type': 'rf', 'rf_n_estimators': 176, 'rf_max_depth': 16}. Best is trial 12 with value: 0.8260381593714926.[0m
[32m[I 2026-02-19 23:51:27,757][0m Trial 14 finished with value: 0.8114478114478114 and parameters: {'num_strategy': 'mean', 'use_scaler': True, 'use_log': False, 'use_poly': False, 'use_pca': True, 'pca_components': 0.8337657759969066, 'model_type': 'rf', 'rf_n_estimators': 71, 'rf_max_depth': 13}. Best is trial 12 with value: 0.8260381593714926.[0m


Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.


[32m[I 2026-02-19 23:51:28,992][0m Trial 15 finished with value: 0.8204264870931538 and parameters: {'num_strategy': 'mean', 'use_scaler': True, 'use_log': True, 'use_poly': True, 'poly_degree': 1, 'use_pca': True, 'pca_components': 0.8865404230119182, 'model_type': 'rf', 'rf_n_estimators': 237, 'rf_max_depth': 11}. Best is trial 12 with value: 0.8260381593714926.[0m
[32m[I 2026-02-19 23:51:29,892][0m Trial 16 finished with value: 0.818181818181818 and parameters: {'num_strategy': 'mean', 'use_scaler': True, 'use_log': False, 'use_poly': True, 'poly_degree': 1, 'use_pca': True, 'pca_components': 0.8374566737300703, 'model_type': 'rf', 'rf_n_estimators': 110, 'rf_max_depth': 19}. Best is trial 12 with value: 0.8260381593714926.[0m


Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.


[32m[I 2026-02-19 23:51:31,097][0m Trial 17 finished with value: 0.8237934904601572 and parameters: {'num_strategy': 'mean', 'use_scaler': True, 'use_log': True, 'use_poly': False, 'use_pca': True, 'pca_components': 0.884928778024729, 'model_type': 'rf', 'rf_n_estimators': 234, 'rf_max_depth': 11}. Best is trial 12 with value: 0.8260381593714926.[0m
[32m[I 2026-02-19 23:51:32,303][0m Trial 18 finished with value: 0.7205387205387206 and parameters: {'num_strategy': 'mean', 'use_scaler': False, 'use_log': False, 'use_poly': True, 'poly_degree': 2, 'use_pca': True, 'pca_components': 0.8273029630698348, 'model_type': 'rf', 'rf_n_estimators': 246, 'rf_max_depth': 3}. Best is trial 12 with value: 0.8260381593714926.[0m


Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.
Skipping Log Transform: Data contains negative values.


[32m[I 2026-02-19 23:51:33,228][0m Trial 19 finished with value: 0.8260381593714926 and parameters: {'num_strategy': 'mean', 'use_scaler': True, 'use_log': True, 'use_poly': False, 'use_pca': True, 'pca_components': 0.8645947519088879, 'model_type': 'rf', 'rf_n_estimators': 132, 'rf_max_depth': 8}. Best is trial 12 with value: 0.8260381593714926.[0m



BEST TRIAL FOUND:
{'num_strategy': 'mean', 'use_scaler': True, 'use_log': True, 'use_poly': True, 'poly_degree': 1, 'use_pca': True, 'pca_components': 0.8191444722457162, 'model_type': 'rf', 'rf_n_estimators': 290, 'rf_max_depth': 12}

Skipping Log Transform: Data contains negative values.
Final Pipeline Retrained and Ready.
Model successfully trained and saved to: c:\Users\feder\Desktop\Projects\auto-ml-pipeline\models\titanic_model.pkl


In [2]:


# Recompute the project root (parent of the kernel's working directory) and
# make sure it is on sys.path before the pickle is read.
# NOTE(review): presumably required so the pickled object's class under `src`
# can be resolved during loading -- confirm.
current_dir = os.getcwd()
parent_dir = os.path.split(current_dir)[0]
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

# Restore the persisted model from <root>/models
model_path = os.path.join(parent_dir, "models", 'titanic_model.pkl')
print(f"Waking up model from: {model_path}")

automl = joblib.load(model_path)
print("Model loaded and ready!")

Waking up model from: c:\Users\feder\Desktop\Projects\auto-ml-pipeline\models\titanic_model.pkl
Model loaded and ready!


In [3]:
# Read the unlabeled passengers from <root>/datasets/titanic/test.csv
test_data_path = os.path.join(parent_dir, 'datasets', 'titanic', 'test.csv')
X_test = pd.read_csv(test_data_path)

print(f"üìÇ Loaded {X_test.shape[0]} new passengers to predict.")

# Run the loaded model over every row of the test frame
predictions = automl.predict(X_test)
print("Predictions complete!")

# Show only the first five predictions rather than dumping the whole array
print("\nFirst 5 Predictions:", predictions[:5])

üìÇ Loaded 418 new passengers to predict.
Predictions complete!

First 5 Predictions: [0 0 0 0 1]


In [5]:
# NOTE(review): in the Kaggle Titanic download, gender_submission.csv is the
# *sample submission* (a baseline that predicts survival from sex), not the
# ground-truth labels for test.csv. If that is the file stored here, every
# number below measures agreement with that baseline rather than real
# accuracy -- confirm the file's provenance, or score on a labelled hold-out
# split of train.csv instead.
answer_key_path = os.path.join(parent_dir, 'datasets', 'titanic', 'gender_submission.csv')
y_true = pd.read_csv(answer_key_path)['Survived']

# Rows are compared by position: the i-th reference label is paired with the
# i-th prediction produced for test.csv.
accuracy = accuracy_score(y_true, predictions)
precision = precision_score(y_true, predictions)

print("="*30)
print("üèÜ YOUR AUTOML SCORECARD")
print("="*30)
print(f"Accuracy:  {accuracy * 100:.2f}%")
# Precision was previously computed and then dropped; report it next to accuracy.
print(f"Precision: {precision * 100:.2f}%")
print("="*30)

# Per-class precision / recall / F1 and support counts
print("\nDetailed Report:")
print(classification_report(y_true, predictions))

üèÜ YOUR AUTOML SCORECARD
Accuracy:  83.25%

Detailed Report:
              precision    recall  f1-score   support

           0       0.84      0.91      0.87       266
           1       0.82      0.70      0.75       152

    accuracy                           0.83       418
   macro avg       0.83      0.80      0.81       418
weighted avg       0.83      0.83      0.83       418

