## setup

In [None]:
# Notebook-wide configuration: every later cell reads these names.
TARGET = "Transported"              # label column to predict
TRAIN_PATH = "titanic/train.csv"    # labelled training data
TEST_PATH = "titanic/test.csv"      # unlabelled data to score
SUBMIT_NAME = "submission.csv"      # file the submission is written to
# Row identifier column. The data files name it "PassengerId" (the submission
# cell reads test_df['PassengerId']); "Id" does not exist in them, which would
# leave the real ID in the feature set and raise KeyError when the CatBoost
# cell builds its submission.
ID_COL = "PassengerId"

## dependencies

In [2]:
# ----------------------------
# Install all dependencies for AutoGluon 'extreme' preset
# ----------------------------

# Core AutoGluon
%pip install autogluon.tabular --quiet

%pip install autogluon.tabular[tabpfn]==1.4.0

# Optional: specific installs if needed (sometimes helps)
%pip install catboost lightgbm xgboost fastai torch --quiet


[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.
[0mNote: you may need to restart the kernel to use updated packages.


## fitting (presets='extreme', 'best', 'high', 'good') 
### Note: only `'extreme'` works for me; it is reportedly about 4x faster than any other preset and state of the art (SOTA).
### Try the other presets as well — the failure may be specific to my environment.

In [None]:
import pandas as pd
from autogluon.tabular import TabularPredictor
import os


# Load the raw train/test tables from the paths set in the setup cell.
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# Train AutoGluon on the label named by TARGET (previously hard-coded, so a
# change in the setup cell was silently ignored here).
# - presets='extreme': preset under test in this notebook.
# - ag_args_fit={'num_gpus': 1}: passes a one-GPU setting to the model fits.
# - num_bag_folds=3: number of bagging folds.
predictor = TabularPredictor(label=TARGET).fit(
    train_data=train_df,
    presets='extreme',
    ag_args_fit={'num_gpus': 1},
    num_bag_folds=3,
)

# Predictions for the test table; the submission cell computes them again.
predictions = predictor.predict(test_df)


No path specified. Models will be saved in: "AutogluonModels/ag-20251116_182727"
Preset alias specified: 'extreme' maps to 'extreme_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.12.3
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #49~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC Wed Nov  6 17:42:15 UTC 2
CPU Count:          31
Memory Avail:       98.65 GB / 503.46 GB (19.6%)
Disk Space Avail:   2.47 GB / 16.00 GB (15.4%)
	We recommend a minimum available disk space of 10 GB, and large datasets may require more.
Presets specified: ['extreme']
`extreme` preset uses a dynamic portfolio based on dataset size...
	Detected data size: small (<=30000 samples), using `zeroshot_2025_tabfm` portfolio.
		Note: `zeroshot_2025_tabfm` portfolio requires a CUDA compatible GPU for best performance.
		Make sure you have all the relevant dependencies installed: `pip install autogluon.tabular[tabarena]`.
		It is strongly recommended to use a machi

## review and submit

In [None]:
# Show how each trained model scored on validation data.
print("\n===== Leaderboard =====")
lb = predictor.leaderboard(silent=True)
display(lb)

print("\n===== Training Summary =====")
predictor.fit_summary(show_plot=True)

print("\n===== Making Predictions for Submission =====")
test_pred = predictor.predict(test_df)

# If the prediction is a DataFrame (per-class probabilities), collapse it to a
# single column by taking the class with the highest probability per row.
if isinstance(test_pred, pd.DataFrame):
    test_pred = test_pred.idxmax(axis=1)

# Two-column submission: row identifier + predicted label. The label column
# and output file come from the setup cell so they cannot drift from it.
submission = pd.DataFrame({
    'PassengerId': test_df['PassengerId'],  # identifier column of the test file
    TARGET: test_pred,
})

submission.to_csv(SUBMIT_NAME, index=False)

print(f"\nSubmission saved to {SUBMIT_NAME}")
submission.head()


===== Leaderboard =====


Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,NeuralNetTorch,0.810345,accuracy,0.017097,53.063031,0.017097,53.063031,1,True,6
1,WeightedEnsemble_L2,0.810345,accuracy,0.017715,53.095845,0.000618,0.032814,2,True,7
2,CatBoost,0.8,accuracy,0.007343,5.694801,0.007343,5.694801,1,True,3
3,RandomForestGini,0.778161,accuracy,0.052479,0.693195,0.052479,0.693195,1,True,1
4,ExtraTreesEntr,0.777011,accuracy,0.054792,0.660572,0.054792,0.660572,1,True,5
5,RandomForestEntr,0.773563,accuracy,0.05306,0.661132,0.05306,0.661132,1,True,2
6,ExtraTreesGini,0.772414,accuracy,0.071172,62.665939,0.071172,62.665939,1,True,4



===== Training Summary =====
*** Summary of fit() ***
Estimated performance of each model:
                 model  score_val eval_metric  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       NeuralNetTorch   0.810345    accuracy       0.017097  53.063031                0.017097          53.063031            1       True          6
1  WeightedEnsemble_L2   0.810345    accuracy       0.017715  53.095845                0.000618           0.032814            2       True          7
2             CatBoost   0.800000    accuracy       0.007343   5.694801                0.007343           5.694801            1       True          3
3     RandomForestGini   0.778161    accuracy       0.052479   0.693195                0.052479           0.693195            1       True          1
4       ExtraTreesEntr   0.777011    accuracy       0.054792   0.660572                0.054792           0.660572            1       True          5
5     Ra




Submission saved to submission.csv


Unnamed: 0,PassengerId,Transported
0,0013_01,True
1,0018_01,False
2,0019_01,True
3,0021_01,True
4,0023_01,False


## then catboost:

In [None]:
import pandas as pd
from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score

# ============================
# 1. Load your dataset
# ============================

# Paths come from the setup cell
train_path = TRAIN_PATH
test_path = TEST_PATH

# Read CSVs
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

print('data loaded')

# ============================
# 2. Define target and features
# ============================

# Column names come from the setup cell
target_col = TARGET
id_col = ID_COL

# Fail fast on a misconfigured column name. Without this check a wrong ID
# column is only noticed when the submission is built, i.e. after training,
# and the real ID column is silently used as a feature in the meantime.
if target_col not in train_df.columns:
    raise KeyError(f"target column {target_col!r} not found in {train_path}")
if id_col not in test_df.columns:
    raise KeyError(f"id column {id_col!r} not found in {test_path}")

feature_cols = [c for c in train_df.columns if c not in [target_col, id_col]]

# Object / category columns are handed to CatBoost as categorical features
categorical_cols = [
    c for c in feature_cols
    if train_df[c].dtype == 'object' or train_df[c].dtype.name == 'category'
]

# CatBoost needs string-valued categoricals. Missing values are replaced with
# the literal 'nan' BEFORE the string cast; the previous cast-then-fillna order
# left the fillna with nothing to fill.
for frame in (train_df, test_df):
    for col in categorical_cols:
        values = frame[col]
        frame[col] = values.astype(object).where(values.notna(), 'nan').astype(str)

print('made features')

# ============================
# 3. Split train/validation
# ============================

# A boolean target is trained as 0/1 so that predictions come back as plain
# integers and compare cleanly with the validation labels; they are decoded
# back to True/False for the submission in step 7.
target_is_bool = pd.api.types.is_bool_dtype(train_df[target_col])
y_all = train_df[target_col].astype(int) if target_is_bool else train_df[target_col]

X_train, X_valid, y_train, y_valid = train_test_split(
    train_df[feature_cols], y_all, test_size=0.2, random_state=42
)

# ============================
# 4. Initialize CatBoost model
# ============================

# "regression" selects CatBoostRegressor; any other value selects CatBoostClassifier
model_type = "classification"

# Hyper-parameters shared by both model types
common_params = dict(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    task_type="GPU",  # use "CPU" if no GPU is available
    verbose=100,
    use_best_model=True,
)

if model_type == "regression":
    model = CatBoostRegressor(eval_metric='RMSE', **common_params)
else:
    model = CatBoostClassifier(eval_metric='Accuracy', **common_params)

# ============================
# 5. Train the model
# ============================
print('starting training')

train_pool = Pool(X_train, y_train, cat_features=categorical_cols)
valid_pool = Pool(X_valid, y_valid, cat_features=categorical_cols)

model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=50)

# ============================
# 6. Evaluate on validation
# ============================

y_pred = model.predict(valid_pool)

if model_type == "regression":
    # Square root of the MSE; avoids the `squared=False` keyword, which
    # recent scikit-learn releases deprecate.
    rmse = mean_squared_error(y_valid, y_pred) ** 0.5
    print(f"Validation RMSE: {rmse:.4f}")
else:
    acc = accuracy_score(y_valid, y_pred)
    print(f"Validation Accuracy: {acc:.4f}")

# ============================
# 7. Predict on test set
# ============================

test_pool = Pool(test_df[feature_cols], cat_features=categorical_cols)
test_pred = model.predict(test_pool)

# Undo the 0/1 encoding from step 3 so the submission holds True/False again
if target_is_bool and model_type != "regression":
    test_pred = test_pred.astype(bool)

# Prepare submission
submission = pd.DataFrame({
    id_col: test_df[id_col],
    target_col: test_pred
})

submission.to_csv(SUBMIT_NAME, index=False)
print(f"Submission saved to {SUBMIT_NAME}")


data loaded
made features
starting training
