In [None]:
# ! pip install pycaret[full]
# ! pip install mlflow==2.11.3

In [19]:
# vagy (preferált) pip install -r requirements.txt
from pycaret.classification import *
import mlflow
print(mlflow.__version__)
from sklearn.model_selection import train_test_split

2.11.3


In [20]:
# Adathalmaz betöltése (feltételezve, hogy CSV formátumban van)
import pandas as pd
df = pd.read_csv('train.csv')

In [33]:
# Need to handle missing values before splitting
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [22]:
import pandas as pd

# Create a copy to avoid modifying the original
data_filled = df.copy()

# Replace NaN values
for column in data_filled.columns:
    if data_filled[column].dtype == "object":
        # Replace with mode (most frequent value)
        data_filled[column].fillna(data_filled[column].mode()[0], inplace=True)
    else:
        # Replace with mean
        data_filled[column].fillna(data_filled[column].mean(), inplace=True)

# If needed, convert to string/object type
data_filled['Embarked'] = data_filled['Embarked'].astype(str)
data_filled['Embarked'] = data_filled['Embarked'].replace({'S': 0, 'C': 1, 'Q': 2})
data_filled['Sex'] = data_filled['Sex'].replace({'male': 0, 'female': 1})

data = data_filled[['Survived', 'Pclass', 'Sex','Age','Parch','Fare','SibSp','Embarked']]  # Use the filled data for further processing

data.head()


Unnamed: 0,Survived,Pclass,Sex,Age,Parch,Fare,SibSp,Embarked
0,0,3,0,22.0,0,7.25,1,0
1,1,1,1,38.0,0,71.2833,1,1
2,1,3,1,26.0,0,7.925,0,0
3,1,1,1,35.0,0,53.1,1,0
4,0,3,0,35.0,0,8.05,0,0


In [23]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['Survived'])

In [24]:
# log_experiment=True - mlflow tracking
exp_clf = setup(data=train_data, target='Survived', log_experiment=True, experiment_name='titanic_exp_2', log_plots=True)

Unnamed: 0,Description,Value
0,Session id,5705
1,Target,Survived
2,Target type,Binary
3,Original data shape,"(712, 8)"
4,Transformed data shape,"(712, 8)"
5,Transformed train set shape,"(498, 8)"
6,Transformed test set shape,"(214, 8)"
7,Numeric features,7
8,Preprocess,True
9,Imputation type,simple


In [25]:
# Modell tanítása és összehasonlítása
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8294,0.8703,0.7111,0.8273,0.7571,0.6277,0.6383,0.545
gbc,Gradient Boosting Classifier,0.8133,0.8636,0.7005,0.7968,0.737,0.5944,0.6045,0.024
ada,Ada Boost Classifier,0.8052,0.8499,0.7479,0.7557,0.7447,0.5881,0.595,0.021
qda,Quadratic Discriminant Analysis,0.793,0.8524,0.7113,0.7548,0.7245,0.5598,0.5677,0.008
ridge,Ridge Classifier,0.7912,0.8657,0.7221,0.7412,0.7253,0.5577,0.5636,0.008
lda,Linear Discriminant Analysis,0.7912,0.8659,0.7221,0.7412,0.7253,0.5577,0.5636,0.006
lr,Logistic Regression,0.7911,0.8662,0.7221,0.7402,0.7242,0.5572,0.5639,0.828
xgboost,Extreme Gradient Boosting,0.7911,0.8349,0.7005,0.7578,0.72,0.5543,0.5628,0.066
rf,Random Forest Classifier,0.791,0.8433,0.6958,0.757,0.7182,0.5531,0.5604,0.037
lightgbm,Light Gradient Boosting Machine,0.7851,0.8409,0.6847,0.7447,0.7074,0.5385,0.545,0.093


In [26]:
# Modell tanítása és mentése
final_model = finalize_model(best_model)
save_model(final_model, 'Titanic_modelV1')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['Pclass', 'Sex', 'Age', 'Parch',
                                              'Fare', 'SibSp', 'Embarked'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_values=nan,
                                                               strategy='mean'))),
                 ('categorical_imputer',
                  TransformerWrapper(exclude=None, include=[],
                                     transformer=SimpleImputer(add_indicator=False,
                                 

In [27]:
tuned_model = tune_model(best_model, optimize='AUC', search_library='scikit-learn', search_algorithm='random')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.82,0.8913,0.5789,0.9167,0.7097,0.5887,0.6213
1,0.82,0.8896,0.7368,0.7778,0.7568,0.6141,0.6146
2,0.76,0.826,0.4737,0.8182,0.6,0.4455,0.4794
3,0.76,0.8285,0.6842,0.6842,0.6842,0.4907,0.4907
4,0.84,0.8761,0.6316,0.9231,0.75,0.6383,0.6632
5,0.82,0.8234,0.6316,0.8571,0.7273,0.5975,0.613
6,0.88,0.9032,0.6842,1.0,0.8125,0.7288,0.7571
7,0.9,0.9467,0.9,0.8571,0.878,0.7934,0.7941
8,0.7551,0.8009,0.6842,0.6842,0.6842,0.4842,0.4842
9,0.898,0.9614,0.8421,0.8889,0.8649,0.783,0.7837


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [28]:
tuned_model_optuna = tune_model(best_model, optimize='AUC', search_library='optuna')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.84,0.8973,0.5789,1.0,0.7333,0.6303,0.6784
1,0.84,0.8744,0.7368,0.8235,0.7778,0.6534,0.6558
2,0.8,0.8455,0.4737,1.0,0.6429,0.5274,0.5985
3,0.8,0.8413,0.6842,0.7647,0.7222,0.5667,0.5689
4,0.84,0.8829,0.6316,0.9231,0.75,0.6383,0.6632
5,0.76,0.8124,0.4737,0.8182,0.6,0.4455,0.4794
6,0.86,0.8761,0.6316,1.0,0.7742,0.6801,0.7178
7,0.9,0.9333,0.85,0.8947,0.8718,0.7899,0.7906
8,0.8163,0.8237,0.6842,0.8125,0.7429,0.6016,0.607
9,0.8776,0.9614,0.7895,0.8824,0.8333,0.737,0.7399


In [29]:
bagging_model = ensemble_model(best_model, method="Bagging", n_estimators=5)

predictions = predict_model(bagging_model)

print(predictions)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.82,0.8676,0.6316,0.8571,0.7273,0.5975,0.613
1,0.88,0.8795,0.7895,0.8824,0.8333,0.74,0.7428
2,0.76,0.8336,0.4737,0.8182,0.6,0.4455,0.4794
3,0.78,0.8523,0.6842,0.7222,0.7027,0.5283,0.5288
4,0.84,0.8217,0.5789,1.0,0.7333,0.6303,0.6784
5,0.8,0.82,0.5789,0.8462,0.6875,0.5479,0.5693
6,0.84,0.9049,0.6316,0.9231,0.75,0.6383,0.6632
7,0.9,0.9283,0.85,0.8947,0.8718,0.7899,0.7906
8,0.7755,0.8026,0.7368,0.7,0.7179,0.5317,0.5322
9,0.9388,0.9579,0.8947,0.9444,0.9189,0.8698,0.8706


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,CatBoost Classifier,0.8037,0.8532,0.5976,0.8448,0.7,0.5604,0.579


     Pclass  Sex        Age  Parch     Fare  SibSp  Embarked  Survived  \
636       3    0  32.000000      0   7.9250      0         0         0   
197       3    0  42.000000      1   8.4042      0         0         0   
509       3    0  26.000000      0  56.4958      0         0         1   
202       3    0  34.000000      0   6.4958      0         0         0   
674       2    0  29.699118      0   0.0000      0         0         0   
..      ...  ...        ...    ...      ...    ...       ...       ...   
408       3    0  21.000000      0   7.7750      0         0         0   
500       3    0  17.000000      0   8.6625      0         0         0   
5         3    0  29.699118      0   8.4583      0         2         0   
67        3    0  19.000000      0   8.1583      0         0         0   
613       3    0  29.699118      0   7.7500      0         2         0   

     prediction_label  prediction_score  
636                 0            0.8640  
197                 0      

In [30]:
lda = create_model('lda') 
ridge = create_model('ridge') 
gbc = create_model('gbc')

blended_model = blend_models([best_model, lda, ridge, gbc])
compare_models()

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8,0.8998,0.5789,0.8462,0.6875,0.5479,0.5693
1,0.74,0.8455,0.7368,0.6364,0.6829,0.4646,0.4682
2,0.76,0.8693,0.6316,0.7059,0.6667,0.4801,0.4819
3,0.78,0.8234,0.8421,0.6667,0.7442,0.5557,0.5674
4,0.84,0.8981,0.7368,0.8235,0.7778,0.6534,0.6558
5,0.74,0.7547,0.5789,0.6875,0.6286,0.4308,0.4346
6,0.82,0.8947,0.6842,0.8125,0.7429,0.606,0.6113
7,0.84,0.92,0.8,0.8,0.8,0.6667,0.6667
8,0.6939,0.7763,0.7368,0.5833,0.6512,0.3849,0.3933
9,0.898,0.9772,0.8947,0.85,0.8718,0.7871,0.7879


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8,0.8998,0.5789,0.8462,0.6875,0.5479,0.5693
1,0.74,0.8438,0.7368,0.6364,0.6829,0.4646,0.4682
2,0.76,0.8693,0.6316,0.7059,0.6667,0.4801,0.4819
3,0.78,0.8234,0.8421,0.6667,0.7442,0.5557,0.5674
4,0.84,0.8981,0.7368,0.8235,0.7778,0.6534,0.6558
5,0.74,0.7547,0.5789,0.6875,0.6286,0.4308,0.4346
6,0.82,0.8947,0.6842,0.8125,0.7429,0.606,0.6113
7,0.84,0.9217,0.8,0.8,0.8,0.6667,0.6667
8,0.6939,0.7746,0.7368,0.5833,0.6512,0.3849,0.3933
9,0.898,0.9772,0.8947,0.85,0.8718,0.7871,0.7879


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.76,0.8379,0.5789,0.7333,0.6471,0.469,0.4766
1,0.8,0.8829,0.7895,0.7143,0.75,0.584,0.5861
2,0.7,0.7869,0.4211,0.6667,0.5161,0.3144,0.3319
3,0.76,0.8599,0.7368,0.6667,0.7,0.5008,0.5026
4,0.84,0.871,0.6316,0.9231,0.75,0.6383,0.6632
5,0.82,0.8362,0.6316,0.8571,0.7273,0.5975,0.613
6,0.88,0.8744,0.6842,1.0,0.8125,0.7288,0.7571
7,0.92,0.94,0.9,0.9,0.9,0.8333,0.8333
8,0.7347,0.786,0.6842,0.65,0.6667,0.4466,0.447
9,0.9184,0.9614,0.9474,0.8571,0.9,0.8313,0.8343


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.78,0.0,0.5263,0.8333,0.6452,0.4973,0.5248
1,0.76,0.0,0.7368,0.6667,0.7,0.5008,0.5026
2,0.74,0.0,0.4737,0.75,0.5806,0.4059,0.4284
3,0.76,0.0,0.6842,0.6842,0.6842,0.4907,0.4907
4,0.86,0.0,0.6316,1.0,0.7742,0.6801,0.7178
5,0.78,0.0,0.5263,0.8333,0.6452,0.4973,0.5248
6,0.84,0.0,0.5789,1.0,0.7333,0.6303,0.6784
7,0.9,0.0,0.8,0.9412,0.8649,0.7863,0.7929
8,0.7347,0.0,0.6842,0.65,0.6667,0.4466,0.447
9,0.9184,0.0,0.8947,0.8947,0.8947,0.8281,0.8281


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.8294,0.8703,0.7111,0.8273,0.7571,0.6277,0.6383,0.443
gbc,Gradient Boosting Classifier,0.8133,0.8636,0.7005,0.7968,0.737,0.5944,0.6045,0.024
ada,Ada Boost Classifier,0.8052,0.8499,0.7479,0.7557,0.7447,0.5881,0.595,0.017
qda,Quadratic Discriminant Analysis,0.793,0.8524,0.7113,0.7548,0.7245,0.5598,0.5677,0.006
ridge,Ridge Classifier,0.7912,0.8657,0.7221,0.7412,0.7253,0.5577,0.5636,0.005
lda,Linear Discriminant Analysis,0.7912,0.8659,0.7221,0.7412,0.7253,0.5577,0.5636,0.007
lr,Logistic Regression,0.7911,0.8662,0.7221,0.7402,0.7242,0.5572,0.5639,0.008
xgboost,Extreme Gradient Boosting,0.7911,0.8349,0.7005,0.7578,0.72,0.5543,0.5628,0.043
rf,Random Forest Classifier,0.791,0.8433,0.6958,0.757,0.7182,0.5531,0.5604,0.035
lightgbm,Light Gradient Boosting Machine,0.7851,0.8409,0.6847,0.7447,0.7074,0.5385,0.545,0.08


<catboost.core.CatBoostClassifier at 0x16855e50b50>

In [31]:
import mlflow
from mlflow.tracking import MlflowClient

client = MlflowClient()

experiment_name = "titanic_exp_2"

experiment = client.get_experiment_by_name(experiment_name)

runs = client.search_runs(experiment.experiment_id)

best_run = None
best_auc = 0

for run in runs:
    auc = run.data.metrics.get("AUC", None)
    if auc is not None and auc > best_auc:
        best_auc = auc
        best_run = run

if best_run:
    print(f"Best model AUC: {best_auc}, Run ID: {best_run.info.run_id}")
    
    best_model = mlflow.pyfunc.load_model(f"runs:/{best_run.info.run_id}/model")
    
else:
    print("No model with good AUC result")


Best model AUC: 0.8748, Run ID: 367ea09110ef48c784af7f4b24b463b6


In [32]:
predictions = predict_model(best_model, data=test_data)

print(predictions)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,PyFuncModel,0.7933,0.7454,0.5362,0.881,0.6667,0.5294,0.5637


     Pclass  Sex        Age  Parch     Fare  SibSp  Embarked  Survived  \
565       3    0  24.000000      0  24.1500      2         0         0   
160       3    0  44.000000      1  16.1000      0         0         0   
553       3    0  22.000000      0   7.2250      0         1         1   
860       3    0  41.000000      0  14.1083      2         0         0   
241       3    1  29.699118      0  15.5000      1         2         1   
..      ...  ...        ...    ...      ...    ...       ...       ...   
880       2    1  25.000000      1  26.0000      0         0         1   
91        3    0  20.000000      0   7.8542      0         0         0   
883       2    0  28.000000      0  10.5000      0         0         0   
473       2    1  23.000000      0  13.7917      0         1         1   
637       2    0  31.000000      1  26.2500      1         0         0   

     prediction_label  
565                 0  
160                 0  
553                 0  
860            