In [1]:
!pip install pycaret

Collecting pycaret
  Downloading pycaret-3.3.2-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.1/486.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
Collecting joblib<1.4,>=1.2.0 (from pycaret)
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>1.4.0 (from pycaret)
  Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyod>=1.1.3 (from pycaret)
  Downloading pyod-1.1.3.tar.gz (160 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.5/160.5 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting imbalanced-learn>=0.12.0 (from p

In [5]:
import pandas as pd
from pycaret.classification import setup, compare_models, blend_models, stack_models, finalize_model, predict_model, save_model, create_model, tune_model

# Fixed random seed so repeated runs are reproducible
SEED = 42

# Training and test sets are read from CSV files in the working directory
train_file_path, test_file_path = 'train.csv', 'test.csv'
train_data = pd.read_csv(filepath_or_buffer=train_file_path)
test_data = pd.read_csv(filepath_or_buffer=test_file_path)

# Feature engineering placeholder: derive extra columns on train_data here, e.g.
# train_data['New_Feature'] = train_data['Feature1'] * train_data['Feature2']

# Build the modelling frame: every column of the training data except the row ID.
# Dropping only 'ID' leaves 'Target' in place, so it is not copied back in.
data = train_data.drop(columns=['ID'])
if 'Target' not in data.columns:
    # Fail fast with an explicit message when the label column is absent
    raise KeyError("'Target' column is missing from the training data")

# Initialise the PyCaret classification experiment on the prepared frame.
# session_id=SEED fixes the experiment's seed, use_gpu=True asks for GPU training,
# and verbose=False suppresses the setup information grid.
clf_setup = setup(
    data=data,
    target='Target',
    session_id=SEED,
    use_gpu=True,
    verbose=False,
)

# Rank the candidate classifiers and keep the five best-scoring ones
top5_models = compare_models(n_select=5)

# Run hyperparameter tuning on each selected model, preserving their order
tuned_models = list(map(tune_model, top5_models))

# Combine the tuned models into a blended ensemble (5-fold CV).
# NOTE(review): `blended` is not referenced again in the visible code.
blended = blend_models(
    estimator_list=tuned_models,
    fold=5,
)

# Train a gradient boosting classifier ('gbc') to act as the stacking meta model
meta_model = create_model('gbc')

# Stack the tuned models underneath the gradient boosting meta model (5-fold CV)
stacked_model = stack_models(
    estimator_list=tuned_models,
    meta_model=meta_model,
    fold=5,
)

# Finalize the stacked ensemble (retrain on the entire dataset)
final_model = finalize_model(stacked_model)

# Score the test set; the ID column is removed first so it is not passed to the model
test_data_processed = predict_model(
    final_model,
    data=test_data.drop(columns=['ID']),
)

# Attach the predicted labels to the test frame as its 'Target' column
test_data['Target'] = test_data_processed['prediction_label']

# Keep only the ID and Target columns and write them out without the index
submission = test_data.loc[:, ['ID', 'Target']]
submission.to_csv('submission.csv', index=False)

# Persist the finalized model to disk under the name 'final_model'
save_model(final_model, 'final_model')

# Read the submission file back from disk
submission_file_path = 'submission.csv'
submission = pd.read_csv(submission_file_path)

# Round the 'Target' values and cast them to int so the labels are whole numbers
submission = submission.assign(
    Target=lambda frame: frame['Target'].round().astype(int)
)

# Write the integer-labelled copy to its own file, header included, index omitted
submission.to_csv(
    'reformatted_submission.csv',
    index=False,
    header=True,
)

print(submission)


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 2, number of used features: 0
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bi

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
knn,K Neighbors Classifier,0.7066,0.7597,0.7652,0.732,0.7476,0.3977,0.3993,0.07
lightgbm,Light Gradient Boosting Machine,0.6894,0.7725,0.7671,0.7098,0.7367,0.3593,0.3621,0.649
gbc,Gradient Boosting Classifier,0.6781,0.7544,0.8608,0.6689,0.7523,0.3131,0.3348,2.249
xgboost,Extreme Gradient Boosting,0.6749,0.7501,0.7435,0.7017,0.7218,0.3316,0.3327,0.482
rf,Random Forest Classifier,0.6717,0.7171,0.7595,0.6928,0.7242,0.3207,0.3234,0.816
dt,Decision Tree Classifier,0.647,0.6424,0.6771,0.6942,0.6852,0.2835,0.2839,0.122
et,Extra Trees Classifier,0.6453,0.6727,0.7349,0.6719,0.7014,0.2667,0.2691,0.608
ada,Ada Boost Classifier,0.6319,0.6901,0.801,0.6407,0.7116,0.2196,0.231,0.651
ridge,Ridge Classifier,0.59,0.5787,0.803,0.6047,0.6896,0.1194,0.1302,0.075
lr,Logistic Regression,0.5809,0.568,0.8011,0.5979,0.6845,0.0979,0.107,0.722


Processing:   0%|          | 0/69 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6845,0.7648,0.6321,0.7701,0.6943,0.3748,0.3826
1,0.6667,0.7137,0.6571,0.7263,0.69,0.3315,0.3334
2,0.7258,0.7979,0.6571,0.8214,0.7302,0.4584,0.4702
3,0.6882,0.7296,0.6667,0.7527,0.7071,0.3763,0.3795
4,0.6344,0.7463,0.6,0.7079,0.6495,0.2729,0.2769
5,0.7258,0.7987,0.6415,0.8395,0.7273,0.4613,0.4783
6,0.6828,0.7611,0.5849,0.8052,0.6776,0.3805,0.3995
7,0.6667,0.76,0.5849,0.775,0.6667,0.3461,0.3599
8,0.7312,0.8165,0.6321,0.859,0.7283,0.4742,0.4962
9,0.672,0.7156,0.6509,0.7419,0.6935,0.3441,0.3475


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6364,0.708,0.7264,0.6638,0.6937,0.2486,0.2501
1,0.6882,0.752,0.7333,0.7196,0.7264,0.364,0.3641
2,0.6882,0.7605,0.7429,0.7156,0.729,0.3622,0.3625
3,0.6505,0.7259,0.6857,0.6923,0.689,0.2902,0.2903
4,0.6505,0.7118,0.7524,0.6695,0.7085,0.276,0.2789
5,0.7204,0.7842,0.783,0.7411,0.7615,0.4244,0.4254
6,0.672,0.7138,0.7547,0.6957,0.724,0.3217,0.3233
7,0.6774,0.7481,0.783,0.6917,0.7345,0.3275,0.3317
8,0.7097,0.7623,0.783,0.7281,0.7545,0.4004,0.402
9,0.6613,0.7364,0.7736,0.6777,0.7225,0.2928,0.2971


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6898,0.7654,0.8491,0.6818,0.7563,0.3435,0.3595
1,0.6398,0.7306,0.7905,0.6484,0.7124,0.2428,0.2514
2,0.7043,0.7933,0.8762,0.6866,0.7699,0.3729,0.3952
3,0.672,0.741,0.8571,0.6618,0.7469,0.3025,0.3235
4,0.6882,0.7526,0.8571,0.6767,0.7563,0.3397,0.3584
5,0.6989,0.7784,0.8396,0.6953,0.7607,0.3644,0.3763
6,0.6398,0.6907,0.7736,0.656,0.71,0.2432,0.249
7,0.6613,0.751,0.8396,0.6593,0.7386,0.2769,0.2937
8,0.6774,0.7663,0.8019,0.6855,0.7391,0.3233,0.3302
9,0.6828,0.7365,0.8208,0.685,0.7468,0.3314,0.3412


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6257,0.738,0.9434,0.6098,0.7407,0.1674,0.2312
1,0.6344,0.653,0.981,0.6095,0.7518,0.1826,0.2858
2,0.6613,0.7908,0.9429,0.6346,0.7586,0.2577,0.3224
3,0.6452,0.6992,0.9524,0.6211,0.7519,0.2164,0.2897
4,0.6774,0.7537,0.9619,0.6433,0.771,0.292,0.3698
5,0.6828,0.7603,0.9811,0.646,0.779,0.2934,0.3899
6,0.629,0.6441,0.9245,0.6164,0.7396,0.1764,0.2277
7,0.6667,0.713,0.9811,0.6341,0.7704,0.2537,0.3543
8,0.6237,0.722,0.9434,0.6098,0.7407,0.1574,0.2198
9,0.6237,0.6952,0.9434,0.6098,0.7407,0.1574,0.2198


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6631,0.7314,0.8868,0.6483,0.749,0.2726,0.3053
1,0.6398,0.706,0.8667,0.6319,0.7309,0.2247,0.2518
2,0.6774,0.7885,0.9048,0.6552,0.76,0.3047,0.3438
3,0.6613,0.7399,0.8857,0.6458,0.747,0.271,0.3037
4,0.672,0.7383,0.8762,0.6571,0.751,0.2983,0.3259
5,0.7151,0.7774,0.9057,0.6906,0.7837,0.3878,0.4194
6,0.6129,0.6657,0.8302,0.6197,0.7097,0.1643,0.1808
7,0.6505,0.7621,0.8868,0.6395,0.7431,0.2393,0.2728
8,0.7097,0.7612,0.9245,0.6806,0.784,0.3711,0.4139
9,0.629,0.7019,0.9057,0.6194,0.7356,0.1818,0.2234


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6756,0.7626,0.7642,0.6953,0.7281,0.3283,0.3306
1,0.6828,0.773,0.7915,0.6929,0.7389,0.3393,0.3442
2,0.6828,0.7559,0.7773,0.6979,0.7354,0.3423,0.3454
3,0.672,0.7542,0.7441,0.6978,0.7202,0.3251,0.326
4,0.6586,0.7541,0.763,0.6765,0.7171,0.2905,0.2939
Mean,0.6744,0.7599,0.768,0.6921,0.728,0.3251,0.328
Std,0.0089,0.0072,0.0158,0.008,0.0084,0.0185,0.0186


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6738,0.7373,0.8585,0.6642,0.749,0.3043,0.3253
1,0.6559,0.7536,0.8667,0.6454,0.7398,0.2627,0.2887
2,0.6989,0.8046,0.8857,0.6788,0.7686,0.3587,0.3855
3,0.6559,0.7279,0.8476,0.6496,0.7355,0.2671,0.2871
4,0.6774,0.7286,0.8571,0.6667,0.75,0.3149,0.3352
5,0.7204,0.8013,0.8396,0.7177,0.7739,0.4135,0.4223
6,0.6613,0.7218,0.8868,0.6483,0.749,0.2651,0.2977
7,0.672,0.7665,0.8868,0.6573,0.755,0.2908,0.3221
8,0.6935,0.7726,0.8113,0.6992,0.7511,0.3582,0.3649
9,0.672,0.7295,0.8679,0.6619,0.751,0.2954,0.3195


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6729,0.7449,0.7406,0.7009,0.7202,0.3274,0.3281
1,0.7043,0.7699,0.7062,0.7563,0.7304,0.4039,0.405
2,0.6801,0.7596,0.7062,0.7233,0.7146,0.3508,0.351
3,0.7016,0.7608,0.763,0.7252,0.7436,0.3873,0.388
4,0.6613,0.7593,0.7488,0.684,0.7149,0.2998,0.3017
Mean,0.684,0.7589,0.7329,0.7179,0.7248,0.3538,0.3547
Std,0.0166,0.008,0.023,0.0245,0.011,0.0381,0.0379


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

Transformation Pipeline and Model Successfully Saved
       ID  Target
0    2661       1
1    2662       1
2    2663       1
3    2664       1
4    2665       1
..    ...     ...
661  3322       0
662  3323       1
663  3324       1
664  3325       0
665  3326       1

[666 rows x 2 columns]
