# Set up library

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.pipeline import Pipeline as ImbPipeline
from sklearn.preprocessing import MinMaxScaler ,StandardScaler ,OneHotEncoder
from sklearn.feature_selection import SelectKBest,f_classif, chi2,SelectFromModel,RFE
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression 
from imblearn.over_sampling import SMOTE,ADASYN
from sklearn.model_selection import KFold, cross_val_score,train_test_split,GridSearchCV,StratifiedKFold,TunedThresholdClassifierCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score,classification_report
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
from sklearn.utils.class_weight import compute_class_weight



# Train set

In [2]:
data=pd.read_csv("file/train_df.csv")
df=pd.DataFrame(data)
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No,...,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
0,1,0,0,0,66,1,1,61.15,4017.45,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
1,1,0,0,0,65,1,1,84.85,5459.20,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
2,0,0,1,1,72,1,0,20.35,1354.40,1.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
3,0,0,1,1,68,1,1,72.95,4953.25,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
4,0,0,1,1,12,0,0,35.50,432.25,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4220,0,0,0,0,63,1,0,104.50,6590.80,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
4221,0,0,0,0,1,1,0,51.25,51.25,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
4222,1,0,1,0,71,1,0,109.25,7707.70,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0
4223,1,0,1,0,24,1,0,20.40,482.80,1.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0


In [3]:
X = df.drop(columns=['Churn'])
y = df['Churn']

# Validation Set

In [4]:
data=pd.read_csv("file/val_df.csv")
val_df=pd.DataFrame(data)
val_df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No,...,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
0,0,0,0,0,2,0,0,24.30,38.45,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
1,0,0,1,1,70,1,1,106.05,7554.05,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
2,1,0,1,1,7,1,0,69.45,477.05,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
3,0,0,0,0,9,1,1,75.85,724.65,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,1,0,1,0,26,1,0,85.90,2196.45,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404,1,0,0,0,2,1,1,74.90,136.05,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0
1405,1,0,1,1,12,1,1,58.35,740.55,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1406,1,0,0,0,58,1,1,95.30,5817.70,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1407,0,0,1,1,6,1,0,19.55,122.90,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0


In [5]:
X_val = val_df.drop(columns=['Churn'])
y_val = val_df['Churn']

# Test Set

In [6]:
data=pd.read_csv("file/test_df.csv")
test_df=pd.DataFrame(data)
test_df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,MultipleLines_No,...,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn
0,1,0,1,0,31,1,1,64.00,1910.75,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
1,1,1,1,0,54,1,1,101.50,5373.10,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1
2,1,0,1,0,59,1,1,109.15,6557.75,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
3,1,0,0,0,9,1,0,80.55,653.90,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
4,0,0,0,0,49,1,1,66.15,3199.00,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1404,1,1,0,0,56,1,0,104.75,5841.35,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
1405,1,0,0,1,46,1,1,105.20,4822.85,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1
1406,1,0,1,1,16,1,1,89.45,1430.25,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1
1407,0,0,0,0,4,1,1,79.90,324.30,1.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1


In [7]:
X_test = test_df.drop(columns=['Churn'])
y_test = test_df['Churn']

# Model Selection

In [8]:
# üìç ‡∏Å‡∏≥‡∏´‡∏ô‡∏î‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏´‡∏•‡∏≤‡∏¢‡∏ï‡∏±‡∏ß‡∏ó‡∏µ‡πà‡∏ï‡πâ‡∏≠‡∏á‡∏Å‡∏≤‡∏£‡∏ó‡∏î‡∏™‡∏≠‡∏ö
models = {
    'LogisticRegression' : LogisticRegression(),
    'Support Vector Machine' : svm.SVC(),
    'KNN' : KNeighborsClassifier(),
    'LinearDiscriminantAnalysis' : LinearDiscriminantAnalysis(),
    'QuadraticDiscriminantAnalysis' : QuadraticDiscriminantAnalysis(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XGBoost': XGBClassifier(),
    'Naive Bayes': GaussianNB(),
    'Gradient Boosting': GradientBoostingClassifier()
}


In [None]:
X = X[['Contract_Month-to-month','tenure','TotalCharges','MonthlyCharges','OnlineSecurity_No','TechSupport_No']]
y = y

# ‡∏™‡∏°‡∏°‡∏ï‡∏¥ X_train, y_train ‡∏Ñ‡∏∑‡∏≠‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ó‡∏µ‡πà‡πÄ‡∏£‡∏≤‡πÅ‡∏ö‡πà‡∏á‡∏°‡∏≤‡πÅ‡∏•‡πâ‡∏ß (‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà‡∏ï‡πâ‡∏≠‡∏á SMOTE ‡∏Ç‡πâ‡∏≤‡∏á‡∏ô‡∏≠‡∏Å!)
# X_train, X_test, y_train, y_test = train_test_split(X, y, ...) 

def evaluate_models_correctly(models, X, y, cv=5):
    results = []
    
    # ‡πÉ‡∏ä‡πâ StratifiedKFold ‡∏î‡∏µ‡∏Å‡∏ß‡πà‡∏≤ KFold ‡∏ò‡∏£‡∏£‡∏°‡∏î‡∏≤‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏• Imbalance
    kf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    for name, model in models.items():
        # ‡∏™‡∏£‡πâ‡∏≤‡∏á Pipeline: 
        # 1. SMOTE (‡∏ó‡∏≥‡πÄ‡∏â‡∏û‡∏≤‡∏∞‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏• Train ‡πÉ‡∏ô‡πÅ‡∏ï‡πà‡∏•‡∏∞ Fold)
        # 2. Model (‡πÄ‡∏£‡∏µ‡∏¢‡∏ô‡∏£‡∏π‡πâ‡∏à‡∏≤‡∏Å‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏ó‡∏µ‡πà SMOTE ‡πÅ‡∏•‡πâ‡∏ß)
        model_pipeline = ImbPipeline([
            ('smote', SMOTE(random_state=42)),
            ('classifier', model)
        ])

        # ‡∏™‡πà‡∏á X, y (‡∏ï‡∏±‡∏ß‡∏î‡∏¥‡∏ö‡πÜ ‡∏ó‡∏µ‡πà‡∏¢‡∏±‡∏á‡πÑ‡∏°‡πà SMOTE) ‡πÄ‡∏Ç‡πâ‡∏≤‡πÑ‡∏õ
        # Pipeline ‡∏à‡∏∞‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡πÄ‡∏£‡∏∑‡πà‡∏≠‡∏á SMOTE ‡πÉ‡∏´‡πâ‡πÄ‡∏≠‡∏á‡∏≠‡∏¢‡πà‡∏≤‡∏á‡∏ñ‡∏π‡∏Å‡∏ï‡πâ‡∏≠‡∏á‡πÉ‡∏ô‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏£‡∏≠‡∏ö
        acc_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
        prec_scores = cross_val_score(model, X, y, cv=kf, scoring=make_scorer(precision_score, average='weighted', zero_division=0))
        rec_scores = cross_val_score(model, X, y, cv=kf, scoring=make_scorer(recall_score, average='weighted', zero_division=0))
        f1_scores = cross_val_score(model, X, y, cv=kf, scoring=make_scorer(f1_score, average='weighted', zero_division=0))
        # ... (Metrics ‡∏≠‡∏∑‡πà‡∏ô‡πÜ)

        results.append({
            'Algorithm': name,
            'Accuracy Score': np.mean(acc_scores),
            'Precision Score': np.mean(prec_scores),
            'Recall Score': np.mean(rec_scores),
            'F1 Score': np.mean(f1_scores)
        })

    return pd.DataFrame(results)

# --- ‡∏ß‡∏¥‡∏ò‡∏µ‡πÄ‡∏£‡∏µ‡∏¢‡∏Å‡πÉ‡∏ä‡πâ ---
# ‡∏™‡πà‡∏á X_train ‡πÅ‡∏•‡∏∞ y_train (‡∏Ç‡πâ‡∏≠‡∏°‡∏π‡∏•‡∏î‡∏¥‡∏ö) ‡πÄ‡∏Ç‡πâ‡∏≤‡πÑ‡∏õ
# ‡∏´‡πâ‡∏≤‡∏°‡∏™‡πà‡∏á X_test ‡∏´‡∏£‡∏∑‡∏≠ X_resampled
results_df = evaluate_models_correctly(models, X, y, cv=10)
results_df

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,Algorithm,Accuracy Score,Precision Score,Recall Score,F1 Score
0,LogisticRegression,0.795263,0.788512,0.795263,0.790512
1,Support Vector Machine,0.734674,0.539747,0.734674,0.622303
2,KNN,0.759039,0.744943,0.759039,0.748867
3,LinearDiscriminantAnalysis,0.78958,0.783321,0.78958,0.785467
4,QuadraticDiscriminantAnalysis,0.752416,0.78517,0.752416,0.762289
5,Decision Tree,0.730422,0.727879,0.73539,0.726268
6,Random Forest,0.773725,0.75845,0.769226,0.76386
7,XGBoost,0.780115,0.770278,0.780115,0.772989
8,Naive Bayes,0.750531,0.791927,0.750531,0.761868
9,Gradient Boosting,0.793607,0.782794,0.793371,0.784149


In [9]:
X = X[['Contract_Month-to-month','tenure','TotalCharges','MonthlyCharges','OnlineSecurity_No','TechSupport_No']]
y = y

X_val = X_val[['Contract_Month-to-month','tenure','TotalCharges','MonthlyCharges','OnlineSecurity_No','TechSupport_No']]
y_val = y_val

# Smote

In [10]:
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X, y)

# ADASYN

In [11]:
adasyn = ADASYN(random_state=42)
X_ada, y_ada = adasyn.fit_resample(X, y)

# Logistic Regression

In [41]:
# 1. ‡∏™‡∏£‡πâ‡∏≤‡∏á Pipeline ‡∏ó‡∏µ‡πà‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏° Feature Selector ‡∏Å‡∏±‡∏ö Model ‡πÄ‡∏Ç‡πâ‡∏≤‡∏î‡πâ‡∏ß‡∏¢‡∏Å‡∏±‡∏ô
# ‡∏™‡∏±‡∏á‡πÄ‡∏Å‡∏ï‡∏ß‡πà‡∏≤‡πÄ‡∏£‡∏≤‡∏ï‡∏±‡πâ‡∏á‡∏ä‡∏∑‡πà‡∏≠‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ß‡πà‡∏≤ 'classifier'
pipe = Pipeline([('regressor', LogisticRegression(random_state=42))])

# 2. ‡∏Å‡∏≥‡∏´‡∏ô‡∏î Search Space
# ‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡πÄ‡∏Ç‡πâ‡∏≤‡∏ñ‡∏∂‡∏á hyperparameter ‡∏Ç‡∏≠‡∏á‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô‡πÉ‡∏ô pipeline ‡∏î‡πâ‡∏ß‡∏¢  "‡∏ä‡∏∑‡πà‡∏≠‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô__‡∏ä‡∏∑‡πà‡∏≠‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå"
param_grid =  {'regressor__penalty':['l1','l2','elasticnet','none'],
    'regressor__C' : np.logspace(-4,4,20),
    'regressor__solver': ['lbfgs','newton-cg','liblinear','sag','saga'],
    'regressor__max_iter' : [100,1000,2500,5000]
}

# 3. ‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÅ‡∏•‡∏∞‡∏£‡∏±‡∏ô GridSearchCV
# Grid Search ‡∏à‡∏∞‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î 5 * 2 * 3 = 30 ‡∏ä‡∏∏‡∏î‡∏Ñ‡πà‡∏≤‡∏ú‡∏™‡∏°
# Inner CV for hyperparameter tuning
search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1')

# Outer CV for model evaluation
outer_cv = KFold(n_splits=5)
# Nested CV
nested_score = cross_val_score(search, X, y, cv=outer_cv)
search.fit(X, y)

# 4. ‡πÅ‡∏™‡∏î‡∏á‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î
print("Best parameters found:")
print(search.best_params_)
# ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏ó‡∏µ‡πà‡πÑ‡∏î‡πâ‡∏≠‡∏≤‡∏à‡∏à‡∏∞‡πÄ‡∏õ‡πá‡∏ô:
# {'selector__k': 15, 'classifier__n_estimators': 200, 'classifier__max_depth': 10}
# ‡∏ã‡∏∂‡πà‡∏á‡∏´‡∏°‡∏≤‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ß‡πà‡∏≤ ‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ó‡∏≥‡∏á‡∏≤‡∏ô‡πÑ‡∏î‡πâ‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î‡πÄ‡∏°‡∏∑‡πà‡∏≠‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÉ‡∏ä‡πâ 15 ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå ‡πÅ‡∏•‡∏∞‡∏ï‡∏±‡πâ‡∏á‡∏Ñ‡πà‡∏≤ n_estimators=200, max_depth=10

print(f"\nBest cross-validation accuracy: {search.best_score_:.4f}")
#print("Nested CV Score: ", nested_score.mean())

Fitting 5 folds for each of 1600 candidates, totalling 8000 fits


5200 fits failed out of a total of 8000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
400 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppDa

Fitting 5 folds for each of 1600 candidates, totalling 8000 fits


5200 fits failed out of a total of 8000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
400 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppDa

Fitting 5 folds for each of 1600 candidates, totalling 8000 fits


5200 fits failed out of a total of 8000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
400 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppDa

Fitting 5 folds for each of 1600 candidates, totalling 8000 fits


5200 fits failed out of a total of 8000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
400 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppDa

Fitting 5 folds for each of 1600 candidates, totalling 8000 fits


5200 fits failed out of a total of 8000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
400 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppDa

Fitting 5 folds for each of 1600 candidates, totalling 8000 fits
Best parameters found:
{'regressor__C': np.float64(1.623776739188721), 'regressor__max_iter': 100, 'regressor__penalty': 'l2', 'regressor__solver': 'newton-cg'}

Best cross-validation accuracy: 0.5925


5200 fits failed out of a total of 8000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
400 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\sklearn\base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Admin\AppDa

In [14]:
lr_model = LogisticRegression(C=1.623776739188721,max_iter=100, penalty='l2', solver='newton-cg')
lr_model.fit(X, y)
y_pred = lr_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1035
           1       0.61      0.55      0.58       374

    accuracy                           0.79      1409
   macro avg       0.73      0.71      0.72      1409
weighted avg       0.78      0.79      0.78      1409



In [None]:
lr_model = LogisticRegression(C=1.623776739188721,max_iter=100, penalty='l2', solver='newton-cg')
lr_model.fit(X_train_resampled, y_train_resampled)
y_pred = lr_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.71      0.80      1035
           1       0.50      0.80      0.62       374

    accuracy                           0.74      1409
   macro avg       0.71      0.76      0.71      1409
weighted avg       0.80      0.74      0.75      1409



In [15]:
lr_model = LogisticRegression(class_weight='balanced',C=1.623776739188721,max_iter=100, penalty='l2', solver='newton-cg')
lr_model.fit(X, y)
y_pred = lr_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.71      0.80      1035
           1       0.50      0.81      0.62       374

    accuracy                           0.73      1409
   macro avg       0.70      0.76      0.71      1409
weighted avg       0.80      0.73      0.75      1409



In [19]:
lr_model = LogisticRegression(class_weight='balanced',C=1.623776739188721,max_iter=100, penalty='l2', solver='newton-cg')
lr_model.fit(X_train_resampled, y_train_resampled)
y_pred = lr_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.71      0.80      1035
           1       0.50      0.80      0.62       374

    accuracy                           0.74      1409
   macro avg       0.71      0.76      0.71      1409
weighted avg       0.80      0.74      0.75      1409



In [20]:
lr_model = LogisticRegression(class_weight='balanced',C=1.623776739188721,max_iter=100, penalty='l2', solver='newton-cg')
lr_model.fit(X_ada, y_ada)
y_pred = lr_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.69      0.79      1035
           1       0.49      0.82      0.61       374

    accuracy                           0.73      1409
   macro avg       0.70      0.76      0.70      1409
weighted avg       0.80      0.73      0.74      1409



In [18]:
lr_model = LogisticRegression(C=1.623776739188721,max_iter=100, penalty='l2', solver='newton-cg')
lr_model.fit(X_ada, y_ada)
y_pred = lr_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.69      0.79      1035
           1       0.49      0.82      0.61       374

    accuracy                           0.72      1409
   macro avg       0.70      0.76      0.70      1409
weighted avg       0.80      0.72      0.74      1409



# Random forrest

In [43]:
# 1. ‡∏™‡∏£‡πâ‡∏≤‡∏á Pipeline ‡∏ó‡∏µ‡πà‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏° Feature Selector ‡∏Å‡∏±‡∏ö Model ‡πÄ‡∏Ç‡πâ‡∏≤‡∏î‡πâ‡∏ß‡∏¢‡∏Å‡∏±‡∏ô
# ‡∏™‡∏±‡∏á‡πÄ‡∏Å‡∏ï‡∏ß‡πà‡∏≤‡πÄ‡∏£‡∏≤‡∏ï‡∏±‡πâ‡∏á‡∏ä‡∏∑‡πà‡∏≠‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ß‡πà‡∏≤ 'classifier'
pipe = Pipeline([('classifier', RandomForestClassifier(random_state=42))])

# 2. ‡∏Å‡∏≥‡∏´‡∏ô‡∏î Search Space
# ‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡πÄ‡∏Ç‡πâ‡∏≤‡∏ñ‡∏∂‡∏á hyperparameter ‡∏Ç‡∏≠‡∏á‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô‡πÉ‡∏ô pipeline ‡∏î‡πâ‡∏ß‡∏¢  "‡∏ä‡∏∑‡πà‡∏≠‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô__‡∏ä‡∏∑‡πà‡∏≠‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå"
param_grid = {
    # ‡∏à‡∏π‡∏ô 'n_estimators' ‡πÅ‡∏•‡∏∞ 'max_depth' ‡∏Ç‡∏≠‡∏á RandomForestClassifier (classifier)
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20, None],
    'classifier__min_samples_leaf' : [1, 2, 5, 10],
    'classifier__min_samples_split' : [2, 5, 10, 15, 100],
    'classifier__criterion' : ['gini', 'entropy']
}

# 3. ‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÅ‡∏•‡∏∞‡∏£‡∏±‡∏ô GridSearchCV
# Grid Search ‡∏à‡∏∞‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î 5 * 2 * 3 = 30 ‡∏ä‡∏∏‡∏î‡∏Ñ‡πà‡∏≤‡∏ú‡∏™‡∏°
# Inner CV for hyperparameter tuning
search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1')

# Outer CV for model evaluation
outer_cv = KFold(n_splits=5)
# Nested CV
nested_score = cross_val_score(search, X, y, cv=outer_cv)
search.fit(X, y)

# 4. ‡πÅ‡∏™‡∏î‡∏á‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î
print("Best parameters found:")
print(search.best_params_)
# ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏ó‡∏µ‡πà‡πÑ‡∏î‡πâ‡∏≠‡∏≤‡∏à‡∏à‡∏∞‡πÄ‡∏õ‡πá‡∏ô:
# {'selector__k': 15, 'classifier__n_estimators': 200, 'classifier__max_depth': 10}
# ‡∏ã‡∏∂‡πà‡∏á‡∏´‡∏°‡∏≤‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ß‡πà‡∏≤ ‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ó‡∏≥‡∏á‡∏≤‡∏ô‡πÑ‡∏î‡πâ‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î‡πÄ‡∏°‡∏∑‡πà‡∏≠‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÉ‡∏ä‡πâ 15 ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå ‡πÅ‡∏•‡∏∞‡∏ï‡∏±‡πâ‡∏á‡∏Ñ‡πà‡∏≤ n_estimators=200, max_depth=10

print(f"\nBest cross-validation accuracy: {search.best_score_:.4f}")
#print("Nested CV Score: ", nested_score.mean())

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Fitting 5 folds for each of 240 candidates, totalling 1200 fits
Best parameters found:
{'classifier__criterion': 'entropy', 'classifier__max_depth': 20, 'classifier__min_samples_leaf': 5, 'classifier__min_samples_split': 15, 'classifier__n_estimators': 200}

Best cross-validation accuracy: 0.5703


In [21]:
rf_model = RandomForestClassifier(n_estimators=200,max_depth=20,criterion='entropy',min_samples_leaf=5,min_samples_split=15, random_state=42)
rf_model.fit(X, y)
y_pred = rf_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1035
           1       0.65      0.49      0.56       374

    accuracy                           0.80      1409
   macro avg       0.74      0.70      0.71      1409
weighted avg       0.78      0.80      0.79      1409



In [22]:
rf_model = RandomForestClassifier(n_estimators=200,max_depth=20,criterion='entropy',min_samples_leaf=5,min_samples_split=15, random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)
y_pred = rf_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.84      0.85      1035
           1       0.59      0.63      0.61       374

    accuracy                           0.78      1409
   macro avg       0.72      0.73      0.73      1409
weighted avg       0.79      0.78      0.79      1409



In [26]:
rf_model = RandomForestClassifier(n_estimators=200,max_depth=20,criterion='entropy',min_samples_leaf=5,min_samples_split=15, random_state=42)
rf_model.fit(X_ada, y_ada)
y_pred = rf_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.82      0.84      1035
           1       0.57      0.66      0.61       374

    accuracy                           0.78      1409
   macro avg       0.72      0.74      0.73      1409
weighted avg       0.79      0.78      0.78      1409



# Xgboost

In [28]:
# 1. ‡∏™‡∏£‡πâ‡∏≤‡∏á Pipeline ‡∏ó‡∏µ‡πà‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏° Feature Selector ‡∏Å‡∏±‡∏ö Model ‡πÄ‡∏Ç‡πâ‡∏≤‡∏î‡πâ‡∏ß‡∏¢‡∏Å‡∏±‡∏ô
# ‡∏™‡∏±‡∏á‡πÄ‡∏Å‡∏ï‡∏ß‡πà‡∏≤‡πÄ‡∏£‡∏≤‡∏ï‡∏±‡πâ‡∏á‡∏ä‡∏∑‡πà‡∏≠‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ß‡πà‡∏≤ 'classifier'
pipe = Pipeline([('classifier', XGBClassifier(random_state=42))])

# 2. ‡∏Å‡∏≥‡∏´‡∏ô‡∏î Search Space
# ‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡πÄ‡∏Ç‡πâ‡∏≤‡∏ñ‡∏∂‡∏á hyperparameter ‡∏Ç‡∏≠‡∏á‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô‡πÉ‡∏ô pipeline ‡∏î‡πâ‡∏ß‡∏¢  "‡∏ä‡∏∑‡πà‡∏≠‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô__‡∏ä‡∏∑‡πà‡∏≠‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå"
param_grid = {
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.1, 0.01, 0.001],
    'classifier__subsample': [0.5, 0.7, 1],
    'classifier__n_estimators': [50, 100, 200],
    'classifier__colsample_bytree': [0.3, 0.8, 1],
    'classifier__num_round': [50, 100, 200],
    'classifier__min_child_weight': [1, 5, 10]
}

# 3. ‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÅ‡∏•‡∏∞‡∏£‡∏±‡∏ô GridSearchCV
# Grid Search ‡∏à‡∏∞‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î 5 * 2 * 3 = 30 ‡∏ä‡∏∏‡∏î‡∏Ñ‡πà‡∏≤‡∏ú‡∏™‡∏°
# Inner CV for hyperparameter tuning
search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1')

# Outer CV for model evaluation
outer_cv = KFold(n_splits=5)
# Nested CV
nested_score = cross_val_score(search, X, y, cv=outer_cv)
search.fit(X, y)

# 4. ‡πÅ‡∏™‡∏î‡∏á‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î
print("Best parameters found:")
print(search.best_params_)
# ‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏ó‡∏µ‡πà‡πÑ‡∏î‡πâ‡∏≠‡∏≤‡∏à‡∏à‡∏∞‡πÄ‡∏õ‡πá‡∏ô:
# {'selector__k': 15, 'classifier__n_estimators': 200, 'classifier__max_depth': 10}
# ‡∏ã‡∏∂‡πà‡∏á‡∏´‡∏°‡∏≤‡∏¢‡∏Ñ‡∏ß‡∏≤‡∏°‡∏ß‡πà‡∏≤ ‡πÇ‡∏°‡πÄ‡∏î‡∏•‡∏ó‡∏≥‡∏á‡∏≤‡∏ô‡πÑ‡∏î‡πâ‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î‡πÄ‡∏°‡∏∑‡πà‡∏≠‡πÄ‡∏•‡∏∑‡∏≠‡∏Å‡πÉ‡∏ä‡πâ 15 ‡∏ü‡∏µ‡πÄ‡∏à‡∏≠‡∏£‡πå ‡πÅ‡∏•‡∏∞‡∏ï‡∏±‡πâ‡∏á‡∏Ñ‡πà‡∏≤ n_estimators=200, max_depth=10

print(f"\nBest cross-validation accuracy: {search.best_score_:.4f}")
#print("Nested CV Score: ", nested_score.mean())

Fitting 5 folds for each of 2187 candidates, totalling 10935 fits


Parameters: { "num_round" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fitting 5 folds for each of 2187 candidates, totalling 10935 fits


Parameters: { "num_round" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fitting 5 folds for each of 2187 candidates, totalling 10935 fits


Parameters: { "num_round" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fitting 5 folds for each of 2187 candidates, totalling 10935 fits


Parameters: { "num_round" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fitting 5 folds for each of 2187 candidates, totalling 10935 fits


Parameters: { "num_round" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fitting 5 folds for each of 2187 candidates, totalling 10935 fits
Best parameters found:
{'classifier__colsample_bytree': 1, 'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__min_child_weight': 1, 'classifier__n_estimators': 50, 'classifier__num_round': 50, 'classifier__subsample': 0.5}

Best cross-validation accuracy: 0.5774


Parameters: { "num_round" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [29]:
xg_model = XGBClassifier(colsample_bytree=1,learning_rate = 0.1, max_depth = 3,min_child_weight = 1,n_estimators = 50,num_round = 50,subsample = 0.5,random_state=42)
xg_model.fit(X, y)
y_pred = xg_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.91      0.87      1035
           1       0.66      0.49      0.56       374

    accuracy                           0.80      1409
   macro avg       0.74      0.70      0.71      1409
weighted avg       0.78      0.80      0.79      1409



Parameters: { "num_round" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [50]:
xg_model = XGBClassifier(colsample_bytree=1,learning_rate = 0.1, max_depth = 3,min_child_weight = 1,n_estimators = 50,num_round = 50,subsample = 0.5,random_state=42)
xg_model.fit(X_train_resampled, y_train_resampled)
y_pred = xg_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.79      0.84      1035
           1       0.56      0.74      0.64       374

    accuracy                           0.78      1409
   macro avg       0.73      0.77      0.74      1409
weighted avg       0.81      0.78      0.79      1409



Parameters: { "num_round" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [27]:
xg_model = XGBClassifier(colsample_bytree=1,learning_rate = 0.1, max_depth = 3,min_child_weight = 1,n_estimators = 50,num_round = 50,subsample = 0.5,random_state=42)
xg_model.fit(X_ada, y_ada)
y_pred = xg_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.77      0.83      1035
           1       0.55      0.76      0.64       374

    accuracy                           0.77      1409
   macro avg       0.72      0.77      0.74      1409
weighted avg       0.81      0.77      0.78      1409



Parameters: { "num_round" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


# Gradient Boosting

In [24]:
# 1. ‡∏™‡∏£‡πâ‡∏≤‡∏á Pipeline ‡∏ó‡∏µ‡πà‡πÄ‡∏ä‡∏∑‡πà‡∏≠‡∏° Feature Selector ‡∏Å‡∏±‡∏ö Model ‡πÄ‡∏Ç‡πâ‡∏≤‡∏î‡πâ‡∏ß‡∏¢‡∏Å‡∏±‡∏ô
# ‡∏™‡∏±‡∏á‡πÄ‡∏Å‡∏ï‡∏ß‡πà‡∏≤‡πÄ‡∏£‡∏≤‡∏ï‡∏±‡πâ‡∏á‡∏ä‡∏∑‡πà‡∏≠‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô‡∏ß‡πà‡∏≤ 'classifier'
pipe = Pipeline([('classifier', GradientBoostingClassifier(random_state=42))])

# 2. ‡∏Å‡∏≥‡∏´‡∏ô‡∏î Search Space
# ‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡πÄ‡∏Ç‡πâ‡∏≤‡∏ñ‡∏∂‡∏á hyperparameter ‡∏Ç‡∏≠‡∏á‡πÅ‡∏ï‡πà‡∏•‡∏∞‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô‡πÉ‡∏ô pipeline ‡∏î‡πâ‡∏ß‡∏¢  "‡∏ä‡∏∑‡πà‡∏≠‡∏Ç‡∏±‡πâ‡∏ô‡∏ï‡∏≠‡∏ô__‡∏ä‡∏∑‡πà‡∏≠‡∏û‡∏≤‡∏£‡∏≤‡∏°‡∏¥‡πÄ‡∏ï‡∏≠‡∏£‡πå"
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__learning_rate': [0.01, 0.1, 0.2],
    'classifier__max_depth': [3, 5, 7],
    'classifier__subsample': [0.8, 0.9, 1.0],
    'classifier__min_samples_split': [2, 3, 4],
    'classifier__min_samples_leaf' : [1, 2, 5, 10]
}

# 3. ‡∏™‡∏£‡πâ‡∏≤‡∏á‡πÅ‡∏•‡∏∞‡∏£‡∏±‡∏ô GridSearchCV
# Grid Search ‡∏à‡∏∞‡∏ó‡∏î‡∏•‡∏≠‡∏á‡∏ó‡∏±‡πâ‡∏á‡∏´‡∏°‡∏î 5 * 2 * 3 = 30 ‡∏ä‡∏∏‡∏î‡∏Ñ‡πà‡∏≤‡∏ú‡∏™‡∏°
# Inner CV for hyperparameter tuning
search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, verbose=2, scoring='f1')

# Outer CV for model evaluation
outer_cv = KFold(n_splits=5)
# Nested CV
nested_score = cross_val_score(search, X, y, cv=outer_cv)
search.fit(X, y)

# 4. ‡πÅ‡∏™‡∏î‡∏á‡∏ú‡∏•‡∏•‡∏±‡∏û‡∏ò‡πå‡∏ó‡∏µ‡πà‡∏î‡∏µ‡∏ó‡∏µ‡πà‡∏™‡∏∏‡∏î
print("Best parameters found:")
print(search.best_params_)

print(f"\nBest cross-validation accuracy: {search.best_score_:.4f}")
#print("Nested CV Score: ", nested_score.mean())

Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Fitting 5 folds for each of 972 candidates, totalling 4860 fits
Best parameters found:
{'classifier__learning_rate': 0.2, 'classifier__max_depth': 3, 'classifier__min_samples_leaf': 10, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50, 'classifier__subsample': 0.8}

Best cross-validation accuracy: 0.5750


In [26]:
gb_model = GradientBoostingClassifier(n_estimators=50, learning_rate=0.2,max_depth=3,min_samples_leaf = 10,min_samples_split=2, subsample=0.8,random_state=42)
gb_model.fit(X, y)
y_pred = gb_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1035
           1       0.67      0.51      0.58       374

    accuracy                           0.80      1409
   macro avg       0.75      0.71      0.73      1409
weighted avg       0.79      0.80      0.80      1409



In [27]:
gb_model = GradientBoostingClassifier(n_estimators=50, learning_rate=0.2,max_depth=3,min_samples_leaf = 10,min_samples_split=2, subsample=0.8,random_state=42)
gb_model.fit(X_train_resampled, y_train_resampled)
y_pred = gb_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.81      0.85      1035
           1       0.58      0.70      0.63       374

    accuracy                           0.78      1409
   macro avg       0.73      0.76      0.74      1409
weighted avg       0.80      0.78      0.79      1409



In [28]:
gb_model = GradientBoostingClassifier(n_estimators=50, learning_rate=0.2,max_depth=3,min_samples_leaf = 10,min_samples_split=2, subsample=0.8,random_state=42)
gb_model.fit(X_ada, y_ada)
y_pred = gb_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.80      0.84      1035
           1       0.56      0.72      0.63       374

    accuracy                           0.78      1409
   macro avg       0.73      0.76      0.74      1409
weighted avg       0.80      0.78      0.78      1409

