In [95]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, LogisticRegression, ElasticNet, LinearRegression
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, KFold
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss, roc_auc_score, r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.decomposition import PCA
from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, VotingClassifier, VotingRegressor, BaggingClassifier, BaggingRegressor, AdaBoostClassifier, AdaBoostRegressor, GradientBoostingClassifier, GradientBoostingRegressor 
import warnings
warnings.filterwarnings('ignore')

# AdaBoosting

In [18]:
df = pd.read_csv("Breastcancer.csv")
df.head(3)


Unnamed: 0,Code,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
0,61634,5,4,3,1,2,2,2,3,1,Benign
1,63375,9,1,2,6,4,10,7,7,2,Malignant
2,76389,10,4,7,2,2,8,6,1,1,Malignant


In [22]:
x, y = df.drop("Class", axis = 1), df["Class"]


In [26]:
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    random_state = 24, 
                                                    test_size= 0.3,
                                                    stratify= y)


In [42]:
ada = AdaBoostClassifier(
    n_estimators= 155,
    random_state= 24,
)

ada.fit(x_train, y_train)
y_pred= ada.predict(x_test)
print(accuracy_score(y_test, y_pred))

0.9809523809523809


In [82]:


# ______________________________________________________________________________________
# Pipeline

# one hot encoder
ohe = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
    drop='first'
).set_output(transform='pandas')

trans_ohe = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=['category', object])),
    (ohe, make_column_selector(dtype_include=['category', object])),
    verbose_feature_names_out=False
).set_output(transform='pandas')


# scaler
scl_std = StandardScaler().set_output(transform = "pandas")
scl_mm = MinMaxScaler().set_output(transform = "pandas")


# Model

dtc = DecisionTreeClassifier(
    random_state = 24
)

adac = AdaBoostClassifier(
    estimator = dtc,
    random_state = 24
)

pipe = Pipeline([("ADAC", adac)])


# _____________________________________________________________________________________
# GCV

params = {
    "ADAC__n_estimators" : [10, 50, 100, 150, 155],
    "ADAC__estimator__max_depth" : [1, 2, 3]
}

kfolds = StratifiedKFold(n_splits = 5,
                        random_state = 24,
                        shuffle = True)

kfold = KFold(n_splits = 5,
            random_state = 24,
            shuffle = True)

gcv = GridSearchCV(pipe,
                  param_grid = params,
                  scoring = "neg_log_loss",
                  cv = kfolds,
                  verbose = 3)


# gcv.fit(x, y)

In [80]:
pipe.get_params()

{'memory': None,
 'steps': [('ADAC',
   AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=24),
                      random_state=24))],
 'verbose': False,
 'ADAC': AdaBoostClassifier(estimator=DecisionTreeClassifier(random_state=24),
                    random_state=24),
 'ADAC__algorithm': 'SAMME.R',
 'ADAC__estimator__ccp_alpha': 0.0,
 'ADAC__estimator__class_weight': None,
 'ADAC__estimator__criterion': 'gini',
 'ADAC__estimator__max_depth': None,
 'ADAC__estimator__max_features': None,
 'ADAC__estimator__max_leaf_nodes': None,
 'ADAC__estimator__min_impurity_decrease': 0.0,
 'ADAC__estimator__min_samples_leaf': 1,
 'ADAC__estimator__min_samples_split': 2,
 'ADAC__estimator__min_weight_fraction_leaf': 0.0,
 'ADAC__estimator__monotonic_cst': None,
 'ADAC__estimator__random_state': 24,
 'ADAC__estimator__splitter': 'best',
 'ADAC__estimator': DecisionTreeClassifier(random_state=24),
 'ADAC__learning_rate': 1.0,
 'ADAC__n_estimators': 50,
 'ADAC__random_state': 24}

In [84]:
gcv.fit(x, y)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
[CV 1/5] END ADAC__estimator__max_depth=1, ADAC__n_estimators=10;, score=-0.408 total time=   0.0s
[CV 2/5] END ADAC__estimator__max_depth=1, ADAC__n_estimators=10;, score=-0.345 total time=   0.0s
[CV 3/5] END ADAC__estimator__max_depth=1, ADAC__n_estimators=10;, score=-0.435 total time=   0.0s
[CV 4/5] END ADAC__estimator__max_depth=1, ADAC__n_estimators=10;, score=-0.374 total time=   0.0s
[CV 5/5] END ADAC__estimator__max_depth=1, ADAC__n_estimators=10;, score=-0.386 total time=   0.0s
[CV 1/5] END ADAC__estimator__max_depth=1, ADAC__n_estimators=50;, score=-0.487 total time=   0.1s
[CV 2/5] END ADAC__estimator__max_depth=1, ADAC__n_estimators=50;, score=-0.445 total time=   0.1s
[CV 3/5] END ADAC__estimator__max_depth=1, ADAC__n_estimators=50;, score=-0.515 total time=   0.0s
[CV 4/5] END ADAC__estimator__max_depth=1, ADAC__n_estimators=50;, score=-0.494 total time=   0.0s
[CV 5/5] END ADAC__estimator__max_depth=1, ADAC_

In [86]:
print(gcv.best_score_)
print(gcv.best_params_)

-0.11063208430580765
{'ADAC__estimator__max_depth': 3, 'ADAC__n_estimators': 155}


In [90]:
y_pred_prob = gcv.predict_proba(x_test)[:, 1] 
print(log_loss(y_test, y_pred_prob))

0.003665194502550277


# Gradient Boosting

In [99]:
df = pd.read_csv("Breastcancer.csv")
df.head(3)

Unnamed: 0,Code,Clump,UniCell_Size,Uni_CellShape,MargAdh,SEpith,BareN,BChromatin,NoemN,Mitoses,Class
0,61634,5,4,3,1,2,2,2,3,1,Benign
1,63375,9,1,2,6,4,10,7,7,2,Malignant
2,76389,10,4,7,2,2,8,6,1,1,Malignant


In [101]:
x, y = df.drop("Class", axis = 1), df["Class"]


In [105]:
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    random_state = 24,
                                                    test_size = 0.3,
                                                    stratify = y)

In [187]:


# ______________________________________________________________________________________
# Pipeline

# one hot encoder
ohe = OneHotEncoder(
    handle_unknown="ignore",
    sparse_output=False,
    drop='first'
).set_output(transform='pandas')

trans_ohe = make_column_transformer(
    ('passthrough', make_column_selector(dtype_exclude=['category', object])),
    (ohe, make_column_selector(dtype_include=['category', object])),
    verbose_feature_names_out=False
).set_output(transform='pandas')


# scaler
scl_std = StandardScaler().set_output(transform = "pandas")
scl_mm = MinMaxScaler().set_output(transform = "pandas")


# Model

dtc = DecisionTreeClassifier(
    random_state = 24
)


gbc = GradientBoostingClassifier(
    # estimator = dtc, 
    random_state = 24
)

adac = AdaBoostClassifier(
    estimator = dtc,
    random_state = 24
)

pipe = Pipeline([("GBC", gbc)])


# _____________________________________________________________________________________
# GCV

params = {
    "GBC__n_estimators" : list(range(30, 41)),
    "GBC__max_depth" : [2, 3, 4],
    "GBC__learning_rate" : np.linspace(0.001, 1, 5),
}

kfolds = StratifiedKFold(n_splits = 5,
                        random_state = 24,
                        shuffle = True)

kfold = KFold(n_splits = 5,
            random_state = 24,
            shuffle = True)

gcv = GridSearchCV(pipe,
                  param_grid = params,
                  scoring = "neg_log_loss",
                  cv = kfolds,
                  verbose = 3)


# gcv.fit(x, y)

In [189]:
gcv.fit(x_train, y_train)

Fitting 5 folds for each of 165 candidates, totalling 825 fits
[CV 1/5] END GBC__learning_rate=0.001, GBC__max_depth=2, GBC__n_estimators=30;, score=-0.624 total time=   0.0s
[CV 2/5] END GBC__learning_rate=0.001, GBC__max_depth=2, GBC__n_estimators=30;, score=-0.622 total time=   0.0s
[CV 3/5] END GBC__learning_rate=0.001, GBC__max_depth=2, GBC__n_estimators=30;, score=-0.624 total time=   0.0s
[CV 4/5] END GBC__learning_rate=0.001, GBC__max_depth=2, GBC__n_estimators=30;, score=-0.623 total time=   0.0s
[CV 5/5] END GBC__learning_rate=0.001, GBC__max_depth=2, GBC__n_estimators=30;, score=-0.619 total time=   0.0s
[CV 1/5] END GBC__learning_rate=0.001, GBC__max_depth=2, GBC__n_estimators=31;, score=-0.623 total time=   0.0s
[CV 2/5] END GBC__learning_rate=0.001, GBC__max_depth=2, GBC__n_estimators=31;, score=-0.622 total time=   0.0s
[CV 3/5] END GBC__learning_rate=0.001, GBC__max_depth=2, GBC__n_estimators=31;, score=-0.623 total time=   0.0s
[CV 4/5] END GBC__learning_rate=0.001, GB

In [191]:
print(gcv.best_score_)
print(gcv.best_params_)

-0.11141805903366649
{'GBC__learning_rate': 0.25075, 'GBC__max_depth': 2, 'GBC__n_estimators': 38}


In [193]:
y_pred_prob = gcv.predict_proba(x_test)[:, 1] 
print(roc_auc_score(y_test, y_pred_prob))

0.9912439613526569
