In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = sns.load_dataset("titanic")

In [3]:
df.drop(["deck", "embark_town", "alive", "class", "who", "adult_male"], axis=1, inplace=True)

In [4]:
df["age"].fillna(df["age"].mean(), inplace=True)

In [5]:
df.dropna(subset=["embarked"], inplace=True)

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [7]:
df['sex'] = le.fit_transform(df['sex'])
df["embarked"] = le.fit_transform(df["embarked"])

In [8]:
df = df.astype(int)

In [9]:
X = df.drop("survived", axis=1)
y = df["survived"]

# Cross Validation

In [11]:
from sklearn.neighbors import KNeighborsClassifier

In [12]:
model = KNeighborsClassifier()

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42)

In [15]:
model.fit(X_train,y_train)

In [16]:
model.score(X_test,y_test)

0.7040816326530612

In [17]:
from sklearn.model_selection import cross_val_score

In [18]:
cross_val_results = cross_val_score(model, X,y, cv = 5, scoring='accuracy')

In [19]:
print(cross_val_results)

[0.65730337 0.69662921 0.71910112 0.74719101 0.71186441]


# HyperParameter Tuning

# 1. Manual

# 2. Grid Search CV

In [23]:
from sklearn.model_selection import GridSearchCV

In [24]:
clf = GridSearchCV((model), {
    "n_neighbors": [5,7,9,11,13,15],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": [30,40,50]
}, cv = 5, return_train_score=False)

In [25]:
clf.fit(X,y)

In [26]:
clf.cv_results_

{'mean_fit_time': array([0.0034874 , 0.00240688, 0.00256386, 0.00264597, 0.00255861,
        0.00272603, 0.0025466 , 0.00254822, 0.00251851, 0.00250196,
        0.00253553, 0.0027215 , 0.00244164, 0.00242867, 0.00228548,
        0.00228467, 0.00205007, 0.0022325 , 0.00247087, 0.00227799,
        0.00245042, 0.00233598, 0.00289292, 0.0028873 , 0.00257497,
        0.00230722, 0.00247626, 0.00249872, 0.00250087, 0.00257716,
        0.00224485, 0.00305605, 0.00221729, 0.00218735, 0.00214653,
        0.00255113, 0.00314207, 0.00209131, 0.00212259, 0.00209088,
        0.00203381, 0.00228434, 0.00206118, 0.00204034, 0.00203619,
        0.00204601, 0.00205398, 0.00245118, 0.00262408, 0.00230842,
        0.00207582, 0.00231795, 0.00211992, 0.00241542, 0.00225644,
        0.00204449, 0.00228539, 0.00248041, 0.00224042, 0.00223088,
        0.00207853, 0.00232902, 0.00251632, 0.00209951, 0.00212569,
        0.00197611, 0.00214472, 0.00209527, 0.00232482, 0.00210648,
        0.00212955, 0.00202475,

In [27]:
results = pd.DataFrame(clf.cv_results_)

In [28]:
results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_leaf_size,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003487,0.00105,0.012618,0.003912,auto,30,5,uniform,"{'algorithm': 'auto', 'leaf_size': 30, 'n_neig...",0.657303,0.696629,0.719101,0.747191,0.711864,0.706418,0.029533,99
1,0.002407,0.00041,0.002971,0.001111,auto,30,5,distance,"{'algorithm': 'auto', 'leaf_size': 30, 'n_neig...",0.668539,0.724719,0.724719,0.730337,0.706215,0.710906,0.022699,73
2,0.002564,0.000502,0.010153,0.00062,auto,30,7,uniform,"{'algorithm': 'auto', 'leaf_size': 30, 'n_neig...",0.629213,0.674157,0.691011,0.752809,0.734463,0.696331,0.043974,125
3,0.002646,0.000494,0.002289,0.000434,auto,30,7,distance,"{'algorithm': 'auto', 'leaf_size': 30, 'n_neig...",0.657303,0.719101,0.713483,0.741573,0.723164,0.710925,0.028417,69
4,0.002559,0.000507,0.009614,0.000605,auto,30,9,uniform,"{'algorithm': 'auto', 'leaf_size': 30, 'n_neig...",0.634831,0.662921,0.724719,0.741573,0.711864,0.695182,0.039965,139


In [29]:
results[['params','mean_test_score']]

Unnamed: 0,params,mean_test_score
0,"{'algorithm': 'auto', 'leaf_size': 30, 'n_neig...",0.706418
1,"{'algorithm': 'auto', 'leaf_size': 30, 'n_neig...",0.710906
2,"{'algorithm': 'auto', 'leaf_size': 30, 'n_neig...",0.696331
3,"{'algorithm': 'auto', 'leaf_size': 30, 'n_neig...",0.710925
4,"{'algorithm': 'auto', 'leaf_size': 30, 'n_neig...",0.695182
...,...,...
139,"{'algorithm': 'brute', 'leaf_size': 50, 'n_nei...",0.723278
140,"{'algorithm': 'brute', 'leaf_size': 50, 'n_nei...",0.712068
141,"{'algorithm': 'brute', 'leaf_size': 50, 'n_nei...",0.725538
142,"{'algorithm': 'brute', 'leaf_size': 50, 'n_nei...",0.708690


In [30]:
clf.best_score_

0.7255379927632831

In [31]:
clf.best_params_

{'algorithm': 'brute',
 'leaf_size': 30,
 'n_neighbors': 13,
 'weights': 'distance'}

In [32]:
results[
    ["mean_test_score", "param_n_neighbors", "param_weights",
     "param_algorithm", "param_leaf_size"]
].head(10)

Unnamed: 0,mean_test_score,param_n_neighbors,param_weights,param_algorithm,param_leaf_size
0,0.706418,5,uniform,auto,30
1,0.710906,5,distance,auto,30
2,0.696331,7,uniform,auto,30
3,0.710925,7,distance,auto,30
4,0.695182,9,uniform,auto,30
5,0.719888,9,distance,auto,30
6,0.713204,11,uniform,auto,30
7,0.722161,11,distance,auto,30
8,0.707567,13,uniform,auto,30
9,0.718796,13,distance,auto,30


# 3. Randomized Search CV

In [34]:
from sklearn.model_selection import RandomizedSearchCV

In [35]:
RandomizedSearch_clf = RandomizedSearchCV((model), {
    "n_neighbors": [5,7,9,11,13,15],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": [30,40,50]
},n_iter= 10, cv = 5, return_train_score=False)

In [36]:
RandomizedSearch_clf.fit(X,y)

In [37]:
RandomizedSearch_clf.best_score_

0.7142956897098965

In [38]:
RandomizedSearch_clf.best_params_

{'weights': 'distance',
 'n_neighbors': 7,
 'leaf_size': 30,
 'algorithm': 'ball_tree'}

# iris dataset testing on grid search cv and randomized search cv

In [40]:
df2 = sns.load_dataset('iris')

In [41]:
df2.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [42]:
df2['species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [43]:

from sklearn.model_selection import train_test_split

In [44]:
X2 = df2.drop('species',axis = 1)
y2 = df2['species']

In [45]:
X_train, X_test, y_train, y_test = train_test_split(
X2, y2, test_size=0.33, random_state=42)
     

In [46]:
model_knn_iris = KNeighborsClassifier(n_neighbors=13)

In [47]:

model_knn_iris.fit(X_train,y_train)

In [48]:
model_knn_iris.score(X_test,y_test)

1.0

In [49]:
from sklearn.svm import SVC

In [50]:
model_svm = SVC(gamma= 'auto')

In [51]:
model_svm.fit(X_train,y_train)

In [52]:
model_svm.score(X_test,y_test)

1.0

In [53]:

from sklearn.model_selection import GridSearchCV

In [54]:

classifier = GridSearchCV((model_svm),{
    'C' : [1,10,20,30],
    'kernel' :['rbf','linear'],
},cv = 5,return_train_score = False)
     

In [55]:
classifier.fit(X2,y2)

In [56]:
results_iris = pd.DataFrame(classifier.cv_results_)

In [57]:
results_iris

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002382,0.000934,0.001912,0.000294,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.002079,7.4e-05,0.001312,0.000415,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.002057,4.8e-05,0.001817,0.000397,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.001886,0.000256,0.001727,0.000611,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.002354,0.000409,0.001223,0.000415,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.002357,0.000421,0.00152,0.00045,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6
6,0.002097,6.3e-05,0.001437,0.000487,30,rbf,"{'C': 30, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.933333,1.0,0.96,0.038873,7
7,0.00164,0.000502,0.001537,0.000499,30,linear,"{'C': 30, 'kernel': 'linear'}",1.0,1.0,0.9,0.9,1.0,0.96,0.04899,7


In [58]:

results_iris[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667
6,30,rbf,0.96
7,30,linear,0.96


In [59]:

from sklearn.model_selection import RandomizedSearchCV

In [60]:
classifier_r = RandomizedSearchCV((model_svm),{
    'C' : [1,10,20,30],
    'kernel' :['rbf','linear'],
},n_iter = 4,cv = 5,return_train_score = False)

In [61]:

classifier_r.fit(X2,y2)

In [62]:
results_r = pd.DataFrame(classifier_r.cv_results_)

In [63]:
results_r

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.002723,0.000569,0.002068,0.000645,linear,30,"{'kernel': 'linear', 'C': 30}",1.0,1.0,0.9,0.9,1.0,0.96,0.04899,3
1,0.002033,2.8e-05,0.001523,0.000476,rbf,1,"{'kernel': 'rbf', 'C': 1}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.002083,5.4e-05,0.001786,0.000319,rbf,30,"{'kernel': 'rbf', 'C': 30}",0.966667,1.0,0.9,0.933333,1.0,0.96,0.038873,3
3,0.002051,3.2e-05,0.001038,3.7e-05,linear,20,"{'kernel': 'linear', 'C': 20}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,2


In [64]:
results_r[['param_C','param_kernel','mean_test_score']]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,30,linear,0.96
1,1,rbf,0.98
2,30,rbf,0.96
3,20,linear,0.966667


# Ensemble Learning

# 1. Stacking

In [123]:

from sklearn.model_selection import train_test_split
from sklearn.ensemble import StackingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [125]:
df3 = sns.load_dataset('iris')

In [127]:
df3.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [131]:
X3 = df3.drop('species', axis=1)
y3 = df3['species']

In [135]:

le = LabelEncoder()
y_encoded = le.fit_transform(y3)
     

In [137]:
X_train, X_test, y_train, y_test = train_test_split(X3, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

In [139]:

base_learners = [
    ('dt', DecisionTreeClassifier(random_state=42)),
    ('svc', SVC(probability=True, kernel='rbf', random_state=42)),
    ('lr', LogisticRegression(max_iter=1000))
]

In [141]:
meta_learner = LogisticRegression(max_iter=1000)

In [143]:
stacking_clf = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5
)

In [145]:
stacking_clf.fit(X_train, y_train)

In [147]:
y_pred = stacking_clf.predict(X_test)

In [149]:
accuracy = accuracy_score(y_test, y_pred)

In [151]:
accuracy

0.9666666666666667

# 2. Bagging (Bootstrap Aggregating)

In [157]:
from sklearn.ensemble import RandomForestClassifier

In [159]:

rf_model = RandomForestClassifier(
    n_estimators=100,     # number of trees
    max_depth=None,       # let trees grow fully
    random_state=42,

)

In [161]:
rf_model.fit(X_train,y_train)

In [163]:

y_pred = rf_model.predict(X_test)

In [167]:
rf_accuracy = accuracy_score(y_test, y_pred)

In [169]:
rf_accuracy

0.9

# 3. Boosting

In [174]:
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier

In [176]:
ada_model = AdaBoostClassifier(n_estimators=100, random_state=42)

In [178]:
ada_model.fit(X_train,y_train)

In [183]:

y_pred_ada = ada_model.predict(X_test)

In [187]:
 accuracy_ada = accuracy_score(y_test, y_pred_ada)

In [189]:
accuracy_ada

0.9333333333333333

In [191]:
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

In [193]:
gb_model.fit(X_train,y_train)

In [195]:
y_pred_gb = gb_model.predict(X_test)

In [197]:
 accuracy_gb = accuracy_score(y_test, y_pred_gb)

In [199]:
accuracy_gb

0.9666666666666667

In [201]:
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, use_label_encoder=False, eval_metric='mlogloss', random_state=42)

In [203]:
xgb_model.fit(X_train, y_train)

In [205]:
y_pred_xgb = xgb_model.predict(X_test)

In [207]:
 accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

In [209]:
accuracy_xgb

0.9333333333333333