In [23]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer,load_wine
#from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
#from sklearn.multiclass import OneVsRestClassifier #For handling the Multiclass Logistic Regression

## Breast Cancer Dataset

In [24]:
data = load_breast_cancer()

In [25]:

df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target


print(df.head())
df.shape

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

(569, 31)

In [26]:
df['target'].value_counts(),data.target_names

(target
 1    357
 0    212
 Name: count, dtype: int64,
 array(['malignant', 'benign'], dtype='<U9'))

<b> There are two types of classes</b> namely 0: benign and 1: malignant

In [27]:
kf = KFold(n_splits=5)

In [28]:
print(f"The total numbers of rows are: {df.shape[0]}")
for train_index, test_index in kf.split(df[data.feature_names],df['target']):
    print(train_index.shape,test_index.shape)

The total numbers of rows are: 569
(455,) (114,)
(455,) (114,)
(455,) (114,)
(455,) (114,)
(456,) (113,)


In [29]:
def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [30]:
scores_knn_kfold = []
scores_svm_kfold = []
scores_rf_kfold = []
scores_nb_kfold = []
for train_index, test_index in kf.split(df[data.feature_names],df['target']):
    X_train, X_test, y_train, y_test = data.data[train_index], data.data[test_index], \
                                       data.target[train_index], data.target[test_index]
    scores_knn_kfold.append(get_score(KNeighborsClassifier(n_neighbors=3), X_train, X_test, y_train, y_test))
    scores_svm_kfold.append(get_score(SVC(gamma='auto'), X_train, X_test, y_train, y_test))
    scores_rf_kfold.append(get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test))
    scores_nb_kfold.append(get_score(GaussianNB(), X_train, X_test, y_train, y_test))

In [32]:
print("K_Fold")
print(f"Scores of KNN  {sum(scores_knn_kfold)/5}")
print(f"Scores of SVM {sum(scores_svm_kfold)/5}")
print(f"Scores of Random Forest {sum(scores_rf_kfold)/5}")
print(f"Scores of  Naive Bayes {sum(scores_nb_kfold)/5}")

K_Fold
Scores of KNN  0.9209284272628473
Scores of SVM 0.6276665114112715
Scores of Random Forest 0.952585002328831
Scores of  Naive Bayes 0.9367644775655954


In [33]:
folds = StratifiedKFold(n_splits=5)

In [34]:
scores_knn_sfold = []
scores_svm_sfold = []
scores_rf_sfold = []
scores_nb_sfold = []

for train_index, test_index in kf.split(df[data.feature_names],df['target']):
    X_train, X_test, y_train, y_test = data.data[train_index], data.data[test_index], \
                                       data.target[train_index], data.target[test_index]
    scores_knn_sfold.append(get_score(KNeighborsClassifier(n_neighbors=3), X_train, X_test, y_train, y_test))
    scores_svm_sfold.append(get_score(SVC(gamma='auto'), X_train, X_test, y_train, y_test))
    scores_rf_sfold.append(get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test))
    scores_nb_sfold.append(get_score(GaussianNB(), X_train, X_test, y_train, y_test))

In [35]:
print("Stratified K_Fold")
print(f"Scores of KNN  {sum(scores_knn_sfold)/5}")
print(f"Scores of SVM {sum(scores_svm_sfold)/5}")
print(f"Scores of Random Forest {sum(scores_rf_sfold)/5}")
print(f"Scores of  Naive Bayes {sum(scores_nb_sfold)/5}")

Stratified K_Fold
Scores of KNN  0.9209284272628473
Scores of SVM 0.6276665114112715
Scores of Random Forest 0.9543549138332557
Scores of  Naive Bayes 0.9367644775655954


* Here we see that the <b>Random Forest Classifier</b> is giving the best result for both <b>K-Fold </b> and <b>Stratified K-Fold</b> for <b>5</b> splits

## Wine Dataset

In [13]:
data1= load_wine()

In [14]:
df1 = pd.DataFrame(data1.data,columns=data1.feature_names)
df1['target'] = data1.target
print(df1.head())

   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  target  
0          

In [15]:
df1['target'].value_counts(),data1.target_names

(target
 1    71
 0    59
 2    48
 Name: count, dtype: int64,
 array(['class_0', 'class_1', 'class_2'], dtype='<U7'))

<b>There are 3 classes</b> namely <b>0: class_0</b> , <b>1: class_1 </b> and <b>2: class_2</b>

In [16]:
kf = KFold(n_splits=5)

In [17]:
print(f"The total number of rows are: {df1.shape[0]}")
for train_index,test_index in kf.split(data1.data,data1.target):
    print(f"The train and test shape is: {train_index.shape,test_index.shape}")

The total number of rows are: 178
The train and test shape is: ((142,), (36,))
The train and test shape is: ((142,), (36,))
The train and test shape is: ((142,), (36,))
The train and test shape is: ((143,), (35,))
The train and test shape is: ((143,), (35,))


In [18]:
scores_knn_kfold = []
scores_svm_kfold = []
scores_rf_kfold = []
scores_nb_kfold = []

for train_index,test_index in kf.split(data1.data,data1.target):
    X_train,X_test,y_train,y_test = data1.data[train_index],data1.data[test_index],\
                                    data1.target[train_index],data1.target[test_index]
    scores_knn_kfold.append(get_score(KNeighborsClassifier(n_neighbors=5), X_train, X_test, y_train, y_test))
    scores_svm_kfold.append(get_score(SVC(gamma='auto'), X_train, X_test, y_train, y_test))
    scores_rf_kfold.append(get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test))
    scores_nb_kfold.append(get_score(GaussianNB(), X_train, X_test, y_train, y_test))

In [19]:
print("K_Fold")
print(f"Scores of KNN  {sum(scores_knn_kfold)/5}")
print(f"Scores of SVM {sum(scores_svm_kfold)/5}")
print(f"Scores of Random Forest {sum(scores_rf_kfold)/5}")
print(f"Scores of  Naive Bayes {sum(scores_nb_kfold)/5}")

K_Fold
Scores of KNN  0.6100000000000001
Scores of SVM 0.10603174603174603
Scores of Random Forest 0.9441269841269841
Scores of  Naive Bayes 0.9326984126984126


In [20]:
folds = StratifiedKFold(n_splits=5)

In [21]:
scores_knn_sfold = []
scores_svm_sfold = []
scores_rf_sfold = []
scores_nb_sfold = []

for train_index,test_index in folds.split(data1.data,data1.target):
    X_train,X_test,y_train,y_test = data1.data[train_index],data1.data[test_index],\
                                    data1.target[train_index],data1.target[test_index]
    scores_knn_sfold.append(get_score(KNeighborsClassifier(n_neighbors=5), X_train, X_test, y_train, y_test))
    scores_svm_sfold.append(get_score(SVC(gamma='auto'), X_train, X_test, y_train, y_test))
    scores_rf_sfold.append(get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test))
    scores_nb_sfold.append(get_score(GaussianNB(), X_train, X_test, y_train, y_test))

In [22]:
print("Stratified K_Fold")
print(f"Scores of KNN  {sum(scores_knn_sfold)/5}")
print(f"Scores of SVM {sum(scores_svm_sfold)/5}")
print(f"Scores of Random Forest {sum(scores_rf_sfold)/5}")
print(f"Scores of  Naive Bayes {sum(scores_nb_sfold)/5}")

Stratified K_Fold
Scores of KNN  0.6912698412698413
Scores of SVM 0.4273015873015873
Scores of Random Forest 0.9722222222222221
Scores of  Naive Bayes 0.9663492063492063


* Here we see that the <b>Naive Bayes</b> is giving the best result for  <b>K-Fold </b> and <b>Random Forest</b> is giving the best result for <b>Stratified K-Fold</b> for <b>5</b> splits