<a href="https://colab.research.google.com/github/BhavikDudhrejiya/Cross-Validation-Methods/blob/main/Cross_Validation_Methods2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
#Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import LeavePOut
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold

In [2]:
#Loading data
#The loader is called with its default arguments and the returned object is kept whole in `data`
data = load_breast_cancer()

In [3]:
#Checking attributes
#Left as a bare last expression so the notebook displays the keys available on `data`
data.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [4]:
#Splitting data into X & y
#Features go into a DataFrame labelled with the dataset's feature names; the target is kept as loaded
X = pd.DataFrame(data=data['data'], columns=data['feature_names'])
y = data['target']

In [5]:
#Splitting data into train & test
#Hold out 20% of the rows as the test set; the fixed seed keeps the split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
#Creating function which trains a model and extracts accuracy
def classifier(model, X_train, X_test, y_train, y_test, cv_method):
  """Fit ``model`` and report hold-out and cross-validated accuracy.

  Parameters
  ----------
  model : estimator exposing ``fit`` and ``predict``; it is fitted in place
      on ``X_train`` / ``y_train``.
  X_train, y_train : training features and labels. They are used for the
      fit and are also the data handed to ``cross_val_score``.
  X_test, y_test : held-out features and labels, used only for 'Accuracy'.
  cv_method : forwarded unchanged as the ``cv`` argument of
      ``cross_val_score``.

  Returns
  -------
  dict
      ``{'Accuracy': <accuracy on the test split>,
         'CV_score': <mean of the cross-validation scores>}``
  """
  # Hold-out figure: fit on the training split, score predictions on the test split
  model.fit(X_train, y_train)
  holdout_accuracy = accuracy_score(y_test, model.predict(X_test))

  # Cross-validated figure: computed from the training split only
  fold_scores = cross_val_score(
      estimator=model,
      X=X_train,
      y=y_train,
      scoring='accuracy',
      cv=cv_method,
      n_jobs=-1,
  )
  return {'Accuracy': holdout_accuracy, 'CV_score': fold_scores.mean()}

# K Fold Cross Validation

In [11]:
#Random forest evaluated with plain 5-fold cross validation
rf = RandomForestClassifier()
kfold = KFold(n_splits=5)
classifier(
    model=rf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    cv_method=kfold,
)

{'Accuracy': 0.9649122807017544, 'CV_score': 0.9626373626373625}

In [15]:
#Creating classifier
rf = RandomForestClassifier()
#random_state only takes effect when the folds are shuffled. Passing it with the
#default shuffle=False is rejected by current scikit-learn (older releases only
#warned), so shuffling is enabled explicitly to make the seed meaningful.
kfold = KFold(n_splits=10, shuffle=True, random_state=0)
classifier(rf, X_train, X_test, y_train, y_test, kfold)



{'Accuracy': 0.9649122807017544, 'CV_score': 0.9648309178743961}

# Stratified K-Fold Cross Validation

In [27]:
#Random forest evaluated with 10 stratified folds (no shuffling)
rf = RandomForestClassifier()
stratify_cv = StratifiedKFold(n_splits=10)
classifier(
    model=rf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    cv_method=stratify_cv,
)

{'Accuracy': 0.956140350877193, 'CV_score': 0.9626086956521739}

In [32]:
#Increasing number of split through loop and checking the result
#Sweeps n_splits over 2..9 with shuffled, seeded stratified folds and prints the mean accuracy of each
for i in range(2, 10):
  rf = RandomForestClassifier()
  stratify_cv = StratifiedKFold(n_splits=i, shuffle=True, random_state=42)
  cv = cross_val_score(
      estimator=rf,
      X=X_train,
      y=y_train,
      scoring='accuracy',
      cv=stratify_cv,
      n_jobs=-1,
  )
  cv_score = cv.mean()
  print(i, 'Stratify CV Score:',cv_score)

2 Stratify CV Score: 0.9450498492928356
3 Stratify CV Score: 0.9494597420704078
4 Stratify CV Score: 0.9472713864306784
5 Stratify CV Score: 0.9604395604395606
6 Stratify CV Score: 0.9538888888888889
7 Stratify CV Score: 0.9604395604395605
8 Stratify CV Score: 0.9582941729323309
9 Stratify CV Score: 0.9626579520697168


# Repeated K-Fold Cross Validation

In [38]:
#Random forest evaluated with 2-fold cross validation repeated 4 times (seeded)
rf = RandomForestClassifier()
repeated_cv = RepeatedKFold(n_splits=2, n_repeats=4, random_state=42)
classifier(
    model=rf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    cv_method=repeated_cv,
)

{'Accuracy': 0.9649122807017544, 'CV_score': 0.9538386853698122}

# Leave-P-Out Cross Validation

In [None]:
#Creating classifier
rf = RandomForestClassifier()
LPO_cv = LeavePOut(p=2)
#Leave-P-Out uses every possible test set of size p, i.e. C(n, p) model fits.
#With p=2 that is n*(n-1)/2, which for a few hundred training rows means on the
#order of 10^5 random-forest fits, so this cell would not finish in practical time.
#The method is therefore demonstrated on a small slice (30 rows -> 435 fits).
LPO_SUBSET_SIZE = 30
X_train_lpo = X_train.iloc[:LPO_SUBSET_SIZE]
y_train_lpo = y_train[:LPO_SUBSET_SIZE]
#Note: 'Accuracy' in the result comes from a model fitted on this slice only
classifier(rf, X_train_lpo, X_test, y_train_lpo, y_test, LPO_cv)

# Leave-One-Out Cross Validation

In [40]:
#Random forest evaluated with leave-one-out cross validation (one fit per training row)
rf = RandomForestClassifier()
LOO_cv = LeaveOneOut()
classifier(
    model=rf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    cv_method=LOO_cv,
)

{'Accuracy': 0.9649122807017544, 'CV_score': 0.9604395604395605}