In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

X,y= datasets.load_iris(return_X_y=True)
X.shape,y.shape

((150, 4), (150,))

In [5]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.4,random_state=0)#random_state=0 is used to ensure that the same split is produced each time the code is run.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((90, 4), (60, 4), (90,), (60,))

In [4]:
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)# c=1 This paarameter controls the trade-off between achieving a low training error and a low testing error.
clf.score(X_test, y_test)#returns the average accuracy score of the SVM classifier on the test data.

0.9666666666666667

In [11]:
# Computing cross valdiation metrics
from sklearn.model_selection import cross_val_score
clf=svm.SVC(kernel='linear',C=1,random_state=42)
scores=cross_val_score(clf,X,y,cv=4)
scores

array([1.        , 0.97368421, 0.97297297, 0.97297297])

In [12]:
print("%0.2f accuracy with a standard deviation of % 0.2f " %(scores.mean(),scores.std()))

0.98 accuracy with a standard deviation of  0.01 


In [14]:
# f1 score 
from sklearn import metrics
score= cross_val_score(clf,X,y,scoring='f1_macro')
scores

array([1.        , 0.97368421, 0.97297297, 0.97297297])

In [17]:
#It is also possible to use other cross validation strategies by passing a cross validation iterator instead, for instance:
from sklearn.model_selection import ShuffleSplit
n_samples=X.shape[0]
cv=ShuffleSplit(n_splits=5, test_size=0.3,random_state=0)
cross_val_score(clf,X,y,cv=cv)

array([0.97777778, 0.97777778, 1.        , 0.95555556, 1.        ])

In [21]:
#Another option is to use an iterable yielding (train, test) splits as arrays of indices, for example:
def custom_cv_2folds(X):
    n=X.shape[0]
    i=1
    while i<=2:
        idx= np.arange(n*(i-1)/2,n*i/2,dtype=int)
        yield idx,idx
        i+=1

custom_cv=custom_cv_2folds(X)
cross_val_score(clf,X,y,cv=custom_cv)

array([1.        , 0.97333333])

In [36]:
#Example of 2 fold cross Validation on a dataset with 4 samples:
from sklearn.model_selection import KFold
X=["a","b","c","d"]
kf=KFold(n_splits=4)
for train,test in kf.split(X):
    print("%s %s"%(train,test))

[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]


In [37]:
#Repeated K-Fold
#Example of 2 fold k-fold repeatedd 2 time
from sklearn.model_selection import RepeatedKFold
X=np.array([[1,2],[3,4],[1,2],[3,4],[3,3]])
random_state=12883823
rkf=RepeatedKFold(n_splits=4,n_repeats=2,random_state=random_state)
for train,test in rkf.split(X):
    print("%s %s " %(train,test))

[0 2 4] [1 3] 
[0 1 2 3] [4] 
[0 1 3 4] [2] 
[1 2 3 4] [0] 
[1 3 4] [0 2] 
[0 1 2 4] [3] 
[0 2 3 4] [1] 
[0 1 2 3] [4] 


In [39]:
# leave One Out(LOO)
from sklearn.model_selection import LeaveOneOut
X=[1,2,3,4]
loo=LeaveOneOut()
for train,test in loo.split(X):
    print("%s %s" %(train,test))

[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]


In [43]:
#Example of Leave-2-Out on a dataset with 4 samples:
from sklearn.model_selection import LeavePOut
X=np.ones(5)
lpp=LeavePOut(p=3) # p value decides how much smaple will be in test ex if p=3 then test will have 3 data 
for train ,test in lpp.split(X):
    print("%s %s" %(train, test))


[3 4] [0 1 2]
[2 4] [0 1 3]
[2 3] [0 1 4]
[1 4] [0 2 3]
[1 3] [0 2 4]
[1 2] [0 3 4]
[0 4] [1 2 3]
[0 3] [1 2 4]
[0 2] [1 3 4]
[0 1] [2 3 4]


In [51]:
# Here is a usage example: of Random permutations cross-validation a.k.a. Shuffle & Split
from sklearn.model_selection import ShuffleSplit
X=np.arange(10)
ss=ShuffleSplit(n_splits=4,test_size=0.25,random_state=2)
for train,test in ss.split(X):
    print("%s %s" %(train,test))

[0 7 2 3 6 9 8] [4 1 5]
[2 3 9 7 8 4 5] [1 6 0]
[0 8 1 2 6 7 3] [4 9 5]
[0 2 1 3 7 6 4] [5 9 8]


In [56]:
#Stratified k-fold:-Here is an example of stratified 3-fold cross-validation on a dataset with 50 samples from two unbalanced classes. We show the number of samples in each class and compare with KFold.
from sklearn.model_selection import StratifiedKFold, KFold
import numpy as np
X,y=np.ones((50,1)),np.hstack(([0]*45,[1]*5))
skf=StratifiedKFold(n_splits=3)
for train,test in skf.split(X,y):
    print('train-{} | test -{}'.format(np.bincount(y[train]),np.bincount(y[test])))#np.bincount(y[train]): This part calculates the number of occurrences of each unique label in the training set (y[train]). It returns an array where the index represents the label, and the value represents the count of occurrences of that label in the training set.

train-[30  3] | test -[15  2]
train-[30  3] | test -[15  2]
train-[30  4] | test -[15  1]


In [None]:
#The numbers inside the square brackets represent the count of each label. For example, [45 5] means there are 45 samples with label 0 and 5 samples with label 1.

In [6]:
# Group K-fold -Imagine you have three subjects, each with an associated number from 1 to 3:
from sklearn.model_selection import GroupKFold
import numpy as np
#X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 8.8, 9, 10]
#y = ["a", "b", "b", "b", "c", "c", "c", "d", "d", "d"]# labels 
#groups = [1, 1, 1, 2, 2, 2, 3, 3, 3, 3]# Groups for grouping k-fold
# Example data: features, target, and groups
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])
y = np.array([0, 0, 1, 1, 1])  # Binary classification labels
groups = np.array([1, 2, 2, 3, 3])  # Groups for grouping k-fold
gkf= GroupKFold(n_splits=3)
for train,test in gkf.split(X,y,groups=groups):
    print("%s%s" % (train,test))

[0 1 2][3 4]
[0 3 4][1 2]
[1 2 3 4][0]


In [12]:
# example of StratifiedGroupKfold
# intance:-Each instance represents a single occurrence or sample of the data being analyzed.
#instances are often represented as rows in a dataset, where each row contains information about a specific entity or observation.
from sklearn.model_selection import StratifiedGroupKFold
X=list(range(18))
y=[1]*6 +[0]*12# 6 instance of class 1 and 12 instances of class 0 of target class
groups=[1,2,3,3,4,4,1,1,2,2,3,4,5,5,5,6,6,6]
sgkf=StratifiedGroupKFold(n_splits=5)
for train,test in sgkf.split(X,y, groups=groups):
    print("%s %s" %(train, test))

[ 0  1  2  3  6  7  8  9 10 12 13 14] [ 4  5 11 15 16 17]
[ 0  2  3  4  5  6  7 10 11 12 13 14 15 16 17] [1 8 9]
[ 0  1  4  5  6  7  8  9 11 12 13 14 15 16 17] [ 2  3 10]
[ 0  1  2  3  4  5  6  7  8  9 10 11 15 16 17] [12 13 14]
[ 1  2  3  4  5  8  9 10 11 12 13 14 15 16 17] [0 6 7]


In [13]:
# Leave one group out :-
from sklearn.model_selection import LeaveOneGroupOut

X=[1,5,10,50,60,70,80]
y=[0,1,1,2,2,2,2] # target variable
groups=[1,1,2,2,3,3,3]
logo=LeaveOneGroupOut()
for train,test in logo.split(X,y, groups=groups):
    print("%s %s" %(train, test))


[2 3 4 5 6] [0 1]
[0 1 4 5 6] [2 3]
[0 1 2 3] [4 5 6]


In [16]:
# Leave P Group out cross- validation
from sklearn.model_selection import LeavePGroupsOut

X=np.arange(6)
y=[1,1,1,2,2,2]
groups=[1,1,2,2,3,3]
lpgo=LeavePGroupsOut(n_groups=2)
for  train, test in lpgo.split(X,y, groups=groups):
    print("%s%s"%(train, test))

[4 5][0 1 2 3]
[2 3][0 1 4 5]
[0 1][2 3 4 5]


In [21]:
#Here is a usage example of Group Shuffle Split:
from sklearn.model_selection import GroupShuffleSplit
X = [0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001]
y = ["a", "b", "b", "b", "c", "c", "c", "a"]
groups = [1, 1, 2, 2, 3, 3, 4, 4]
gss=GroupShuffleSplit(n_splits=4,test_size=0.5, random_state=0)
for train,test in gss.split(X,y,groups=groups):
    print("%s %s" %(train,test))
    

[0 1 2 3] [4 5 6 7]
[2 3 6 7] [0 1 4 5]
[2 3 4 5] [0 1 6 7]
[4 5 6 7] [0 1 2 3]


In [28]:
#.2.5. Using cross-validation iterators to split train and test
import numpy as np
from sklearn.model_selection import GroupShuffleSplit

X = np.array([0.1, 0.2, 2.2, 2.4, 2.3, 4.55, 5.8, 0.001])
y = np.array(["a", "b", "b", "b", "c", "c", "c", "a"])
groups = np.array([1, 1, 2, 2, 3, 3, 4, 4])

train_indx,test_indx=next(GroupShuffleSplit(random_state=7).split(X,y,groups))
X_train,X_test,y_train,y_test=X[train_indx],X[test_indx],y[train_indx],y[test_indx]
X_train.shape,X_test.shape



((6,), (2,))

In [29]:
np.unique(groups[train_indx]),np.unique(groups[test_indx])

(array([1, 2, 4]), array([3]))

In [31]:
# Example of 3-split time series cross-validation on a dataset with 6 samples:
from sklearn.model_selection import TimeSeriesSplit
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([1, 2, 3, 4, 5, 6])
tscv=TimeSeriesSplit(n_splits=3)
print(tscv)

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None)


In [32]:
for train,test in tscv.split(X):
    print("%s %s " %(train,test))

[0 1 2] [3] 
[0 1 2 3] [4] 
[0 1 2 3 4] [5] 
