In [1]:
from sklearn import model_selection, datasets, linear_model, metrics

In [2]:
import numpy as np
import pandas as pd

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
iris = datasets.load_iris()

In [7]:
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [11]:
iris.target[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [12]:
train_data, test_data, train_label, test_label = train_test_split(iris.data, iris.target, test_size=0.3)

In [16]:
model_selection.KFold

sklearn.model_selection._split.KFold

In [52]:
kf = model_selection.KFold(n_splits=5)

In [53]:
kf.get_n_splits(train_data[:20])

5

In [54]:
for train_indices, test_indices in kf.split(train_data[:20]):
    print(train_indices, test_indices)

[ 4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [0 1 2 3]
[ 0  1  2  3  8  9 10 11 12 13 14 15 16 17 18 19] [4 5 6 7]
[ 0  1  2  3  4  5  6  7 12 13 14 15 16 17 18 19] [ 8  9 10 11]
[ 0  1  2  3  4  5  6  7  8  9 10 11 16 17 18 19] [12 13 14 15]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15] [16 17 18 19]


In [55]:
kf.split(train_data)

<generator object _BaseKFold.split at 0x000002352B72FF90>

In [56]:
kff = model_selection.KFold(n_splits=5, shuffle=True)

In [57]:
kff.get_n_splits(train_data[:20])

5

In [58]:
kff

KFold(n_splits=5, random_state=None, shuffle=True)

In [60]:
for train_ind, test_ind in kff.split(train_data[:20]):
    print(train_ind, test_ind)

[ 0  1  3  4  5  6  8  9 10 11 13 14 16 17 18 19] [ 2  7 12 15]
[ 0  1  2  3  4  5  6  7 10 11 12 15 16 17 18 19] [ 8  9 13 14]
[ 1  2  3  5  6  7  8  9 11 12 13 14 15 17 18 19] [ 0  4 10 16]
[ 0  2  4  6  7  8  9 10 12 13 14 15 16 17 18 19] [ 1  3  5 11]
[ 0  1  2  3  4  5  7  8  9 10 11 12 13 14 15 16] [ 6 17 18 19]


In [77]:
skf = model_selection.StratifiedKFold(n_splits=3, shuffle=True)

In [78]:
skf.split(train_data, test_data)

<generator object _BaseKFold.split at 0x000002352C03D7B0>

In [79]:
for train_i, test_i in skf.split(train_data[:20], train_label[:20]):
    print(train_i, test_i)

[ 3  4  6  8  9 11 12 13 14 15 16 17 18] [ 0  1  2  5  7 10 19]
[ 0  1  2  5  6  7 10 13 14 15 16 17 19] [ 3  4  8  9 11 12 18]
[ 0  1  2  3  4  5  7  8  9 10 11 12 18 19] [ 6 13 14 15 16 17]




In [80]:
train_data.shape

(105, 4)

In [81]:
test_data.shape

(45, 4)

In [82]:
train_data[:20]

array([[5.8, 2.7, 3.9, 1.2],
       [7.2, 3. , 5.8, 1.6],
       [6.7, 3.1, 4.7, 1.5],
       [6.4, 3.2, 4.5, 1.5],
       [6.1, 2.9, 4.7, 1.4],
       [6.8, 3. , 5.5, 2.1],
       [6.8, 3.2, 5.9, 2.3],
       [6.6, 2.9, 4.6, 1.3],
       [5. , 3.4, 1.5, 0.2],
       [5.8, 2.7, 4.1, 1. ],
       [6.3, 2.5, 5. , 1.9],
       [6.2, 3.4, 5.4, 2.3],
       [7.2, 3.2, 6. , 1.8],
       [6.5, 2.8, 4.6, 1.5],
       [7.7, 3. , 6.1, 2.3],
       [6.4, 2.9, 4.3, 1.3],
       [6.9, 3.1, 4.9, 1.5],
       [6.7, 2.5, 5.8, 1.8],
       [5.9, 3. , 4.2, 1.5],
       [5.6, 2.7, 4.2, 1.3]])

In [83]:
train_label[:20]

array([1, 2, 1, 1, 1, 2, 2, 1, 0, 1, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1])

In [95]:
for train_i, test_i in skf.split(train_data[:20], train_label[:20]):
    print(f"train_i: {train_label[train_i]}, test_i:{train_label[test_i]}, percentage1: {train_label[train_i][train_label[train_i] == 1].shape[0]}, percentage2: {train_label[train_i][train_label[train_i] == 2].shape[0]}")

train_i: [2 1 1 1 2 2 0 1 2 1 2 1 1], test_i:[1 1 2 2 1 1 2], percentage1: 7, percentage2: 5
train_i: [1 2 1 2 2 1 2 2 1 1 2 1 1], test_i:[1 1 0 1 2 1 2], percentage1: 7, percentage2: 6
train_i: [1 1 1 1 0 1 2 2 2 1 2 1 1 2], test_i:[2 1 2 2 1 1], percentage1: 8, percentage2: 5




In [88]:
y = train_label[:20]

In [93]:
y[y == 1].shape[0]

11

In [94]:
y[y == 2].shape[0]

8

In [106]:
ss = model_selection.ShuffleSplit(n_splits=5, test_size=0.2, random_state=1)

In [107]:
ss.get_n_splits()

5

In [108]:
ss

ShuffleSplit(n_splits=5, random_state=1, test_size=0.2, train_size=None)

In [111]:
for train_i, test_i in ss.split(train_data):
    print(train_i, test_i)

[ 83  67  53  58  55  87  85  59  36  10   2  32  70  19  27  44  62  98
 100  40  46  56  78  23  34  91  94 102  15  88  41  51  45  96  52  26
 103  43  97  24   4  66  74  49  21   3  30  47  80   8  60   0  95  57
  22  61  63   7 101  13  68  90  14  29  28  11  84  18  20  50  25   6
  71  76   1  16  64  79   5  75   9  72  12  37] [ 65  35  42  93  38  39  54  86  31  77  99  81  92  82  33  89 104  69
  17  73  48]
[ 11   0  44  33  69  52   3  34  62  28  16  91  51  59  27  46  95  37
  99 104  89  80   9  90  72  50   6 102  63  83  14  49  58  19  61  39
  21  25  56  55  85  79  96  53  93  22  87  86  42  17   5  15  78  48
  71  66  70  54  92   2  88  60  65  12  32 101  74  24  98  81  23  10
  13  82  57  68  45   7  36  30  20  43  76  75] [ 47  97  73  67  94  40   4  38  77  26   8  29  18  41  84 100 103  31
  64   1  35]
[ 54  76  40  51  81  12  36  18  77  95  33 102  59  58  61  43  93 100
  55  85   3  21  97  23   4  62  75   9  87  16  65   0  80 101  50 

In [128]:
target = np.array([0] * 5 + [1] * 5)
print(target)

fft = model_selection.StratifiedShuffleSplit(n_splits=4)

for train_indices, test_indices in fft.split(train_data, train_label):
    print(train_indices, test_indices)

[0 0 0 0 0 1 1 1 1 1]
[  4  59 101  73 100  18  28  51  24  23   0  54  81  69  72  84  36  85
  65  60  22  86  53  45  90  95   6  79  66  52  31  87  74   7  78  64
  56  33  39  62  29  40  19  14  96  76  92  16  10  55  63 104  71  93
  43  77  88  38  61  49  20   2  12  41  47  37  34  82  80  48  13  97
  50 103  83  15  46  58  89  57  70  44  25  11  94  68  30  98   8  17
   5  42  35   3] [ 99  67   9  21  26  91  75   1  27  32 102]
[ 75  84  82  69  35  93   2  68 100  33  74  40  12  15   5  56  36  14
  57  81  47  73  54  55  51  45  24   3  23  76  97  71   6  88  19   0
  29  59  32  28  48  66  95   1  38   8  77   9  17  65 103  52 101  99
  37  60  26  91  13  30  94  53  92  25  89  16  70  58  86  63  72  31
  79  43  64  62  87  85  98  50  46  41  80  83  39  21  42  18  49  90
  27  78  44  20] [102  10  22  96  67   7   4  61 104  11  34]
[ 48  29  89  75   4  47  45  88   9  65  23  99  91  52  84  27  71  24
  14  92  31  70  93 102  62  40  90  56  35  5

In [135]:
for train_indices, test_indices in model_selection.LeaveOneOut().split(train_data[:20], train_label[:20]):
    print(train_indices, test_indices)

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [0]
[ 0  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [1]
[ 0  1  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [2]
[ 0  1  2  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [3]
[ 0  1  2  3  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [4]
[ 0  1  2  3  4  6  7  8  9 10 11 12 13 14 15 16 17 18 19] [5]
[ 0  1  2  3  4  5  7  8  9 10 11 12 13 14 15 16 17 18 19] [6]
[ 0  1  2  3  4  5  6  8  9 10 11 12 13 14 15 16 17 18 19] [7]
[ 0  1  2  3  4  5  6  7  9 10 11 12 13 14 15 16 17 18 19] [8]
[ 0  1  2  3  4  5  6  7  8 10 11 12 13 14 15 16 17 18 19] [9]
[ 0  1  2  3  4  5  6  7  8  9 11 12 13 14 15 16 17 18 19] [10]
[ 0  1  2  3  4  5  6  7  8  9 10 12 13 14 15 16 17 18 19] [11]
[ 0  1  2  3  4  5  6  7  8  9 10 11 13 14 15 16 17 18 19] [12]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 14 15 16 17 18 19] [13]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 15 16 17 18 19] [14]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 16 1