# Cross Validation and Model Selection

## Imports

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 

import sklearn
from sklearn import tree
from sklearn import datasets
from sklearn import model_selection
from sklearn import metrics

## Load Data

In [2]:
# Load the bundled iris dataset and print the original feature names for reference.
iris = datasets.load_iris()
print(iris.feature_names)

# Map each feature name to its column of measurements, then attach the species
# name of every sample (iris.target holds integer indices into target_names).
tmp = dict(zip(iris.feature_names, iris.data.T))
tmp["target"] = [iris.target_names[label] for label in iris.target]

# Build the frame and swap the "... (cm)" headers for short snake_case names.
iris_df = pd.DataFrame(tmp)
iris_df.columns = [
    "sepal_length",
    "sepal_width",
    "petal_length",
    "petal_width",
    "target",
]
iris_df.head()

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
# Features: every column except the label column.
X = iris_df.drop(columns=["target"])
# Labels: the species name of each sample.
y = iris_df["target"]
# Fixed random_state so the fitted tree (and every score derived from it) is
# reproducible across Restart & Run All; 42 is an arbitrary seed.
clf = tree.DecisionTreeClassifier(random_state=42)

In [4]:
# Hold-out split: machine-learning workflows usually divide the data into a
# training set and a test set. X holds the features, y the targets, and
# test_size is the fraction of samples reserved for testing.
# This is just a single simple partition; the cross-validation section below
# evaluates on several partitions instead.
# random_state pins the shuffle so the split is identical on every run
# (42 is an arbitrary seed).
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=1/3, random_state=42
)
print(X_train.shape, X_test.shape)

(100, 4) (50, 4)


In [5]:
# Train the decision tree on the training split, then predict labels for the
# held-out test split (fit returns the estimator itself, so the calls chain).
y_pred = clf.fit(X_train, y_train).predict(X_test)
y_pred

array(['virginica', 'versicolor', 'virginica', 'setosa', 'setosa',
       'setosa', 'virginica', 'versicolor', 'virginica', 'setosa',
       'virginica', 'versicolor', 'versicolor', 'versicolor', 'setosa',
       'versicolor', 'versicolor', 'setosa', 'setosa', 'setosa', 'setosa',
       'virginica', 'setosa', 'setosa', 'virginica', 'versicolor',
       'setosa', 'versicolor', 'versicolor', 'virginica', 'versicolor',
       'virginica', 'versicolor', 'versicolor', 'setosa', 'setosa',
       'setosa', 'setosa', 'virginica', 'versicolor', 'versicolor',
       'versicolor', 'setosa', 'setosa', 'versicolor', 'virginica',
       'virginica', 'versicolor', 'versicolor', 'virginica'], dtype=object)

In [6]:
# Background on these scores: https://zhuanlan.zhihu.com/p/37654241
# precision: of the samples predicted as the class of interest, the share that
#            really belong to it.
# recall:    of the samples that truly belong to the class, the share that were
#            predicted as such.
# f1:        a combination of precision and recall, commonly used as a
#            model-selection metric.
# Each metric is averaged over the classes, weighted by class support.
for template, scorer in (
    ("precision={}", metrics.precision_score),
    ("recall=   {}", metrics.recall_score),
    ("f1=       {}", metrics.f1_score),
):
    print(template.format(scorer(y_test, y_pred, average="weighted")))


precision=0.9810526315789474
recall=   0.98
f1=       0.9798998998998999


## Cross Validation

In [7]:
# 5-fold cross-validation by hand. random_state pins the shuffle so the folds
# (and therefore the printed scores) are the same on every run; 42 is an
# arbitrary seed.
kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index[:5], "TEST:", test_index[:5])
    # Fold-local names keep the hold-out split (X_train, X_test, y_train,
    # y_test, y_pred) from the earlier cells intact, so those cells still
    # report the same numbers if they are re-run after this loop.
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]
    # fit() retrains the tree from scratch on this fold's training rows.
    clf.fit(X_fold_train, y_fold_train)
    y_fold_pred = clf.predict(X_fold_test)
    print("precision={}".format(metrics.precision_score(y_fold_test, y_fold_pred, average="weighted")))
    print("recall=   {}".format(metrics.recall_score(y_fold_test, y_fold_pred, average="weighted")))
    print("f1=       {}".format(metrics.f1_score(y_fold_test, y_fold_pred, average="weighted")))

    print()
    

TRAIN: [0 1 3 5 7] TEST: [ 2  4  6 10 13]
precision=1.0
recall=   1.0
f1=       1.0

TRAIN: [0 1 2 3 4] TEST: [11 18 21 22 25]
precision=0.8781818181818183
recall=   0.8666666666666667
f1=       0.8672727272727273

TRAIN: [2 3 4 6 7] TEST: [ 0  1  5 12 17]
precision=0.9523809523809523
recall=   0.9333333333333333
f1=       0.9365079365079365

TRAIN: [0 1 2 4 5] TEST: [ 3  7  8 14 20]
precision=0.95
recall=   0.9333333333333333
f1=       0.9350649350649349

TRAIN: [0 1 2 3 4] TEST: [ 9 15 19 23 26]
precision=0.9700000000000001
recall=   0.9666666666666667
f1=       0.966750208855472



[Scoring Parameter](https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter)

In [8]:
# 5-fold cross-validation in a single call: evaluates clf on X / y and returns
# a dict of per-fold arrays (timings plus one "test_<scorer>" entry for each
# requested metric).
scores = model_selection.cross_validate(
    estimator=clf,
    X=X,
    y=y,
    scoring=["precision_weighted", "recall_weighted", "f1_weighted"],
    cv=5,
)
scores

{'fit_time': array([0.0035162 , 0.00289297, 0.00248313, 0.00224376, 0.00220323]),
 'score_time': array([0.00488663, 0.00595093, 0.00410891, 0.00372291, 0.00403881]),
 'test_precision_weighted': array([0.96969697, 0.96969697, 0.9023569 , 1.        , 1.        ]),
 'test_recall_weighted': array([0.96666667, 0.96666667, 0.9       , 1.        , 1.        ]),
 'test_f1_weighted': array([0.96658312, 0.96658312, 0.89974937, 1.        , 1.        ])}

In [9]:
# Average the per-fold weighted precision into a single overall figure.
scores["test_precision_weighted"].mean()

0.9683501683501683