# Decision Tree for the moons dataset

## 导入数据，设定随机种子

In [2]:
from sklearn.datasets import make_moons

# Generate a noisy two-class "moons" dataset; the fixed seed keeps it reproducible.
X, y = make_moons(
    n_samples=10000,
    noise=0.4,
    random_state=42,
)

## 划分数据集

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the split is seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## 用带交叉验证的网格搜索寻找最优参数

In [4]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Base estimator whose hyperparameters are tuned; seeded for repeatable trees.
tree_clf = DecisionTreeClassifier(random_state=42)

# Search space: max_leaf_nodes in 2..99 crossed with three min_samples_split values.
params = dict(
    max_leaf_nodes=list(range(2, 100)),
    min_samples_split=[2, 3, 4],
)

# 3-fold cross-validated grid search, using all available cores.
grid_search_cv = GridSearchCV(
    estimator=tree_clf,
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_search_cv.fit(X_train, y_train)

Fitting 3 folds for each of 294 candidates, totalling 882 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 882 out of 882 | elapsed:    3.1s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'max_leaf_nodes': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99], 'min_samples_split': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       s

In [5]:
# Display the best estimator selected by the grid search.
grid_search_cv.best_estimator_

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=17,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')

## 评估最优参数

1、用最优参数在整个训练集训练模型；  
2、所得模型在测试集评估。

In [6]:
from sklearn.metrics import accuracy_score

# The search was run with refit enabled, so the best parameters have already been
# used to retrain on the whole training set; the search object can predict directly.
y_pred = grid_search_cv.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8695

# Grow Forest with Decision Tree

## 生成子训练集

生成1000个子训练集，每个子训练集有100个样本实例

In [25]:
from sklearn.model_selection import ShuffleSplit

# Draw 1000 random subsets of 100 training instances each. Only the first index
# array of every split is used; the second one is discarded.
rs = ShuffleSplit(n_splits=1000, train_size=100, random_state=42)
mini_sets = [
    (X_train[subset_index], y_train[subset_index])
    for subset_index, _ in rs.split(X_train)
]



用上述最佳参数在每个子集训练决策树，在测试集上评估这1000个决策树。  
由于它们是在较小的集合上训练的，因此这些决策树可能比第一个决策树表现更差。

In [26]:
import numpy as np  # imported here because no earlier cell brings `np` into scope
from sklearn import clone

# One tree per mini training set. Deriving the count from `mini_sets` avoids relying
# on an `n_trees` variable that no earlier cell defines.
n_trees = len(mini_sets)

# Each tree must be its own unfitted copy of the best estimator.
# `[clone(...)] * n_trees` would be wrong usage: list repetition repeats a reference
# to one single estimator object instead of creating independent trees.
forest = [clone(grid_search_cv.best_estimator_) for _ in range(n_trees)]

# Fit every tree on its own subset and record its accuracy on the full test set.
accuracy_scores = []
for tree, (X_mini_train, y_mini_train) in zip(forest, mini_sets):
    tree.fit(X_mini_train, y_mini_train)

    y_pred = tree.predict(X_test)
    accuracy_scores.append(accuracy_score(y_test, y_pred))

# Mean test accuracy of the individual trees (displayed as the cell output).
np.mean(accuracy_scores)

0.8065545

对于每个测试集实例，生成1000个决策树的预测，并且仅保留最频繁的预测。  
用SciPy的mode函数来执行多数投票预测。

In [27]:
import numpy as np  # imported here because no earlier cell brings `np` into scope

# Row i holds the predictions of tree i for every test instance, so the shape is
# (number of trees, number of test instances): (1000, 2000) in this notebook.
# The row count comes from `forest` itself rather than a separately kept counter.
Y_pred = np.empty([len(forest), len(X_test)], dtype=np.uint8)
for tree_index, tree in enumerate(forest):
    Y_pred[tree_index] = tree.predict(X_test)

In [28]:
from scipy.stats import mode

# Majority vote per test instance: the most frequent label down each column of
# Y_pred, together with the number of trees that voted for it.
vote = mode(Y_pred, axis=0)
y_pred_majority_votes, n_votes = vote.mode, vote.count

In [29]:
# Test accuracy of the majority-vote predictions, flattened to 1-D before scoring.
accuracy_score(y_test, y_pred_majority_votes.ravel())

0.87