# 决策树

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
# To reproduce the results shown in the tutorial video, use the dataset hosted online.
file_path = "http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic.txt"
# read_csv accepts a URL as well as a local path.
df = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first ten rows.
df.head(n=10)

Unnamed: 0,row.names,pclass,survived,name,age,embarked,home.dest,room,ticket,boat,sex
0,1,1st,1,"Allen, Miss Elisabeth Walton",29.0,Southampton,"St Louis, MO",B-5,24160 L221,2,female
1,2,1st,0,"Allison, Miss Helen Loraine",2.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
2,3,1st,0,"Allison, Mr Hudson Joshua Creighton",30.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,(135),male
3,4,1st,0,"Allison, Mrs Hudson J.C. (Bessie Waldo Daniels)",25.0,Southampton,"Montreal, PQ / Chesterville, ON",C26,,,female
4,5,1st,1,"Allison, Master Hudson Trevor",0.9167,Southampton,"Montreal, PQ / Chesterville, ON",C22,,11,male
5,6,1st,1,"Anderson, Mr Harry",47.0,Southampton,"New York, NY",E-12,,3,male
6,7,1st,1,"Andrews, Miss Kornelia Theodosia",63.0,Southampton,"Hudson, NY",D-7,13502 L77,10,female
7,8,1st,0,"Andrews, Mr Thomas, jr",39.0,Southampton,"Belfast, NI",A-36,,,male
8,9,1st,1,"Appleton, Mrs Edward Dale (Charlotte Lamson)",58.0,Southampton,"Bayside, Queens, NY",C-101,,2,female
9,10,1st,0,"Artagaveytia, Mr Ramon",71.0,Cherbourg,"Montevideo, Uruguay",,,(22),male


In [12]:
# Prepare the data: separate the feature columns from the target column.
# Features: passenger class, age and sex; target: the `survived` column.
x = df.loc[:, ["pclass", "age", "sex"]]
y = df.loc[:, "survived"]
# Preview the first ten feature rows.
x.head(n=10)

Unnamed: 0,pclass,age,sex
0,1st,29.0,female
1,1st,2.0,female
2,1st,30.0,male
3,1st,25.0,female
4,1st,0.9167,male
5,1st,47.0,male
6,1st,63.0,female
7,1st,39.0,male
8,1st,58.0,female
9,1st,71.0,male


In [30]:
# Handle missing values: replace missing ages with the mean age.
# The filled result must be bound back to `x`; calling
# `fillna(..., inplace=True)` on a temporary `.copy()` only modifies that
# throwaway copy and leaves `x` (and its NaNs) untouched.
# Building a new frame instead of mutating in place also keeps this cell
# idempotent when it is re-run.
x = x.fillna({"age": x["age"].mean()})

In [31]:
# Split the data into a training set and a test set.
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=666,
)

In [32]:
# Feature engineering: one-hot encode the categorical columns.
from sklearn.feature_extraction import DictVectorizer
# Named `vectorizer` rather than `dict` so the built-in `dict` type is not shadowed.
vectorizer = DictVectorizer(sparse=False)
# Convert every training row to a {column: value} dict and learn the
# feature vocabulary from the training data only.
X_train = vectorizer.fit_transform(X_train.to_dict(orient="records"))
# Encode the test rows with the vocabulary learned above. Re-fitting on the
# test set (fit_transform) could produce a different column order or count
# whenever the test split is missing a category, silently misaligning features.
X_test = vectorizer.transform(X_test.to_dict(orient="records"))
X_train  # pclass has three categories and sex has two, so the five 0/1 columns after age encode pclass and sex

array([[32.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ],
       [31.19418104,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ],
       [19.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ],
       ...,
       [55.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 6.        ,  0.        ,  0.        ,  1.        ,  0.        ,
         1.        ],
       [18.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ]])

In [33]:
# Fit a decision tree and report its accuracy on the test set.
from sklearn.tree import DecisionTreeClassifier
# Seed the estimator (same seed as the train/test split) so that ties between
# equally good splits are broken the same way on every run and the reported
# score is reproducible.
dec = DecisionTreeClassifier(random_state=666)
dec.fit(X_train, y_train)
# Mean accuracy on the held-out test data.
dec.score(X_test, y_test)

0.8145896656534954

In [34]:
# Visualize the decision tree by exporting it in Graphviz DOT format.
from sklearn.tree import export_graphviz
# Human-readable labels for the columns of the encoded feature matrix.
# NOTE(review): presumably this matches the column order produced by the
# one-hot encoding step — verify if the feature set changes.
feature_names = [
    "age",
    "pclass=1st",
    "pclass=2nd",
    "pclass=3rd",
    "sex=female",
    "sex=male",
]
export_graphviz(dec, out_file="./Titanic.dot", feature_names=feature_names)

# 随机森林

In [35]:
from sklearn.ensemble import RandomForestClassifier
# Seed the forest (same seed as the train/test split): bootstrap sampling and
# per-split feature sub-sampling are random, so without a seed both the chosen
# hyper-parameters and the reported score change from run to run.
rf = RandomForestClassifier(random_state=666)
# Grid search with cross-validation.
from sklearn.model_selection import GridSearchCV
# Candidate hyper-parameters: every combination of forest size and tree depth is evaluated.
param_grid = {
    "n_estimators": [120, 200, 300, 500, 800, 1200],
    "max_depth": [5, 8, 15, 25, 30],
}
# cv=2: each candidate is scored with 2-fold cross-validation on the training set.
gc = GridSearchCV(rf, param_grid, cv=2)
gc.fit(X_train, y_train)
# Accuracy of the best estimator found by the search on the held-out test set.
print(gc.score(X_test, y_test))
# The hyper-parameter combination that scored best in cross-validation.
print(gc.best_params_)

0.8358662613981763
{'max_depth': 5, 'n_estimators': 500}
