In [2]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer # 对非数字化的数据进行数值化方法
from sklearn.tree import DecisionTreeClassifier

In [3]:
# 1.数据加载
train_data = pd.read_csv('C:/Users/18280/Desktop/RS/主课/L2/Titanic_Data-master/train.csv')
test_data = pd.read_csv('C:/Users/18280/Desktop/RS/主课/L2/Titanic_Data-master/test.csv')

In [4]:
# 数据概况
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [5]:
print(train_data.describe()) # include=None (default) : The result will include all numeric columns.
print('-'*30)
print(train_data.describe(include=['O'])) # include['O'] To select pandas categorical columns, use 'category'
print('-'*30)
print(train_data.describe(exclude=['O']))
#explination: https://pandas.pydata.org/pandas-docs/version/0.21/generated/pandas.DataFrame.describe.html

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  
------------------------------
                             Name   Sex    Ticket 

In [6]:
print(train_data.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [7]:
print(train_data.tail())

     PassengerId  Survived  Pclass                                      Name  \
886          887         0       2                     Montvila, Rev. Juozas   
887          888         1       1              Graham, Miss. Margaret Edith   
888          889         0       3  Johnston, Miss. Catherine Helen "Carrie"   
889          890         1       1                     Behr, Mr. Karl Howell   
890          891         0       3                       Dooley, Mr. Patrick   

        Sex   Age  SibSp  Parch      Ticket   Fare Cabin Embarked  
886    male  27.0      0      0      211536  13.00   NaN        S  
887  female  19.0      0      0      112053  30.00   B42        S  
888  female   NaN      1      2  W./C. 6607  23.45   NaN        S  
889    male  26.0      0      0      111369  30.00  C148        C  
890    male  32.0      0      0      370376   7.75   NaN        Q  


In [11]:
#数据清洗
# 使用平均年龄来填充年龄中的 nan 值
train_data['Age'].fillna(train_data['Age'].mean(), inplace=True)
test_data['Age'].fillna(test_data['Age'].mean(), inplace=True)
# 使用票价的均值填充票价中的 nan 值
train_data['Fare'].fillna(train_data['Fare'].mean(), inplace=True)
test_data['Fare'].fillna(test_data['Fare'].mean(), inplace=True)
# 使用登录最多的港口来填充登录港口的 nan 值
train_data['Embarked'].fillna('S', inplace=True)
test_data['Embarked'].fillna('S', inplace=True)

![dict](https://images2017.cnblogs.com/blog/1161096/201709/1161096-20170908174736882-1862873544.png)

In [16]:
# 特征选择'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'
features = ['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']
train_features = train_data[features]
train_labels = train_data['Survived']
test_features = test_data[features]

#特征向量化处理 https://www.cnblogs.com/hellcat/p/7886765.html
# 1 DataFrame字典化，2字典向量化
dvec = DictVectorizer(sparse=False) 
#DictVectorizer： 将dict类型的list数据，转换成numpy array，具有属性vec.feature_names_，查看提取后的特征名。
print(train_features.to_dict(orient='record')[1:3])
train_features = dvec.fit_transform(train_features.to_dict(orient='record'))
print(dvec.feature_names_)

[{'Pclass': 1, 'Sex': 'female', 'Age': 38.0, 'SibSp': 1, 'Parch': 0, 'Fare': 71.2833, 'Embarked': 'C'}, {'Pclass': 3, 'Sex': 'female', 'Age': 26.0, 'SibSp': 0, 'Parch': 0, 'Fare': 7.925, 'Embarked': 'S'}]
['Age', 'Embarked=C', 'Embarked=Q', 'Embarked=S', 'Fare', 'Parch', 'Pclass', 'Sex=female', 'Sex=male', 'SibSp']


In [17]:
# ID3模型构造&训练
clf = DecisionTreeClassifier(criterion='entropy')
clf.fit(train_features,train_labels)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [18]:
#预测
test_features = dvec.transform(test_features.to_dict(orient='record'))
pred_labels = clf.predict(test_features)

In [19]:
#accuracy evaluate
acc_decision_tree = round(clf.score(train_features,train_labels),6)
print('预测准确率：%.4lf'%acc_decision_tree)

预测准确率：0.9820


In [24]:
# Cart模型构造
clf1 = DecisionTreeClassifier()
clf1.fit(train_features,train_labels)
pre_labels = clf1.predict(test_features)
acc_decision_tree = round(clf.score(train_features,train_labels),6)
print('准确率为%.4lf'%acc_decision_tree)

准确率为0.9820
