In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
import re
from tool import *

%matplotlib inline

In [2]:
data_train = pd.read_csv('train.csv')
data_test = pd.read_csv('test.csv')
# The dataset is small, so stack the training and test rows into one frame and
# engineer features on both at once. pd.concat replaces DataFrame.append, which
# was removed in pandas 2.0; like append, it keeps each frame's own row labels,
# so index values repeat across the two halves.
data_raw = pd.concat([data_train, data_test])

In [3]:
# birdview(data_raw)

- **PassengerId：** 乘客ID
- **Pclass：** 乘客等级(1/2/3等舱位)
- **Name：** 乘客姓名
- **Sex：** 性别
- **Age：** 年龄
- **SibSp：** 同船的兄弟姐妹/配偶个数
- **Parch：** 父母与小孩个数
- **Ticket：** 船票信息
- **Fare：** 票价
- **Cabin：** 客舱
- **Embarked：** 登船港口

目标信息：
- **Survived：** 生还

# 特征工程思路
* 缺失值填补
* 切分出新特征
* 定量分箱转定性
* 交叉特征

In [4]:
# Work on a deep copy so that data_raw itself is never modified by the cleaning steps.
data_washed = data_raw.copy(deep=True)

## 缺失值处理

### Embarked 登船港口

In [5]:
# Show the passengers whose port of embarkation is missing.
data_washed.loc[data_washed["Embarked"].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
61,38.0,B28,,80.0,"Icard, Miss. Amelie",0,62,1,female,0,1.0,113572
829,62.0,B28,,80.0,"Stone, Mrs. George Nelson (Martha Evelyn)",0,830,1,female,0,1.0,113572


In [6]:
# Judging by the fare, the port of embarkation should be either S or C
# data_raw[data_raw.Fare >= 70].sort_values(by=['Fare']) 

# Passenger 62 is judged to have embarked at C
# data_raw[(data_raw.PassengerId>=50) & (data_raw.PassengerId<=70) \
#          & (data_raw.Sex=='female') & (data_raw.Pclass==1)] 
# Passenger 830 is judged to have embarked at S
# data_raw[(data_raw.PassengerId>=820) & (data_raw.PassengerId<=840) \
#          & (data_raw.Sex=='female') & (data_raw.Pclass==1)] 

In [7]:
# Fill the two missing ports with the manually chosen values:
# passenger 62 gets 'C' and passenger 830 gets 'S'.
for passenger_id, port in ((62, 'C'), (830, 'S')):
    data_washed.loc[data_washed["PassengerId"] == passenger_id, 'Embarked'] = port

### Fare 船票

In [8]:
# Show the passengers whose fare is missing.
data_washed.loc[data_washed["Fare"].isnull()]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
152,60.5,,S,,"Storey, Mr. Thomas",0,1044,3,male,0,,3701


In [9]:
# Find comparable passengers whose mean fare can stand in for the missing one:
# IDs 1000-1100, male, third class, no relatives aboard, embarked at S.
similar_passengers = (
    data_raw["PassengerId"].between(1000, 1100)
    & (data_raw["Sex"] == 'male')
    & (data_raw["Pclass"] == 3)
    & (data_raw["Parch"] == 0)
    & (data_raw["SibSp"] == 0)
    & (data_raw["Embarked"] == 'S')
)
data_raw[similar_passengers]

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
108,,,S,8.7125,"Willer, Mr. Aaron (Abi Weller"")""",0,1000,3,male,0,,3410
123,28.0,,S,7.25,"Carver, Mr. Alfred John",0,1015,3,male,0,,392095
126,22.0,,S,7.7958,"Brobeck, Mr. Karl Rudolf",0,1018,3,male,0,,350045
129,24.0,,S,8.05,"Petersen, Mr. Marius",0,1021,3,male,0,,342441
130,32.0,,S,8.05,"Spinner, Mr. Henry John",0,1022,3,male,0,,STON/OQ. 369943
134,43.0,,S,7.8958,"Dintcheff, Mr. Valtcho",0,1026,3,male,0,,349226
135,24.0,,S,7.8542,"Carlsson, Mr. Carl Robert",0,1027,3,male,0,,350409
147,22.0,,S,8.05,"Davies, Mr. Evan",0,1039,3,male,0,,SC/A4 23568
152,60.5,,S,,"Storey, Mr. Thomas",0,1044,3,male,0,,3701
155,24.0,,S,7.55,"Duquemin, Mr. Joseph",0,1047,3,male,0,,S.O./P.P. 752


In [10]:
# Impute passenger 1044's missing fare with the mean fare of comparable
# passengers (IDs 1000-1100, male, third class, travelling alone, embarked at
# S) whose own fare is known.
fare_peers = (
    data_raw["PassengerId"].between(1000, 1100)
    & (data_raw["Sex"] == 'male')
    & (data_raw["Pclass"] == 3)
    & (data_raw["Parch"] == 0)
    & (data_raw["SibSp"] == 0)
    & (data_raw["Embarked"] == 'S')
    & data_raw["Fare"].notnull()
)
data_washed.loc[data_washed["PassengerId"] == 1044, 'Fare'] = data_raw.loc[fare_peers, "Fare"].mean()

### Age 年龄
Age特征缺失值：Age有20%缺失值，缺失值较多，大量删除会减少样本信息，这里利用其它特征进行预测填补Age，会在后续进行处理

## 衍生变量

### CabinCat 客舱分类
Cabin特征缺失值：Cabin特征有70%的缺失值，较为严重，如果进行大量的填补会引入更多噪声。因为缺失值也是一种值，这里将缺失值单独归为一类（'0'），与Cabin的首字母一起编码为一个新的分类特征来处理

In [11]:
# Encode the first character of Cabin as an integer category code. Missing
# cabins are filled with '0' first, so they form a category of their own.
cabin_letters = data_washed["Cabin"].fillna('0').map(lambda cabin: cabin[0])
data_washed['CabinCat'] = pd.Categorical(cabin_letters).codes

### NameLength 姓名长度

In [12]:
# Derive the character length of each passenger's full name.
data_washed["NameLength"] = data_washed["Name"].map(len)

### Surname 姓氏

In [13]:
# The surname is the text before the first comma of Name, lower-cased.
data_washed['surname'] = data_washed["Name"].str.split(',').str[0].str.lower()

### Title 称呼

In [14]:
# Pull the title out of Name: the first run of letters that follows a space and
# ends with a period (e.g. "Mr" in "Storey, Mr. Thomas").
# The pattern is a raw string so that "\." is not parsed as an invalid Python
# string escape. Series.str.extract leaves NaN for a name without a title
# instead of raising AttributeError on a failed re.search.
data_washed["Title"] = data_washed["Name"].str.extract(r' ([A-Za-z]+)\.', expand=False)


### TitleCat 称呼量化

In [15]:
# Numeric code for every title. Titles listed together share one code, and
# code 8 is not used.
title_groups = [
    (1, ("Mr",)),
    (2, ("Miss", "Ms", "Mlle")),
    (3, ("Mrs", "Mme")),
    (4, ("Master",)),
    (5, ("Dr",)),
    (6, ("Rev",)),
    (7, ("Major", "Col", "Capt")),
    (9, ("Don", "Dona", "Sir")),
    (10, ("Lady", "Countess", "Jonkheer")),
]
title_mapping = {title: code for code, titles in title_groups for title in titles}
# Translate each Title into its numeric code; a title absent from
# title_mapping becomes NaN.
data_washed["TitleCat"] = data_washed["Title"].map(title_mapping)

### FamilySize 家庭Size

In [16]:
# Family size = siblings/spouses + parents/children + the passenger.
family_members = data_washed["SibSp"] + data_washed["Parch"] + 1
# Bin the size into three ordered classes: 1 -> 0, 2-4 -> 1, 5-20 -> 2.
data_washed["FamilySize"] = pd.cut(family_members, bins=[0,1,4,20], labels=[0,1,2])

### Embarked&Sex处理

In [17]:
# Replace Embarked with its integer category codes.
embarked_codes = pd.Categorical(data_washed["Embarked"]).codes
data_washed["Embarked"] = embarked_codes

# One-hot encode Sex and append the indicator columns to the frame.
sex_dummies = pd.get_dummies(data_washed['Sex'])
data_washed = pd.concat([data_washed, sex_dummies], axis=1)

### 高级衍生变量

In [18]:
# Woman / child / man label. Open question from the author: Age still has
# many missing values at this point.
child_age = 18


def get_person(passenger):
    """Label a passenger as 'child', 'female_adult' or 'male_adult'.

    Args:
        passenger: A two-item (age, sex) sequence.

    Returns:
        'child' when age is below child_age; otherwise 'female_adult' when sex
        is 'female', else 'male_adult'. A NaN age fails the comparison, so such
        passengers are labelled by sex alone.
    """
    age, sex = passenger
    if age < child_age:
        return 'child'
    return 'female_adult' if sex == 'female' else 'male_adult'
    
# Label every passenger with get_person and append the label as a 'person' column.
person_labels = pd.DataFrame(
    data_washed[['Age', 'Sex']].apply(get_person, axis=1), columns=['person']
)
data_washed = pd.concat([data_washed, person_labels], axis=1)

# One-hot encode the person label and append the indicator columns.
person_dummies = pd.get_dummies(data_washed['person'])
data_washed = pd.concat([data_washed, person_dummies], axis=1)

In [19]:
# Derived feature: parity of the cabin number
def get_type_cabine(cabine):
    """Classify a cabin string by the parity of the first number it contains.

    Args:
        cabine: Cabin text such as "B28". When it holds several numbers, only
            the first run of digits is used.

    Returns:
        '2' if that number is even, '1' if it is odd, and '0' when the value
        contains no digits or is not a string (e.g. a missing value).
    """
    if not isinstance(cabine, str):
        return '0'
    # Raw string: '\d' inside a plain literal is an invalid escape sequence.
    cabine_search = re.search(r'\d+', cabine)
    if cabine_search is None:
        return '0'
    # Integer arithmetic keeps the parity test exact for any number of digits.
    return '2' if int(cabine_search.group(0)) % 2 == 0 else '1'

# Replace missing cabins with a blank string so every value is text, then
# derive the parity class of each cabin.
data_washed["Cabin"] = data_washed["Cabin"].fillna(" ")
data_washed["CabinType"] = data_washed["Cabin"].apply(get_type_cabine)
# Series.value_counts replaces the deprecated top-level pd.value_counts.
print(data_washed["CabinType"].value_counts())

0    1020
2     154
1     135
Name: CabinType, dtype: int64


## 特殊变量

### surname衍生特征

In [20]:
# Build a per-surname ("clan") lookup table, then join it onto every passenger.

# Clan size. The counts are named explicitly so the column is always called
# 'Surname_Numbers', whatever name value_counts() gives its result (pandas 2.x
# calls it 'count', which turned the old rename into a silent no-op).
table_surname = data_washed["surname"].value_counts().rename("Surname_Numbers").to_frame()

# Passengers with at least one relative aboard (Parch or SibSp above zero).
with_relatives = (data_washed["Parch"] > 0) | (data_washed["SibSp"] > 0)

# 1.0 when the clan has at least one adult woman with relatives aboard who died.
# NOTE(review): this flag and the next are derived from Survived, i.e. from the
# training labels - confirm that this leakage is acceptable for validation.
dead_women = (
    (data_washed["female_adult"] == 1.0)
    & (data_washed["Survived"] == 0.0)
    & with_relatives
)
table_surname['Surname_dead_women'] = (
    table_surname.index.isin(data_washed.loc[dead_women, "surname"]).astype(float)
)

# 1.0 when the clan has at least one adult man with relatives aboard who survived.
surviving_men = (
    (data_washed["male_adult"] == 1.0)
    & (data_washed["Survived"] == 1.0)
    & with_relatives
)
table_surname['Surname_surviving_men'] = (
    table_surname.index.isin(data_washed.loc[surviving_men, "surname"]).astype(float)
)

# Integer id per surname; clans with fewer than 3 members share the id -1.
table_surname["Surname_Id"] = pd.Categorical(table_surname.index).codes
table_surname.loc[table_surname["Surname_Numbers"] < 3, "Surname_Id"] = -1
# Bin the clan size only after the "< 3" comparison above, which needs the raw
# counts: 1 -> 0, 2-4 -> 1, 5-20 -> 2.
table_surname["Surname_Numbers"] = pd.cut(table_surname["Surname_Numbers"], bins=[0,1,4,20], labels=[0,1,2])

# Attach the clan features to each passenger row. The recorded run of this
# merge printed a "Buffer dtype mismatch" warning.
data_washed = pd.merge(data_washed, table_surname, left_on="surname", right_index=True, how='left', sort=False)

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


### ticket的衍生特征

In [21]:
# Build a per-ticket lookup table, then join it onto every passenger.

# Number of passengers sharing each ticket. The counts are named explicitly so
# the column is always called 'Ticket_Numbers', whatever name value_counts()
# gives its result (pandas 2.x calls it 'count', which turned the old rename
# into a silent no-op).
table_ticket = data_washed["Ticket"].value_counts().rename("Ticket_Numbers").to_frame()

# Passengers with at least one relative aboard (Parch or SibSp above zero).
with_relatives = (data_washed["Parch"] > 0) | (data_washed["SibSp"] > 0)

# 1.0 when the ticket covers at least one adult woman with relatives aboard who died.
# NOTE(review): this flag and the next are derived from Survived, i.e. from the
# training labels - confirm that this leakage is acceptable for validation.
dead_women = (
    (data_washed["female_adult"] == 1.0)
    & (data_washed["Survived"] == 0.0)
    & with_relatives
)
table_ticket['Ticket_dead_women'] = (
    table_ticket.index.isin(data_washed.loc[dead_women, "Ticket"]).astype(float)
)

# 1.0 when the ticket covers at least one adult man with relatives aboard who survived.
surviving_men = (
    (data_washed["male_adult"] == 1.0)
    & (data_washed["Survived"] == 1.0)
    & with_relatives
)
table_ticket['Ticket_surviving_men'] = (
    table_ticket.index.isin(data_washed.loc[surviving_men, "Ticket"]).astype(float)
)

# Integer id per ticket; tickets held by fewer than 3 passengers share the id -1.
table_ticket["Ticket_Id"] = pd.Categorical(table_ticket.index).codes
table_ticket.loc[table_ticket["Ticket_Numbers"] < 3, "Ticket_Id"] = -1
# Bin the group size only after the "< 3" comparison above, which needs the raw
# counts: 1 -> 0, 2-4 -> 1, 5-20 -> 2.
table_ticket["Ticket_Numbers"] = pd.cut(table_ticket["Ticket_Numbers"], bins=[0,1,4,20], labels=[0,1,2])

# Attach the ticket features to each passenger row. The recorded run of this
# merge printed a "Buffer dtype mismatch" warning.
data_washed = pd.merge(data_washed, table_ticket, left_on="Ticket", right_index=True, how='left', sort=False)

ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'

Exception ignored in: 'pandas._libs.lib.is_bool_array'
ValueError: Buffer dtype mismatch, expected 'Python object' but got 'long'


### Age的缺失填补

In [22]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesRegressor

# Predictor columns used to estimate a passenger's age.
classers = ['Fare','Parch','Pclass','SibSp','TitleCat','CabinCat',
            'female','male', 'Embarked', 'FamilySize', 'NameLength','Ticket_Numbers','Ticket_Id']

etr = ExtraTreesRegressor(n_estimators=200,random_state=0)

# Rows with a known Age train the regressor; rows without one get predictions.
age_missing = data_washed['Age'].isnull()
X_train = data_washed.loc[~age_missing, classers]
Y_train = data_washed.loc[~age_missing, 'Age']
X_test = data_washed.loc[age_missing, classers]

# Skip the fit once nothing is missing so the cell can be re-run safely;
# predicting on zero rows would raise. `.values` replaces DataFrame.as_matrix(),
# which was removed in pandas 1.0.
if age_missing.any():
    etr.fit(X_train.values, np.ravel(Y_train))
    age_preds = etr.predict(X_test.values)
    data_washed.loc[age_missing, 'Age'] = age_preds

# 特征选择

In [23]:
# Split the engineered frame back into training and test parts by row
# position. data_raw was built as train-then-test, so the boundary is the
# length of data_train rather than a hard-coded 891.
n_train = len(data_train)
X_train = data_washed.iloc[:n_train].copy()
y_train = data_train["Survived"]
X_test = data_washed.iloc[n_train:].copy()

In [24]:
from sklearn.feature_selection import SelectKBest, f_classif, chi2

# Candidate features to score against Survived.
features = [
    'female', 'male', 'Age', 'male_adult', 'female_adult', 'child', 'TitleCat',
    'Pclass', 'Ticket_Id', 'NameLength', 'CabinType', 'CabinCat', 'SibSp', 'Parch',
    'Fare', 'Embarked', 'Surname_Numbers', 'Ticket_Numbers', 'FamilySize',
    'Ticket_dead_women', 'Ticket_surviving_men',
    'Surname_dead_women', 'Surname_surviving_men',
]

# k equals the number of features, so nothing is dropped; the selector is used
# only for its per-feature ANOVA F-test p-values.
selector = SelectKBest(f_classif, k=len(features))
selector.fit(X_train[features], y_train)

# Score = -log10(p-value): a larger score means a smaller p-value.
scores = -np.log10(selector.pvalues_)
indices = np.argsort(scores)[::-1]

print("Features importance :")
for idx in indices:
    print("%0.2f %s" % (scores[idx], features[idx]))

Features importance :
68.85 female
68.85 male
64.90 male_adult
53.23 female_adult
26.22 TitleCat
24.60 Pclass
23.69 NameLength
17.75 CabinCat
17.00 Ticket_surviving_men
16.28 CabinType
14.21 Fare
13.54 Ticket_dead_women
13.04 Surname_surviving_men
10.36 Surname_dead_women
6.55 Embarked
5.27 Ticket_Numbers
3.59 child
2.93 FamilySize
1.83 Parch
1.65 Age
1.07 Ticket_Id
0.73 Surname_Numbers
0.53 SibSp


# 建模预测

In [25]:
# Keep every scored feature for modelling. This binds a second name to the
# same list object as `features`; it is not a copy.
features_selected= features

In [37]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier,GradientBoostingClassifier

# random_state pins the forest's bootstrap and feature sampling, so the scores
# and importances below are the same on every run (0 matches the seed of the
# age-imputation regressor).
rfc = RandomForestClassifier(n_estimators=3000, min_samples_split=4,
                             class_weight={0:0.745,1:0.255}, random_state=0)
# Alternatives that were tried:
# rfc = AdaBoostClassifier(n_estimators=3000, learning_rate=0.1, random_state=1)
# rfc = GradientBoostingClassifier(n_estimators=3000)

# 3-fold cross-validated accuracy on the training rows.
scores = cross_val_score(rfc, X_train[features_selected], y_train, cv=3)
print("Accuracy: %0.3f (+/- %0.2f) [%s]" % (scores.mean()*100, scores.std()*100, 'RFC Cross Validation'))

# Refit on all training rows. The score printed next is measured on those same
# rows, so it is training accuracy, not a held-out estimate.
rfc.fit(X_train[features_selected], y_train)
score = rfc.score(X_train[features_selected], y_train)
print("Accuracy: %0.3f           [%s]" % (score*100, 'RFC full test'))

# List the features from most to least important; "feature N" is the
# feature's 1-based position in features_selected.
importances = rfc.feature_importances_
indices = np.argsort(importances)[::-1]
for rank, idx in enumerate(indices, start=1):
    print("%d. feature %d (%f) %s" % (rank,
                                      idx + 1,
                                      importances[idx]*100,
                                      features_selected[idx]))

Accuracy: 89.450 (+/- 0.57) [RFC Cross Validation]
Accuracy: 96.409           [RFC full test]
1. feature 15 (9.496257) Fare
2. feature 3 (9.169027) Age
3. feature 10 (8.600774) NameLength
4. feature 20 (7.768113) Ticket_dead_women
5. feature 22 (7.092957) Surname_dead_women
6. feature 7 (7.028014) TitleCat
7. feature 2 (6.691796) male
8. feature 1 (5.841961) female
9. feature 8 (5.306097) Pclass
10. feature 4 (4.797643) male_adult
11. feature 21 (4.481418) Ticket_surviving_men
12. feature 5 (3.525052) female_adult
13. feature 18 (2.831558) Ticket_Numbers
14. feature 12 (2.783614) CabinCat
15. feature 23 (2.589469) Surname_surviving_men
16. feature 19 (2.405441) FamilySize
17. feature 11 (1.967324) CabinType
18. feature 17 (1.941359) Surname_Numbers
19. feature 16 (1.693444) Embarked
20. feature 9 (1.345310) Ticket_Id
21. feature 13 (1.247839) SibSp
22. feature 14 (0.700526) Parch
23. feature 6 (0.695008) child


In [38]:
# Predict survival for the test rows and write the result to CSV, indexed by
# PassengerId.
PassengerId = data_test["PassengerId"].values.astype(int)
predictions = rfc.predict(X_test[features_selected])
my_prediction = pd.DataFrame(data=predictions, index=PassengerId, columns=["Survived"])
my_prediction.to_csv("prediction01.csv", index_label=["PassengerId"])