### Titanic Machine Learning from Disaster
https://www.kaggle.com/c/titanic

In [1]:
# 做完特徵工程前的所有準備 (與前範例相同)
import pandas as pd
import numpy as np
import copy, time
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

# Suppress all warning messages.
# NOTE(review): this is a blanket 'ignore' filter, so it hides every warning category,
# not only the noisy ones; consider narrowing it if a warning ever needs to be seen.
import warnings
warnings.filterwarnings('ignore')

# Load the Titanic train / test CSV files from the local data directory.
data_path = '../data/'
df_train = pd.read_csv(data_path + 'titanic_train.csv')
df_test = pd.read_csv(data_path + 'titanic_test.csv')

# Keep what is needed later before the columns are dropped:
#   train_Y   - the target column,
#   train_num - number of training rows, used to split the stacked frame back apart,
#   ids       - test PassengerId values, used for the submission file.
train_Y = df_train['Survived']
train_num = train_Y.shape[0]
ids = df_test['PassengerId']

# Remove the identifier, target and name columns so only candidate features remain.
df_train = df_train.drop(columns=['PassengerId', 'Survived', 'Name'])
df_test = df_test.drop(columns=['PassengerId', 'Name'])

# Stack train and test so both go through identical preprocessing.
# ignore_index=True gives the stacked frame a fresh unique 0..N-1 index; without it the
# row labels of df_train and df_test overlap, which makes label-based selection and
# index-aligned column assignment on df ambiguous.
df = pd.concat([df_train, df_test], ignore_index=True)
df.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,3,male,22.0,1,0,A/5 21171,7.25,,S
1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,female,35.0,1,0,113803,53.1,C123,S
4,3,male,35.0,0,0,373450,8.05,,S


In [2]:
# Collect the categorical (object dtype) columns into object_features.
object_features = [
    feature for dtype, feature in zip(df.dtypes, df.columns) if dtype == 'object'
]
# These are the object-typed columns, so the label says "Object", not "Numeric".
print(f'{len(object_features)} Object Features : {object_features}\n')

# Keep only the categorical columns; missing values become the literal category 'None'.
df_object = df[object_features]
df_object = df_object.fillna('None')
print(df_object.shape)
print(df.shape)

4 Numeric Features : ['Sex', 'Ticket', 'Cabin', 'Embarked']

(1309, 4)
(1309, 9)


In [3]:
# Gather the int64 / float64 columns into num_features.
num_features = [
    column
    for kind, column in zip(df.dtypes, df.columns)
    if kind == 'float64' or kind == 'int64'
]
print(f'{len(num_features)} Numeric Features : {num_features}\n')

# Keep only the numeric columns, replace missing values with 0,
# then rescale every column with MinMaxScaler.
df_num = MinMaxScaler().fit_transform(df[num_features].fillna(0))
print(df_num.shape)
print(df.shape)

5 Numeric Features : ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

(1309, 5)
(1309, 9)


In [10]:
# MinMaxScaler + One hot encoding + RandomForestClassifier

# Write the filled categorical columns and the scaled numeric columns back into df.
df[object_features] = df_object
df[num_features] = df_num

# One-hot encode the categorical columns, then split the stacked frame back into
# train / test by position: the first train_num rows are the training set.
df_temp = pd.get_dummies(df)
train_X = df_temp.iloc[:train_num]
test_X = df_temp.iloc[train_num:]

# A fixed random_state makes the submission file and the cross-validation score
# reproducible from run to run; without it every execution gives different results.
estimator = RandomForestClassifier(n_estimators=500, random_state=42)
estimator.fit(train_X, train_Y)
pred = estimator.predict(test_X)

# Write the baseline submission file.
sub = pd.DataFrame({'PassengerId': ids, 'Survived': pred})
sub.to_csv('titanic_baseline.csv', index=False)

# The timer starts here, so it covers only the 10-fold cross-validation
# (not the fit / predict above).
start = time.time()
print(f'score : {cross_val_score(estimator, train_X, train_Y, cv=10).mean()}')
print(f'time : {time.time() - start} sec')
df_temp.head()

score : 0.8372610940869368
time : 13.587064981460571 sec


Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Ticket_110152,Ticket_110413,Ticket_110465,...,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_None,Cabin_T,Embarked_C,Embarked_None,Embarked_Q,Embarked_S
0,1.0,0.275,0.125,0.0,0.014151,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
1,0.0,0.475,0.125,0.0,0.139136,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1.0,0.325,0.0,0.0,0.015469,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0.0,0.4375,0.125,0.0,0.103644,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1.0,0.4375,0.0,0.0,0.015713,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
