In [197]:
import pandas as pd
import numpy as np

# Load the training split into the working frame used by the cells below.
train_csv_path = './data/train.csv'
df = pd.read_csv(train_csv_path)

In [198]:
# These columns are not used as model features, so remove them up front.
unused_columns = ['Name', 'Ticket', 'Cabin']
df = df.drop(unused_columns, axis=1)

In [199]:
# Summarize the remaining columns, their dtypes and their non-null counts
# to see which columns still contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 62.7+ KB


In [200]:
# Replace missing ages with the mean of the observed training ages.
# age_mean is kept as a notebook variable because the test set is
# imputed with the same value later on.
age_column = df['Age']
age_mean = age_column.mean()
df['Age'] = age_column.fillna(age_mean)

In [201]:
from scipy.stats import mode

# Fill missing ports with the most frequent port in the training data.
# Series.mode() ignores missing values and handles string columns directly;
# scipy.stats.mode is aimed at numeric arrays, and indexing its result with
# [0][0] depends on the SciPy version, so pandas is the safer choice here.
mode_embarked = df['Embarked'].mode().iloc[0]
df['Embarked'] = df['Embarked'].fillna(mode_embarked)

In [202]:
# Encode the two categorical columns as integers so they can be fed to the model.
gender_codes = {'female': 0, 'male': 1}
port_codes = {'C': 1, 'S': 2, 'Q': 3}

df['Gender'] = df['Sex'].map(gender_codes).astype(int)
df['Port'] = df['Embarked'].map(port_codes).astype(int)

# The original string columns are redundant once their encoded versions exist.
df = df.drop(['Sex', 'Embarked'], axis=1)

For convenience, we move the Survived column to the left-most position, so that the label sits at index 0 (columns are zero-indexed).

In [203]:
# Put the label column first, selecting it by name rather than by position.
# A positional swap of the first two columns undoes itself when the cell is
# re-run; this version yields the same order no matter how often it runs.
label_col = 'Survived'
cols = [label_col] + [col for col in df.columns.tolist() if col != label_col]
df = df[cols]

In [204]:
# Re-inspect the frame after imputation, encoding and reordering:
# column order, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
Survived       891 non-null int64
PassengerId    891 non-null int64
Pclass         891 non-null int64
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Gender         891 non-null int64
Port           891 non-null int64
dtypes: float64(2), int64(7)
memory usage: 62.7 KB


In [205]:
# Convert the prepared frame to a NumPy array; the model cell slices the
# label and feature columns out of it by position.
train_data = df.values

In [206]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest so that a fresh "Restart & Run All" reproduces the same
# model and therefore the same submission file.
model = RandomForestClassifier(n_estimators=100, random_state=0)

# Column 0 holds the label (Survived) and column 1 holds PassengerId, which is
# an identifier rather than a feature; the features start at column 2.
model = model.fit(train_data[:, 2:], train_data[:, 0])

In [207]:
# Load the test split and list its columns, dtypes and non-null counts.
test_csv_path = './data/test.csv'
df_test = pd.read_csv(test_csv_path)

df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [208]:
# Mirror the training preprocessing: remove the same columns and fill
# missing ages with the mean age computed on the training data.
dropped_test_columns = ['Name', 'Ticket', 'Cabin']
df_test = df_test.drop(dropped_test_columns, axis=1)
df_test['Age'] = df_test['Age'].fillna(value=age_mean)

For the Fare column, however, it makes more sense to fill in NaN values with the mean fare of the passenger's class (the Pclass column) rather than the overall mean.

In [209]:
# Mean training fare per passenger class, as a one-column frame indexed by Pclass.
fare_means = pd.pivot_table(df, values='Fare', index='Pclass', aggfunc='mean')
fare_means

Unnamed: 0_level_0,Fare
Pclass,Unnamed: 1_level_1
1,84.154687
2,20.662183
3,13.67555


Here we create a pivot table holding the mean of the Fare column for each Pclass value, which we will use to fill in the NaN values.

In [210]:
# Check the structure of the lookup table, then view test rows 150-159;
# row 152 is the one with the missing Fare.
fare_means.info()
df_test.iloc[150:160]

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 1 to 3
Data columns (total 1 columns):
Fare    3 non-null float64
dtypes: float64(1)
memory usage: 48.0 bytes


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
150,1042,1,female,23.0,0,1,83.1583,C
151,1043,3,male,29.699118,0,0,7.8958,C
152,1044,3,male,60.5,0,0,,S
153,1045,3,female,36.0,0,2,12.1833,S
154,1046,3,male,13.0,4,2,31.3875,S
155,1047,3,male,24.0,0,0,7.55,S
156,1048,1,female,29.0,0,0,221.7792,S
157,1049,3,female,23.0,0,0,7.8542,S
158,1050,1,male,42.0,0,0,26.55,S
159,1051,3,female,26.0,0,2,13.775,S


In [211]:
# Fill each missing fare with the mean training fare of that passenger's class.
# The value is looked up in fare_means (indexed by Pclass) instead of being
# hard-coded, so it stays correct for any class and if the training data changes.
class_mean_fare = df_test['Pclass'].map(fare_means['Fare'])
df_test['Fare'] = df_test['Fare'].fillna(class_mean_fare)

In [212]:
# Encode the categorical columns with the same integer codes used for training.
test_gender_codes = {'female': 0, 'male': 1}
test_port_codes = {'C': 1, 'S': 2, 'Q': 3}
df_test['Gender'] = df_test['Sex'].map(test_gender_codes).astype(int)
df_test['Port'] = df_test['Embarked'].map(test_port_codes)

# Drop the string columns now that their encoded versions exist.
df_test = df_test.drop(['Sex', 'Embarked'], axis='columns')

test_data = df_test.values

# Column 0 is PassengerId; every column after it is passed to the model.
test_features = test_data[:, 1:]
output = model.predict(test_features)

In [213]:
# Pair each passenger id with its predicted label, both as integers.
passenger_ids = test_data[:, 0].astype(int)
predicted_labels = output.astype(int)
result = np.c_[passenger_ids, predicted_labels]

# result has exactly two columns, so it maps directly onto the output schema.
submission_columns = ['PassengerId', 'Survived']
df_result = pd.DataFrame(result[:, 0:2], columns=submission_columns)

df_result.to_csv('./results/titanic_1-1.csv', index=False)

In [214]:
df_result.shape

(418, 2)