In [1]:
import pandas as pd
import sklearn

In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df.columns

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')

In [4]:
clean = df.copy()
# Female is encoded as 0; male as 1
clean['sex'] = clean.sex.map({'female': 0, 'male': 1}).astype(int)
clean.head(5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",0,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",1,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",0,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",1,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",0,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [5]:
clean.embarked.value_counts()

S    914
C    270
Q    123
Name: embarked, dtype: int64

In [6]:
embark_dummies = pd.get_dummies(clean[['embarked']])

In [7]:
clean = pd.concat([clean, embark_dummies], axis=1)
del clean['embarked']
del clean['name']
del clean['home.dest']
del clean['body']
del clean['ticket']
del clean['boat']
clean

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,cabin,embarked_C,embarked_Q,embarked_S
0,1,1,0,29.0000,0,0,211.3375,B5,0,0,1
1,1,1,1,0.9167,1,2,151.5500,C22 C26,0,0,1
2,1,0,0,2.0000,1,2,151.5500,C22 C26,0,0,1
3,1,0,1,30.0000,1,2,151.5500,C22 C26,0,0,1
4,1,0,0,25.0000,1,2,151.5500,C22 C26,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,0,14.5000,1,0,14.4542,,1,0,0
1305,3,0,0,,1,0,14.4542,,1,0,0
1306,3,0,1,26.5000,0,0,7.2250,,1,0,0
1307,3,0,1,27.0000,0,0,7.2250,,1,0,0


In [8]:
cabins = pd.get_dummies(clean.cabin.str[0])
clean = pd.concat([clean, cabins], axis=1)
del clean['cabin']
clean

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked_C,embarked_Q,embarked_S,A,B,C,D,E,F,G,T
0,1,1,0,29.0000,0,0,211.3375,0,0,1,0,1,0,0,0,0,0,0
1,1,1,1,0.9167,1,2,151.5500,0,0,1,0,0,1,0,0,0,0,0
2,1,0,0,2.0000,1,2,151.5500,0,0,1,0,0,1,0,0,0,0,0
3,1,0,1,30.0000,1,2,151.5500,0,0,1,0,0,1,0,0,0,0,0
4,1,0,0,25.0000,1,2,151.5500,0,0,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,0,14.5000,1,0,14.4542,1,0,0,0,0,0,0,0,0,0,0
1305,3,0,0,,1,0,14.4542,1,0,0,0,0,0,0,0,0,0,0
1306,3,0,1,26.5000,0,0,7.2250,1,0,0,0,0,0,0,0,0,0,0
1307,3,0,1,27.0000,0,0,7.2250,1,0,0,0,0,0,0,0,0,0,0


In [9]:
clean = clean.fillna(999)

In [10]:
clean.to_csv('titanic-cleaned.csv')

In [11]:
X = clean.drop('survived', axis=1)
y = clean['survived']

In [18]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [19]:
X_train.shape, X_test.shape

((981, 17), (328, 17))

In [20]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [21]:
rf.predict(X_test)

array([0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0,

In [22]:
rf.score(X_test, y_test)

0.7926829268292683