In [15]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import linear_model

try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # scikit-learn < 0.18 only provides sklearn.cross_validation
    from sklearn.cross_validation import cross_val_score

In [16]:
# Input CSV locations, relative to the notebook's working directory
TEST_PATH = "data/test.csv"    # rows the model predicts on
TRAIN_PATH = "data/train.csv"  # rows the model is fitted on

In [17]:
# Load each CSV into its own DataFrame (training file first, then test file)
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH)
)

In [18]:
# Encode the 'Sex' column numerically (male -> 0, female -> 1) in new frames,
# leaving train_data / test_data as they were read from disk
train, test = (
    raw_frame.replace(to_replace={'Sex': {'male': 0, 'female': 1}})
    for raw_frame in (train_data, test_data)
)

In [19]:
def replace_na (data_set):
    """Fill missing ``Age`` values with the mean age of the row's sex.

    The frame is modified in place. Only the ``Age`` column is written, so
    missing values in any other column are left exactly as they were.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Must contain an ``Age`` column and a ``Sex`` column already encoded
        as 0 (male) / 1 (female).

    Returns
    -------
    None
        The result is the mutated ``data_set``; the mean age of each sex is
        printed as a side effect.
    """
    age_missing = data_set['Age'].isnull()

    for label, sex_code in (('male', 0), ('female', 1)):
        is_sex = data_set['Sex'] == sex_code

        # Series.mean() skips NaN, so this is the mean of the known ages only
        mean_age = data_set.loc[is_sex, 'Age'].mean()
        # Single pre-formatted argument: prints the same text on Python 2 and 3
        print("Mean %s age:  %s" % (label, mean_age))

        # Target the Age cell only; assigning a filled copy of the whole row
        # would also overwrite NaN in unrelated columns with the mean age
        to_fill = is_sex & age_missing
        if to_fill.any():
            data_set.loc[to_fill, 'Age'] = mean_age
    return

In [20]:
# Impute missing ages in both tables. replace_na modifies the frame it is given
# and returns nothing; it also prints the per-sex mean ages it computed.
replace_na(train)
replace_na(test)

Mean male age:  30.7266445916
Mean female age:  27.9157088123
Mean male age:  30.2727317073
Mean female age:  30.2723622047


In [24]:
# Build both feature matrices from the same five columns, in the same order
x_train, x_test = (
    frame.loc[:, ['Pclass', 'Age', 'Sex', 'SibSp', 'Parch']].values
    for frame in (train, test)
)

# Target labels exist only in the training table
y_train = train.loc[:, 'Survived'].values

In [25]:
# Fit on the full training set; this fitted model is the one used for prediction
lr = linear_model.LogisticRegression(random_state=1).fit(x_train, y_train)

# Cross validation, k=3.
# Call the explicitly imported cross_val_score instead of `sk.cross_validation...`:
# `import sklearn as sk` does not load that submodule by itself, and
# sklearn.cross_validation was removed in scikit-learn 0.20.
scores = cross_val_score(lr, x_train, y_train, cv=3)
# Parenthesised single argument prints identically on Python 2 and 3
print(scores.mean())

0.787878787879


In [26]:
# Passenger ids come from the test table as read from disk; labels from the fitted model
p_id = test_data['PassengerId'].values.copy()
predicted_values = lr.predict(x_test)

# Assemble the two-column submission table, save it without the index, then show it
submission = pd.DataFrame({
    'PassengerId': p_id,
    'Survived': predicted_values,
})
submission.to_csv('titanic_submission.csv', index=False)
print(submission)

     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         1
5            897         0
6            898         1
7            899         0
8            900         1
9            901         0
10           902         0
11           903         0
12           904         1
13           905         0
14           906         1
15           907         1
16           908         0
17           909         0
18           910         1
19           911         0
20           912         0
21           913         0
22           914         1
23           915         1
24           916         1
25           917         0
26           918         1
27           919         0
28           920         0
29           921         0
..           ...       ...
388         1280         0
389         1281         0
390         1282         1
391         1283         1
392         1284         0
3