In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
mapping = {'Don': 'RoyaltyM', 'Mme': 'Miss', 'Ms': 'Miss',
    'Major': 'Officer', 'Lady': 'RoyaltyF', 'Sir': 'RoyaltyM', 
    'Mlle': 'Miss', 'Col': 'Officer', 'Capt': 'Officer',
    'Countess': 'RoyaltyF', 'Jonkheer': 'RoyaltyM', 'Dona': 'RoyaltyF'}

In [3]:
train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')

In [4]:
def populate_embarked(df):
    embarked = df.groupby('Embarked').count()['PassengerId']
    embarked_max = embarked[embarked == embarked.max()].index[0]
    df.loc[df['Embarked'].isnull(), 'Embarked'] = embarked_max
    return df

In [5]:
def set_titles(df, mapping):
    df['Title'] = df['Name'].str.extract('([A-Za-z]+)\.')
    df.replace({'Title': mapping}, inplace=True)
    return df

In [6]:
def populate_age(df):
    age_med = df.groupby('Title')['Age'].median()
    for title, age in age_med.iteritems():
        filter_age = df['Age'].isnull()
        filter_title = df['Title'] == title
        df.loc[filter_age & filter_title, 'Age'] = age
    return df

In [7]:
train = populate_embarked(train)
train = set_titles(train, mapping)
train = populate_age(train)

In [8]:
test = populate_embarked(test)
test = set_titles(test, mapping)
test = populate_age(test)

In [9]:
test['Fare'][test['Fare'].isnull()] = test['Fare'].median()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [10]:
train = train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [12]:
result = pd.DataFrame(test['PassengerId'])
test = test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [13]:
le = LabelEncoder()
classes = {}

In [14]:
le.fit(train['Sex'])
classes['Sex'] = le.classes_
train['Sex'] = le.transform(train['Sex'])

le.fit(train['Embarked'])
classes['Embarked'] = le.classes_
train['Embarked'] = le.transform(train['Embarked'])

le.fit(train['Title'])
classes['Title'] = le.classes_
train['Title'] = le.transform(train['Title'])

In [15]:
le.fit(classes['Sex'])
test['Sex'] = le.transform(test['Sex'])

le.fit(classes['Embarked'])
test['Embarked'] = le.transform(test['Embarked'])

le.fit(classes['Title'])
test['Title'] = le.transform(test['Title'])

In [17]:
target = train['Survived']
train = train.drop(['Survived'], axis=1)

In [18]:
model_rf = RandomForestClassifier(n_estimators=100)
model_kn = KNeighborsClassifier(n_neighbors=25)

In [19]:
scores = cross_val_score(model_rf, train, target, cv=5)
print(scores)
print(scores.mean())

[0.77094972 0.79329609 0.84269663 0.75280899 0.83615819]
0.7991819240247579


In [20]:
scores = cross_val_score(model_kn, train, target, cv=5)
print(scores)
print(scores.mean())

[0.6424581  0.72625698 0.71910112 0.73595506 0.75141243]
0.715036738590539


In [21]:
model_rf = model_rf.fit(train, target)

In [22]:
predictions = model_rf.predict(test)

In [27]:
result.insert(1, 'Survived', predictions)

In [29]:
result.to_csv('titanic/result.csv', index=False)