In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [2]:
mapping = {'Don': 'Aristocratic', 'Mme': 'Miss', 'Ms': 'Miss', 'Dr': 'Aristocratic',
    'Major': 'Aristocratic', 'Lady': 'Aristocratic', 'Sir': 'Aristocratic', 
    'Mlle': 'Miss', 'Col': 'Aristocratic', 'Capt': 'Aristocratic', 'Rev': 'Aristocratic',
    'Countess': 'Aristocratic', 'Jonkheer': 'Aristocratic', 'Dona': 'Aristocratic'}

In [3]:
train = pd.read_csv('titanic/train.csv')
test = pd.read_csv('titanic/test.csv')

In [4]:
def populate_embarked(df):
    embarked = df.groupby('Embarked').count()['PassengerId']
    embarked_max = embarked[embarked == embarked.max()].index[0]
    df.loc[df['Embarked'].isnull(), 'Embarked'] = embarked_max
    return df

In [5]:
def set_titles(df, mapping):
    df['Title'] = df['Name'].str.extract('([A-Za-z]+)\.')
    df.replace({'Title': mapping}, inplace=True)
    return df

In [6]:
def populate_age(df):
    age_med = df.groupby('Title')['Age'].median()
    for title, age in age_med.iteritems():
        filter_age = df['Age'].isnull()
        filter_title = df['Title'] == title
        df.loc[filter_age & filter_title, 'Age'] = age
    return df

In [7]:
def populate_fare(df):
    for pclass, fare in df.groupby('Pclass')['Fare'].median().iteritems():
        filter_fare = df['Fare'] == 0
        filter_pclass = df['Pclass'] == pclass
        df.loc[filter_fare & filter_pclass, 'Fare'] = fare
    return df

In [8]:
def set_family(df):
    df['Family'] = df['SibSp'] + df['Parch']
    return df

In [9]:
def set_iscabin(df):
    df['isCabin'] = np.where(df['Cabin'].notna(), 1, 0)
    return df

In [11]:
train = populate_embarked(train)
train = set_titles(train, mapping)
train = populate_age(train)
train = populate_fare(train)
train = set_family(train)
train = set_iscabin(train)

In [12]:
test = populate_embarked(test)
test = set_titles(test, mapping)
test = populate_age(test)
test = populate_fare(test)
test = set_family(test)
test = set_iscabin(test)

In [13]:
test['Fare'][test['Fare'].isnull()] = test['Fare'].median()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
train = train.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [15]:
result = pd.DataFrame(test['PassengerId'])
test = test.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)

In [16]:
le = LabelEncoder()
classes = {}

In [17]:
le.fit(train['Sex'])
classes['Sex'] = le.classes_
train['Sex'] = le.transform(train['Sex'])

le.fit(train['Embarked'])
classes['Embarked'] = le.classes_
train['Embarked'] = le.transform(train['Embarked'])

le.fit(train['Title'])
classes['Title'] = le.classes_
train['Title'] = le.transform(train['Title'])

In [18]:
le.fit(classes['Sex'])
test['Sex'] = le.transform(test['Sex'])

le.fit(classes['Embarked'])
test['Embarked'] = le.transform(test['Embarked'])

le.fit(classes['Title'])
test['Title'] = le.transform(test['Title'])

In [19]:
target = train['Survived']
train = train.drop(['Survived'], axis=1)

In [20]:
model_rf = RandomForestClassifier(n_estimators=100)

In [21]:
scores = cross_val_score(model_rf, train, target, cv=5)
print(scores)
print(scores.mean())

[0.83798883 0.7877095  0.83146067 0.76966292 0.84180791]
0.8137259658264968


In [22]:
model_rf = model_rf.fit(train, target)

In [23]:
predictions = model_rf.predict(test)

In [24]:
result.insert(1, 'Survived', predictions)

In [25]:
result.to_csv('titanic/result4.csv', index=False)