# Titanic Kaggle

In [134]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
  
def load_titanic_data(csv_path):
    """Read a Titanic CSV into a DataFrame.

    Parameters
    ----------
    csv_path
        Path (or any other source accepted by ``pd.read_csv``) of the CSV
        file; it is forwarded unchanged.

    Returns
    -------
    pandas.DataFrame
        The table parsed by ``pd.read_csv`` with its default options.
    """
    frame = pd.read_csv(csv_path)
    return frame

def split_train_test(data, test_ratio):
    """Deterministically split ``data`` into a train part and a test part.

    Row positions are shuffled with a fixed seed (42), so every call on an
    input of the same length selects the same positions. This is what allows
    features and labels to be split in two separate calls and still line up.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Any object supporting ``len()`` and positional ``.iloc`` indexing.
    test_ratio : float
        Fraction of rows assigned to the test part; the test size is
        ``int(len(data) * test_ratio)`` (truncated towards zero).

    Returns
    -------
    tuple
        ``(train, test)`` selections of ``data``, in shuffled order.
    """
    # A private RandomState(42) produces the same permutation as
    # np.random.seed(42) + np.random.permutation, but does not reset the
    # process-wide NumPy RNG that other code may rely on.
    rng = np.random.RandomState(42)
    shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [135]:
# Load the training set (it includes the "Survived" label) and print its
# schema: column dtypes and non-null counts.
train_data = load_titanic_data('datasets/titanic/train.csv')
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [136]:
# Load the test set; the submission file at the end of the notebook is
# predicted from it.
test_data = load_titanic_data('datasets/titanic/test.csv')

In [137]:
# "Survived" values of the female passengers only.
women = train_data.loc[train_data["Sex"] == 'female', "Survived"]
women

1      1
2      1
3      1
8      1
9      1
      ..
880    1
882    0
885    0
887    1
888    0
Name: Survived, Length: 314, dtype: int64

In [138]:
# Share of female passengers who survived. The value is a fraction in
# [0, 1] even though the printed label says "%".
rate_women = women.mean()
print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [139]:
# Same statistic for the male passengers (again a fraction, not a percent).
men = train_data.loc[train_data["Sex"] == 'male', "Survived"]
rate_men = men.mean()
print("% of men who survived:", rate_men)


% of men who survived: 0.18890814558058924


In [140]:
from sklearn.ensemble import RandomForestClassifier

# Target vector: the "Survived" column of the training set.
y = train_data["Survived"]

In [141]:
def toNum(e):
    """Count the whitespace-separated tokens in ``e``.

    Anything that is not a ``str`` (for example the NaN pandas uses for a
    missing value) yields 0; a string yields ``len(e.split())``.
    """
    if not isinstance(e, str):
        return 0
    return len(e.split())

def toNameLabel(e):
    """Return the first recognised honorific found in the name ``e``.

    Candidates are tested by plain substring search in a fixed priority
    order and the first hit wins. A name containing none of them maps to
    the empty string.
    """
    # Order is significant: it is the precedence used when a name happens to
    # contain more than one candidate. Rarer titles (Major., Don., Mlle.,
    # Countess., Dona.) are not distinguished and end up in the "" bucket.
    known_titles = ("Mr.", "Master.", "Rev.", "Dr.", "Mrs.", "Miss.")
    return next((title for title in known_titles if title in e), "")

def toMarried(e):
    """Flag names that contain a parenthesised part.

    Returns 1 when the pattern ``.*\\(.*\\).*`` matches from the start of
    ``e`` (an opening parenthesis followed later by a closing one), else 0.

    NOTE(review): presumably a proxy for married women, whose own name is
    given in parentheses in this dataset -- confirm before relying on it.
    """
    import re
    # Raw string: in a normal literal '\(' and '\)' are invalid escape
    # sequences, which Python reports with a warning. The compiled regex is
    # the same as before.
    p = re.compile(r'.*\(.*\).*')
    return 1 if p.match(e) else 0

In [142]:
# Derive model features, identically for the train and the test frame:
#   * "Cabin" is replaced by the number of whitespace-separated entries in
#     the original cabin string (0 when the value is missing), via toNum;
#   * "Title" is the honorific extracted from "Name", via toNameLabel.
# Earlier experiments (bucketing "Age" by decade, a "Married" flag built
# with toMarried) are not applied.
#
# NOTE: "Cabin" is overwritten in place. Re-running this cell without
# reloading the data turns every value into 0, because toNum returns 0 for
# anything that is not a string.
for frame in (train_data, test_data):
    frame["Cabin"] = [toNum(cabin) for cabin in frame["Cabin"]]
    frame["Title"] = [toNameLabel(name) for name in frame["Name"]]

In [143]:
# Columns fed to the model. get_dummies one-hot encodes the string-valued
# ones ("Title", "Embarked") and passes the numeric ones through.
features = ["Pclass", "SibSp", "Parch", "Age", "Fare", "Cabin", "Title", "Embarked"]
X = pd.get_dummies(train_data[features])

# The test set is encoded on its own, so a category present in only one of
# the two files would produce a different set of columns. Force the test
# matrix onto the training columns (same order; absent dummies become 0) so
# the fitted model always receives the layout it was trained on.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

In [144]:
# Peek at the first rows of the encoded training features.
X.head()

Unnamed: 0,Pclass,SibSp,Parch,Age,Fare,Cabin,Title_,Title_Dr.,Title_Master.,Title_Miss.,Title_Mr.,Title_Mrs.,Title_Rev.,Embarked_C,Embarked_Q,Embarked_S
0,3,1,0,22.0,7.25,0,0,0,0,0,1,0,0,0,0,1
1,1,1,0,38.0,71.2833,1,0,0,0,0,0,1,0,1,0,0
2,3,0,0,26.0,7.925,0,0,0,0,1,0,0,0,0,0,1
3,1,1,0,35.0,53.1,1,0,0,0,0,0,1,0,0,0,1
4,3,0,0,35.0,8.05,0,0,0,0,0,1,0,0,0,0,1


In [145]:
# Column dtypes and non-null counts after encoding (shows which columns
# still have missing values).
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 16 columns):
Pclass           891 non-null int64
SibSp            891 non-null int64
Parch            891 non-null int64
Age              714 non-null float64
Fare             891 non-null float64
Cabin            891 non-null int64
Title_           891 non-null uint8
Title_Dr.        891 non-null uint8
Title_Master.    891 non-null uint8
Title_Miss.      891 non-null uint8
Title_Mr.        891 non-null uint8
Title_Mrs.       891 non-null uint8
Title_Rev.       891 non-null uint8
Embarked_C       891 non-null uint8
Embarked_Q       891 non-null uint8
Embarked_S       891 non-null uint8
dtypes: float64(2), int64(4), uint8(10)
memory usage: 50.6 KB


In [146]:
# Same check for the encoded test features.
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 16 columns):
Pclass           418 non-null int64
SibSp            418 non-null int64
Parch            418 non-null int64
Age              332 non-null float64
Fare             417 non-null float64
Cabin            418 non-null int64
Title_           418 non-null uint8
Title_Dr.        418 non-null uint8
Title_Master.    418 non-null uint8
Title_Miss.      418 non-null uint8
Title_Mr.        418 non-null uint8
Title_Mrs.       418 non-null uint8
Title_Rev.       418 non-null uint8
Embarked_C       418 non-null uint8
Embarked_Q       418 non-null uint8
Embarked_S       418 non-null uint8
dtypes: float64(2), int64(4), uint8(10)
memory usage: 23.8 KB


In [147]:
from sklearn.impute import SimpleImputer

# TODO carried over from the original cell ("don't use it") -- presumably it
# refers to this imputation step; confirm with the author.

# Learn the per-column medians from the training features only and use them
# to fill the missing values there.
imputer = SimpleImputer(strategy="median")
X_ = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Reuse the training medians for the test set instead of re-fitting the
# imputer on it: the test rows must go through exactly the transformation
# the model was trained with.
X_test_ = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

# split_train_test uses a fixed seed, so splitting features and labels in
# two separate calls selects the same rows in both.
X1, X2 = split_train_test(X_, 0.2)
y1, y2 = split_train_test(y, 0.2)

In [148]:
from sklearn.model_selection import GridSearchCV

# A single-point "grid": every hyper-parameter lists exactly one candidate,
# so the search just cross-validates (cv=5) this one configuration.
param_grid = [
    dict(
        n_estimators=[100],
        max_depth=[5],
        min_samples_split=[2],
        min_samples_leaf=[1],
        max_leaf_nodes=[None],
        random_state=[1],
        criterion=['gini'],
    )
]

model = RandomForestClassifier()
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=0)

In [149]:
# Run the cross-validated search on the full imputed training set (X_, y),
# not on the X1/y1 part of the split.
grid_search.fit(X_, y)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [150]:
# First rows of the imputed training features (all columns are floats now).
X_.head()

Unnamed: 0,Pclass,SibSp,Parch,Age,Fare,Cabin,Title_,Title_Dr.,Title_Master.,Title_Miss.,Title_Mr.,Title_Mrs.,Title_Rev.,Embarked_C,Embarked_Q,Embarked_S
0,3.0,1.0,0.0,22.0,7.25,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,1.0,1.0,0.0,38.0,71.2833,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,3.0,0.0,0.0,26.0,7.925,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,1.0,0.0,35.0,53.1,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,3.0,0.0,0.0,35.0,8.05,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [151]:
# First rows of the imputed test features.
X_test_.head()

Unnamed: 0,Pclass,SibSp,Parch,Age,Fare,Cabin,Title_,Title_Dr.,Title_Master.,Title_Miss.,Title_Mr.,Title_Mrs.,Title_Rev.,Embarked_C,Embarked_Q,Embarked_S
0,3.0,0.0,0.0,34.5,7.8292,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,3.0,1.0,0.0,47.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,2.0,0.0,0.0,62.0,9.6875,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,3.0,0.0,0.0,27.0,8.6625,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,3.0,1.0,1.0,22.0,12.2875,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [152]:
# NOTE(review): X2 was sliced out of X_, the very data grid_search was
# fitted on, so these are in-sample predictions rather than a hold-out check.
y2_pred = grid_search.predict(X2)

In [153]:
from sklearn.metrics import accuracy_score
# Accuracy of the predictions on the X2/y2 slice.
accuracy_score(y2, y2_pred)


0.8258426966292135

In [154]:
# Cross-check through the search object's own score method on the same
# slice; the recorded output equals the accuracy_score result.
grid_search.score(X2, y2)

0.8258426966292135

In [155]:
# Score on the full training set the search was fitted on, i.e. a training
# score rather than an estimate of generalisation.
grid_search.score(X_, y)

0.8496071829405163

In [156]:
# Fit a standalone forest with the same n_estimators / max_depth /
# random_state as the single grid point and score it on its own training
# data; the recorded output equals the grid_search training score.
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
model.fit(X_, y)
model.score(X_, y)

0.8496071829405163

In [157]:
# Predict the test set with the fitted search object and write a two-column
# CSV (PassengerId, Survived) without the index.
predictions = grid_search.predict(X_test_)
output = pd.DataFrame(
    {
        'PassengerId': test_data['PassengerId'],
        'Survived': predictions,
    }
)
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [158]:
# Hyper-parameters selected by the search (the only candidate in the grid).
grid_search.best_params_



{'criterion': 'gini',
 'max_depth': 5,
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 100,
 'random_state': 1}