# **Predicting Fatal VIctims of the Titanic Sinking**

**>79% -> Better than 92% of submissions**

In [328]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

## Data Understanding

In [329]:
raw_train = pd.read_csv('./data/train.csv')
raw_test = pd.read_csv('./data/test.csv')

In [330]:
raw_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Data Preparation

In [331]:
X, y = raw_train.iloc[:, 2:], raw_train['Survived']

In [332]:
def dealWithCabin(df):
    
    dft = df.copy()
    dft['Numeration'] = dft['Cabin'].str.extract(r'([A-Za-z])(\d+)', expand=False)[1].astype(float)
    dft['Numeration'].fillna(0, inplace=True)
    
    dft['Sector'] = dft['Cabin'].str.extract(r'([A-Za-z])(\d+)', expand=False)[0]
    dft['Sector'].fillna('Z', inplace=True)
    dft['Sector'] = dft['Sector'].map({
        'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5,
        'F': 6, 'G': 7, 'T': 8, 'Z': 0
    }).astype(int)

    dft.drop('Cabin', axis=1, inplace=True)
    return dft

In [333]:
def categoricToNumeric(df):
    
    df = df.copy()

    # map categorical values to numeric on 'Embarke' column
    emb = {x: i for i, x in enumerate(df['Embarked'].unique())}
    df['Embarked'] = df['Embarked'].map(emb)

    # map categorical values to numeric on 'Sex' column
    sex = {df['Sex'].unique()[0]: 0, df['Sex'].unique()[1]: 1}
    df['Sex'] = df['Sex'].map(sex)
    
    # remove categorical parts from 'Ticket' column
    df['Ticket'] = df['Ticket'].str.extract('(\d+)', expand=False).astype(float)

    df = dealWithCabin(df)
    return df

In [334]:
def removeNaN(df):
    for column in df:
        if df[column].dtype == 'float64':
            df[column] = df[column].fillna(df[column].mean())
    df.drop(['Name'], axis=1, inplace=True, errors='ignore')
    return df

In [335]:
def normalize (df):
    for column in df:
        if df[column].dtype == 'float64':
            df[column] = (df[column] - df[column].min()) / (df[column].max() - df[column].min())
    return df 

In [336]:
# def a DataPreparation that assemble all the data preparation processes done to raw_train until now

def DataPreparation(df):
    x = df.copy()
    x = categoricToNumeric(x)
    x = removeNaN(x)
    x = normalize(x)
    return x

X_norm = DataPreparation(X)
X_norm.describe()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Numeration,Sector
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,2.308642,0.352413,0.367921,0.523008,0.381594,0.073467,0.062858,0.368126,0.076577,0.756453
std,0.836071,0.47799,0.163383,1.102743,0.806057,0.161648,0.096995,0.647601,0.181855,1.570855
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.271174,0.0,0.0,0.003802,0.01544,0.0,0.0,0.0
50%,3.0,0.0,0.367921,0.0,0.0,0.035601,0.028213,0.0,0.0,0.0
75%,3.0,1.0,0.434531,1.0,0.0,0.111907,0.060508,1.0,0.0,0.0
max,3.0,1.0,1.0,8.0,6.0,1.0,1.0,3.0,1.0,7.0


## Modelling

### Random Forest probably plateu, so lets Stack it

In [337]:
rf = RandomForestClassifier(
    n_estimators=50,
    max_depth=6,
    min_samples_split=5,
    min_samples_leaf=2,
)

In [338]:
results = rf.fit(X_norm, raw_train['Survived'])

In [339]:
# validate predictions of results against X_base and save the results to score variable
def validate_model(model, df):
    return model.score(df.drop('Survived', axis=1), df['Survived'])

X_base = X_norm.copy()
X_base['Survived'] = y
score = validate_model(results, X_base)

In [340]:
print(score)

0.8832772166105499


## Evaluation

In [341]:
X_test = raw_test.iloc[:, 1:]
X_test = DataPreparation(X_test)
X_test.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Numeration,Sector
0,3,0,0.452723,0,0,0.1067,0.015282,0,0.0,0
1,3,1,0.617566,1,0,0.117134,0.013663,1,0.0,0
2,2,0,0.815377,0,0,0.077475,0.018909,0,0.0,0
3,3,0,0.353818,0,0,0.101619,0.016908,1,0.0,0
4,3,1,0.287881,1,1,0.999995,0.023984,1,0.0,0


In [342]:
# predict the results of X_test and save it to predictions variable
predictions = results.predict(X_test)

# save predictions to a csv file
df_predictions = pd.DataFrame({'PassengerId': raw_test['PassengerId'], 'Survived': predictions.astype(int)})
df_predictions.to_csv('predictions.csv', index=False)