# Loading data from Kaggle

In [299]:
import numpy as np
import pandas as pd
import os
# Lazily enumerate the full path of every file Kaggle mounted under the
# input directory, then print them one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Read the Titanic training and test sets into DataFrames.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/gender_submission.csv
/kaggle/input/titanic/test.csv


# Filtering data:
I want to split the data into separate tables according to which values are missing, to get different sets to learn from.
I chose to use the columns Pclass, Sex, Age, Embarked and Cabin.

In [301]:
def _presence_masks(frame, columns=("Age", "Cabin", "Embarked")):
    """Map each column name to a boolean Series that is True where the value is not null."""
    return {column: pd.notnull(frame[column]) for column in columns}


def _filter_by_presence(frame, masks):
    """Return the four row subsets of ``frame`` selected by the presence masks.

    The order of the returned list is significant:

    0. rows with Age and Cabin present
    1. rows with Cabin and Embarked present
    2. rows with Age and Embarked present
    3. rows with Age, Cabin and Embarked all present
    """
    has_age = masks["Age"]
    has_cabin = masks["Cabin"]
    has_embarked = masks["Embarked"]
    return [
        frame[has_age & has_cabin],
        frame[has_cabin & has_embarked],
        frame[has_age & has_embarked],
        frame[has_age & has_cabin & has_embarked],
    ]


# Per-column "value is present" masks for the training and the test set.
fcols = _presence_masks(train_data)
fTcols = _presence_masks(test_data)

# The same four subsets are built for both sets, so that entry i of one list
# corresponds to entry i of the other.
trainFiltered = _filter_by_presence(train_data, fcols)
trainTestFiltered = _filter_by_presence(test_data, fTcols)

# Generating Predictions
For each group filtered earlier (plus the Pclass and Sex columns), I train a separate model to make predictions. Creating more accurate groups allows the trees to be deepened.

In [302]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
# With handle_unknown='ignore', categories that were not seen during fit are
# encoded as all zeros instead of raising. Every column below is passed
# through this encoder, so Age is one-hot encoded as well.
ohe = OneHotEncoder(handle_unknown='ignore')

# Feature columns for each model; entry i is used with trainFiltered[i] and
# trainTestFiltered[i].
dataSets = [
    ["Pclass", "Sex", "Age", "Cabin"],
    ["Pclass", "Sex", "Cabin", "Embarked"],
    ["Pclass", "Sex", "Age", "Embarked"],
    ["Pclass", "Sex", "Age", "Cabin", "Embarked"],
]


def _as_votes(passenger_ids, predictions):
    """Build a PassengerId/Survived frame in which a predicted 0 is recoded as -1.

    Recoding lets the per-filter predictions be summed per passenger later on:
    each model then contributes +1 (survived) or -1 (did not survive).
    """
    votes = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': predictions})
    votes.loc[votes['Survived'] == 0, 'Survived'] = -1
    return votes


resultData = []
resultTestData = []

for i, dataSet in enumerate(dataSets):
    train_subset = trainFiltered[i]
    test_subset = trainTestFiltered[i]
    y = train_subset["Survived"]

    # The encoder is refitted for every feature set, on training rows only.
    X = ohe.fit_transform(train_subset[dataSet])
    X_test = ohe.transform(test_subset[dataSet])

    model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=1)
    model.fit(X, y)

    # Predict each set exactly once and reuse the result.
    train_pred = model.predict(X)
    test_pred = model.predict(X_test)

    # NOTE: this accuracy is measured on the same rows the model was fitted on.
    print("Filter accuracy:", accuracy_score(y, train_pred))

    resultData.append(_as_votes(train_subset["PassengerId"], train_pred))
    resultTestData.append(_as_votes(test_subset["PassengerId"], test_pred))

# Baseline frames with a vote of 0 for every passenger, kept as the last list
# entry. They guarantee that passengers who fall into none of the filtered
# subsets still appear once the votes are aggregated.
resultData.append(pd.DataFrame({'PassengerId': train_data["PassengerId"], 'Survived': 0}))
resultTestData.append(pd.DataFrame({'PassengerId': test_data["PassengerId"], 'Survived': 0}))
    

Filter accuracy: 0.9513513513513514
Filter accuracy: 0.9207920792079208
Filter accuracy: 0.848314606741573
Filter accuracy: 0.9617486338797814


# Final Result
The predictions from all filters are merged and summed per passenger; the sign of the resulting sum decides the final prediction (a positive sum means the passenger is predicted to survive).

In [303]:
def _majority_vote(vote_frames):
    """Sum the Survived votes per PassengerId and threshold them to 0/1.

    A passenger is predicted to survive (1) only when the summed votes are
    strictly positive. A sum of zero resolves to 0; this covers ties as well
    as passengers whose only entry is a frame with a vote of 0.

    Returns a DataFrame with the columns PassengerId and Survived.
    """
    totals = pd.concat(vote_frames).groupby(['PassengerId'], as_index=False)['Survived'].sum()
    totals['Survived'] = (totals['Survived'] > 0).astype('int64')
    return totals


finalResult = _majority_vote(resultData)

# Align predictions with the true labels by PassengerId rather than by row
# position, so the score does not depend on the row order of train_data.
scored = train_data[["PassengerId", "Survived"]].merge(
    finalResult,
    on="PassengerId",
    suffixes=("_true", "_pred"),
    validate="one_to_one",
)
print("Train data accuracy:", accuracy_score(scored["Survived_true"], scored["Survived_pred"]))

# The aggregate is stored under its own name instead of overwriting the
# resultTestData list, so this cell can be re-run without retraining.
finalTestResult = _majority_vote(resultTestData)

finalTestResult.to_csv('my_submission.csv', index=False)
print("Saved")

Train data accuracy: 0.8597081930415263
Saved


# Summary
The overall accuracy of this code was scored at 0.76794, which means that the outcome was predicted correctly for roughly three quarters of the test passengers.
Splitting the data before training improved the score by 0.04. I don't think this is the ideal way to solve the problem, but for less important predictions it is good enough.