In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import tree
import kaggle
import seaborn as sns
from cobratools import Analysis

In [None]:
#!kaggle competitions download -c titanic # api copied from kaggle
#!unzip titanic.zip

# Explore data

## Describe

In [None]:
# Load data
data = pd.read_csv("train.csv")

# Instanciate analysis object
Analyze = Analysis(data)

# Describe
Analyze.describe(investigation_level=3)

## Visualize

In [None]:
Analyze.visualize()

## Statistics

In [None]:
print("How many died (0)/ survived (1)?\n{}\n\n"
      .format(data['Survived'].value_counts()))

print("\nHow many survived for each Pclass?\n{}\n\n"
      .format(data['Survived'].groupby(data['Pclass']).sum()))

print("\nHow many people are there in each Pclass?\n{}\n\n"
      .format(data['Pclass'].value_counts()))

print("\nWhich proportion survived within each Pclass? (p1_survived/p1_passengers)\n{}\n\n"
      .format(data['Survived'].groupby(data['Pclass']).sum() / data['Pclass'].value_counts()))

print("Within the survivors, what is the proportion of females/males? (female_survived/n_passengers)\n{}\n\n"
      .format(data['Survived'].groupby(data['Sex']).sum()/n_passengers))

print("\nWhich proportion of females/males survived? (female_survived/n_females)\n{}"
      .format(data['Survived'].groupby(data['Sex']).sum() / data['Sex'].value_counts()))

### Analyse Sex

In [None]:
# Create Survived masks
survived = data['Survived'] == 1
died = data['Survived'] == 0

# Create Sex masks
females = data['Sex'] == 'female'
males = data['Sex'] == 'male'

# Init subplots
fig, axes = plt.subplots(nrows=1, ncols=3, sharex=True, sharey=True)
plt.subplots_adjust(right=1.5, bottom=0.5)
# Subplots survived by Sex
pd.get_dummies(data['Sex']).sum().plot.bar(ax=axes[0], title='Number of passengers by Sex')
pd.get_dummies(data[survived]['Sex']).sum().plot.bar(ax=axes[1], title="Number of survivors by Sex")
pd.get_dummies(data[died]['Sex']).sum().plot.bar(ax=axes[2], title="Number of deaths by Sex")


### Analyse Age Distribution

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=3, sharey=True, sharex=True)
plt.subplots_adjust(wspace=0, right=1.5)

# Distribution age for all passengers
data['Age'].plot.hist(ax=axes[0], title="All passengers", bins=20)

# Distribution for dead by age
data[data["Survived"] == 0]['Age'].plot.hist(ax=axes[1], title="Died", bins=20)

# Distribution for "survived" by age
data[data["Survived"] == 1]['Age'].plot.hist(ax=axes[2], title="Survived", bins=20)
plt.xlabel("Age")

for i in range(axes.shape[0]):
    axes[i].set_xlabel("Age")
    axes[i].set_ylabel("N passengers")

### Analyse Age Distribution by Sex

In [None]:
# Compare female vs male age impact on survival prob

fig, axes = plt.subplots(nrows=2, ncols=3, sharey=True, sharex=True)
plt.subplots_adjust(wspace=0, right=1.5, bottom=-0.5)

# Distribution age by Sex
data[females]['Age'].plot.hist(ax=axes[0, 0], title="Females All", bins=20)
data[males]['Age'].plot.hist(ax=axes[1, 0], title="Males All", bins=20)

# Distribution age females (1 died, 2 survived)
data[females & died]['Age'].plot.hist(ax=axes[0, 1], title="Females Died", bins=20)
data[females & survived]['Age'].plot.hist(ax=axes[0, 2], title="Females Survived", bins=20)

# Distribution age males (1 died, 2 survived)
data[males & died]['Age'].plot.hist(ax=axes[1, 1], title="Males Died", bins=20)
data[males & survived]['Age'].plot.hist(ax=axes[1, 2], title="Males Survived", bins=20)

for i in range(axes.shape[0]):
    for j in range(axes.shape[1]):
        axes[i, j].set_xlabel("Age")
        axes[i, j].set_ylabel("N passengers")

### Analyse Survived - Sex - Fare

In [None]:
# Define Grid
fig, axes = plt.subplots(nrows=1, ncols=2, sharex=True, sharey=True, figsize=(8, 4))

# Labels
fig.suptitle("Distribution of fare prices by sex for survivals and deaths", x=0.5, y=1.05)
axes[0].set_title("Females")
axes[1].set_title("Males")
fig.text(0.5, 0.01, 'common X', ha='center')
fig.text(0.04, 0.5, 'common Y', va='center', rotation='vertical')

# Figures
data[females & survived]['Fare'].hist(ax=axes[0])
data[females & died]['Fare'].hist(ax=axes[0])
data[males & survived]['Fare'].hist(ax=axes[1])
data[males & died]['Fare'].hist(ax=axes[1])
plt.show()

### Analyse SibSp

In [None]:
# Spouse and Children by Sex
print(data['SibSp'].groupby(data['Sex']).sum())
print(data['SibSp'].groupby(data['Sex']).mean())
print(data['SibSp'].groupby(data['Sex']).max())

fig, axes = plt.subplots(nrows=3, ncols=1, sharex=False)
plt.subplots_adjust(bottom=-0.5)
data['SibSp'].groupby(data['Sex']).sum().plot.bar(ax=axes[0], title="Total SibSp by Sex")
data['SibSp'].groupby(data['Sex']).mean().plot.bar(ax=axes[1], title="Average SibSp by Sex")
data['SibSp'].groupby(data['Sex']).max().plot.bar(ax=axes[2], title="Max SibSp by Sex")

### Parch links

In [None]:
# Parch by Sex
print(data['Parch'].groupby(data['Sex']).sum())
print(data['Parch'].groupby(data['Sex']).mean())
print(data['Parch'].groupby(data['Sex']).max())

fig, axes = plt.subplots(nrows=3, ncols=1, sharex=False)
plt.subplots_adjust(bottom=-0.5)
data['Parch'].groupby(data['Sex']).sum().plot.bar(ax=axes[0], title="Total Parch by Sex")
data['Parch'].groupby(data['Sex']).mean().plot.bar(ax=axes[1], title="Average Parch by Sex")
data['Parch'].groupby(data['Sex']).max().plot.bar(ax=axes[2], title="Max Parch by Sex")

### Study impact of PassengerId on the chances of survival

In [None]:
data_explore = data[['Survived', 'PassengerId']].copy()

# EVEN ID: See if even Ids have greater chances of survival
# Replace
data_explore['PassengerId'] = [1 if PassId % 2 == 0 else 1 for PassId in data_explore['PassengerId']]
# Counts
counts_survived_by_id = data_explore['Survived'].groupby(data_explore['PassengerId']).sum()

PassengerIdChances = data_explore['Survived'].groupby(data_explore['PassengerId']).sum() / data_explore['PassengerId'].value_counts()
print("Pair Ids have {evenId:.2%} chances of survival, "
      "against {unevenId:.2%} for uneven Ids\n".format(evenId=PassengerIdChances[0], unevenId=PassengerIdChances[1]))


# ID POSITION: See if passengers over half Id have greater chances
id_last = data['PassengerId'].iloc[-1]

list_split_id_greater_than_half = [0 if PassId <= id_last / 2 else 1 for PassId in data['PassengerId']]
data_explore['PassengerId'] = list_split_id_greater_than_half
chances_id_half = data_explore['Survived'].groupby(data_explore['PassengerId']).sum() / data_explore['PassengerId'].value_counts()[0]
print("Passengers with id greater than half has {id_greater_half:.2%} chances of survival against {id_lower_half:.2%} for id lower than half\n".format(id_greater_half=chances_id_half[1], id_lower_half=chances_id_half[0]))

## Output format required

In [None]:
submission = pd.read_csv("gender_submission.csv")
submission.head()

# Preprocess data

## Prepare and clean data

### Transform categorical data with One-hot encoding

In [217]:
# Work on a df copy
df = data.copy()

# Replace binary class text variables by bool
binary_cat_int = {'Sex': {'female':0, 'male':1}}
df.replace(binary_cat_int, inplace=True)

# Replace categorical data by dummy variables
df.drop(columns=['Embarked'], inplace=True)
df = pd.concat([df, pd.get_dummies(data['Embarked'], prefix='embarked')], axis=1)

### Impute missing values

In [None]:
# Age


### Build Features

In [218]:
# Add a feature "Name_length" (useless)
df['Name_length'] = df['Name'].apply(lambda x: len(x))

# Add feature boolean Age_missing (useless)
df['Age_missing'] = df['Age'].apply(lambda x: 1 if np.isnan(x) else 0)

In [219]:
#data["OHC_Code"] = np.where(obj_df["engine_type"].str.contains("ohc"), 1, other=0)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,embarked_C,embarked_Q,embarked_S,Name_length,Age_missing
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,0,0,1,23,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1,0,0,51,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,0,0,1,22,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,0,0,1,44,0
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,0,0,1,24,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,0,0,1,21,0
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,0,0,1,28,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,,1,2,W./C. 6607,23.4500,,0,0,1,40,1
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,1,0,0,21,0


### Clean data

In [220]:
# Remove NA, inf
columns_selected = ['Sex', 'Pclass', 'embarked_C', 'embarked_Q', 'embarked_S', 'Name_length']
df[columns_selected].isnull().sum()

Sex            0
Pclass         0
embarked_C     0
embarked_Q     0
embarked_S     0
Name_length    0
dtype: int64

### Split X/Y

In [221]:
X = df[columns_selected].copy()
Y = df['Survived'].copy()

In [222]:
# Transform to np.arrays for tree method
if X.shape[1] > 1:
    X = np.array(X)
    Y = np.array(Y)
else:
    # Adapt shape for 1 feature only
    X = X.reshape(-1,1)
    Y = Y.reshape(-1,1)

### Split train/test sets

In [223]:
# Define proportion train/test
proportion = .5
size_train = round(X.shape[0] * proportion)

# Train set
X_train = X[:size_train]
Y_train = Y[:size_train]

# Test set
X_test = X[size_train:]
Y_test = Y[size_train:]

# Prediction

## Benchmark

In [224]:
# Accuracy while always predicting 0 (dead)
# Train set
n_passengers_train = X_train.shape[0]
n_survived_train = Y_train.sum()
n_died_train = n_passengers - n_survived
accuracy_train_benchmark = n_died_train / n_passengers_train

# Train set
n_passengers_test = X_test.shape[0]
n_survived_test = Y_test.sum()
n_died_test = n_passengers - n_survived
accuracy_test_benchmark = n_died_test / n_passengers_test

# Display benchmark accuracy
print('{:.2%}'.format(accuracy_train_benchmark))
print('{:.2%}'.format(accuracy_test_benchmark))

60.76%
60.90%


## Train Tree model

In [225]:
clf = tree.DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=None,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    class_weight=None,
    presort='deprecated',
    ccp_alpha=0.0)

clf = clf.fit(X_train, Y_train)

## Test Tree model

### Evaluate accuracy on train and test sets

In [226]:
#clf.score(arr_X_test_reshaped, arr_Y_test_reshaped)
print("[benchmark] Accuracy train: {:.2f}%".format(accuracy_train_benchmark*100))
print("[tree]      Accuracy train: {:.2f}%".format(clf.score(X_train, Y_train)*100))
print("\n[benchmark] Accuracy test: {:.2f}%".format(accuracy_test_benchmark*100))
print("[tree]      Accuracy test: {:.2f}%".format(clf.score(X_test, Y_test)*100))

[benchmark] Accuracy train: 60.76%
[tree]      Accuracy train: 89.69%

[benchmark] Accuracy test: 60.90%
[tree]      Accuracy test: 77.08%


### Predict output for specific cases

In [227]:
print("Dead=0 / Survived=1\n")
print(*zip(range(len(columns_selected)), columns_selected))
for i_passenger in range(10):
    print("Passenger data: {} predicted: {} truth: {}".format(X_test[i_passenger], clf.predict([X_test[i_passenger]])[0], Y_test[i_passenger]))

Dead=0 / Survived=1

(0, 'Sex') (1, 'Pclass') (2, 'embarked_C') (3, 'embarked_Q') (4, 'embarked_S') (5, 'Name_length')
Passenger data: [ 0  2  0  0  1 33] predicted: 1 truth: 1
Passenger data: [ 1  1  0  0  1 27] predicted: 0 truth: 1
Passenger data: [ 0  3  1  0  0 30] predicted: 1 truth: 1
Passenger data: [ 1  1  0  0  1 30] predicted: 0 truth: 1
Passenger data: [ 1  2  0  0  1 21] predicted: 0 truth: 0
Passenger data: [ 1  3  0  0  1 31] predicted: 0 truth: 0
Passenger data: [ 1  1  1  0  0 31] predicted: 1 truth: 0
Passenger data: [ 1  1  1  0  0 24] predicted: 0 truth: 1
Passenger data: [ 1  3  0  0  1 19] predicted: 0 truth: 0
Passenger data: [ 1  3  1  0  0 18] predicted: 0 truth: 1


### Visual representation of algo rules

In [None]:
fn = ['Pclass', 'Female', 'Male']
cn = ['Dead', 'Survived']

tree.plot_tree(clf,
               feature_names = fn, 
               class_names=cn,
               filled = True)