In [1]:
import numpy as np
import sklearn as sk
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Seed passed as `random_state` to the train/validation/test splits so they are reproducible.
rd_state = 666

In [3]:
# Load the labelled data and the unlabelled challenge set from the working directory.
all_data = pd.read_csv("train.csv")
challenge_input_ori = pd.read_csv("test.csv")

# Separate the 'Survived' target from the feature columns.
all_labels = all_data['Survived'].copy()
all_input_ori = all_data.drop(columns='Survived')

# Extract a validation set (15% of all rows)
train_input_ori, validate_input_ori, train_labels, validate_labels = train_test_split(
    all_input_ori,
    all_labels,
    test_size=0.15,
    random_state=rd_state,
)

# Extract a testing set (18% of what is left after the validation split)
train_input_ori, test_input_ori, train_labels, test_labels = train_test_split(
    train_input_ori,
    train_labels,
    test_size=0.18,
    random_state=rd_state,
)
#train_input_ori, test_input_ori, train_labels, test_labels = train_test_split(all_input_ori, all_labels, test_size=0.1, random_state=rd_state)

# Training features and label side by side, for exploration and plotting.
train_both_ori = train_input_ori.assign(Survived=train_labels)

# Sanity check: print the shape of each frame.
for template, frame in (
    ("All data:  %s", all_data),
    ("All input: %s", all_input_ori),
    ("Training:  %s", train_input_ori),
    ("Test set:  %s", test_input_ori),
    #("Validate:  %s", validate_input_ori),
):
    print(template % (frame.shape,))

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
# First rows of the training frame, then summary statistics of the training and test frames.
display(
    train_both_ori.head(),
    train_both_ori.describe(),
    test_input_ori.describe(),
)

In [None]:
# Render matplotlib figures inside the notebook output.
%matplotlib inline
# Histograms (50 bins each) of the training frame's columns on one large figure;
# the trailing ';' keeps the returned axes array out of the cell output.
train_both_ori.hist(bins=50, figsize=(20,15));

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Options shared by both one-hot encoders: unknown values seen at transform time
# are routed to the infrequent bucket when the encoder has one.
_encoder_options = dict(
    handle_unknown='infrequent_if_exist',
    min_frequency=1,
)

# Encoder for the honorific extracted from the passenger name.
title_encoder = OneHotEncoder(
    categories=[[
        "Capt", "Col", "Dr", "Lady", "Major", "Master", "Miss",
        "Mlle", "Mme", "Mr", "Mrs", "Ms", "Rev", "the Countess",
    ]],
    **_encoder_options,
)

# Encoder for the port of embarkation.
embarked_encoder = OneHotEncoder(
    categories=[["S", "C", "Q"]],
    **_encoder_options,
)

# Fit both encoders on every labelled row. The title is the text between the
# first ', ' and the following '.' of the 'Name' column.
all_input_copy = all_input_ori.copy()
_after_comma = all_input_copy['Name'].str.split(', ', n=1, expand=True)[1]
all_input_copy['Title'] = _after_comma.str.split('.', n=1, expand=True)[0]

# The transformed output is discarded; these calls are made for the fitting side effect.
title_encoder.fit_transform(all_input_copy[["Title"]])
embarked_encoder.fit_transform(all_input_copy[["Embarked"]])

def enrich_input(some_input_ori):
    """Return a copy of a passenger frame with engineered, numeric-only features.

    The input frame is left untouched. The copy gains a 'Male' flag, age
    buckets of width 2, 5 and 10, a family-size count, an 'Estimated_Age'
    flag, and one-hot columns for the title and the port of embarkation
    (produced by the already-fitted `title_encoder` and `embarked_encoder`).
    The 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked' and 'Title' columns are
    dropped; any other column (e.g. 'Survived') is passed through unchanged.
    """
    enriched = some_input_ori.copy()
    age = enriched['Age']

    # Sex as a 0.0 / 1.0 flag
    enriched["Male"] = (enriched["Sex"] == "male") * 1.0

    # Age rounded down to the lower bound of its 2-, 5- and 10-year bucket
    enriched['Age_2'] = (age // 2) * 2
    enriched['Age_5'] = (age // 5) * 5
    enriched['Age_10'] = (age // 10) * 10

    # Family size: siblings/spouses plus parents/children aboard
    enriched['FSize'] = enriched['SibSp'] + enriched['Parch']

    # Embarked as a number (Southampton (1) is the departure, then Cherbourg (2) then Queenstown (3))
    #enriched["Embarked_N"] = (enriched["Embarked"] == "Q") * 3.0 + \
    #                         (enriched["Embarked"] == "C") * 2.0 + \
    #                         (enriched["Embarked"] == "S") * 1.0

    # Flag ages above 1 whose first decimal is .5
    # (presumably how the dataset marks estimated ages -- TODO confirm).
    above_one = age > 1
    ends_in_half = (10 * age) % 10 == 5
    enriched["Estimated_Age"] = 1.0 * above_one * ends_in_half

    # Extract the title (text between the first ', ' and the next '.') and one-hot encode it
    after_comma = enriched['Name'].str.split(', ', n=1, expand=True)[1]
    enriched['Title'] = after_comma.str.split('.', n=1, expand=True)[0]
    enriched[title_encoder.get_feature_names_out()] = \
        title_encoder.transform(enriched[['Title']]).toarray()

    # One-hot encode the port of embarkation
    enriched[embarked_encoder.get_feature_names_out()] = \
        embarked_encoder.transform(enriched[['Embarked']]).toarray()

    # Drop non-numerical data
    return enriched.drop(columns=['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Title'])

In [None]:
# Apply the same feature engineering to every split.
(
    all_input,
    train_input,
    train_both,
    test_input,
    validate_input,
    challenge_input,
) = (
    enrich_input(original)
    for original in (
        all_input_ori,
        train_input_ori,
        train_both_ori,
        test_input_ori,
        validate_input_ori,
        challenge_input_ori,
    )
)

# Sanity check: the originals should keep their shapes, the enriched frames gain columns.
print(" === Originals (unmodified)? ===")
for template, frame in (
    ("All data:      %s", all_data),
    ("All input:     %s", all_input_ori),
    ("Training:      %s", train_input_ori),
    ("Train both:    %s", train_both_ori),
    ("Test set:      %s", test_input_ori),
    ("Validate:      %s", validate_input_ori),
    ("Challenge:     %s", challenge_input_ori),
):
    print(template % (frame.shape,))

print(" === Enriched ===")
for template, frame in (
    ("All input:     %s", all_input),
    ("Training:      %s", train_input),
    ("Train both:    %s", train_both),
    ("Test set:      %s", test_input),
    ("Validate:      %s", validate_input),
    ("Challenge:     %s", challenge_input),
):
    print(template % (frame.shape,))


In [None]:
# Pairwise correlations between the numeric columns of the enriched training frame.
correlations = train_both.corr(numeric_only=True)
display(correlations)

# Age against fare, coloured by survival.
# colormaps: https://matplotlib.org/stable/gallery/color/colormap_reference.html
train_both.plot.scatter(x="Age", y="Fare", c="Survived", colormap="RdYlBu")

# --- Experiments kept for reference (not executed) ---

#df = pd.crosstab(pd.cut(train_both['Pclass'], 3), train_both['Male'])

#df = train_both[['Pclass','Male','Survived', 'PassengerId']] \
#          .groupby(['Pclass','Male','Survived']) \
#          .count().reset_index() \
#          .pivot(columns='Survived', index=['Pclass','Male'])
#display(df)
#df.plot.bar(stacked=True, color=['r', 'b'])

# Doesn't quite work
#df = train_both[['Pclass','Male','PassengerId']]
#display(df)
#df.plot.hist(by=['Pclass','Male'], stacked=True, color=['r','b'])

#pd.plotting.scatter_matrix(train_both, figsize=(12, 8))

In [None]:
def hist_by(column):
    """Plot a stacked bar chart of passenger counts per value of `column`.

    Rows of the notebook-level `train_both` frame are counted (via
    'PassengerId') for every (`column`, 'Survived') pair, then pivoted so each
    'Survived' value becomes one stacked series; the series are coloured red
    and blue in column order.
    """
    subset = train_both[[column, 'Survived', 'PassengerId']]
    counts = subset.groupby([column, 'Survived']).count().reset_index()
    per_outcome = counts.pivot(columns='Survived', index=[column])
    per_outcome.plot.bar(stacked=True, color=['r', 'b'])

# One survival-split bar chart per feature of interest.
for feature in (
    'Pclass',
    'Male',
    'Age',
    'Age_2',
    'Age_5',
    'Age_10',
    'SibSp',
    'Parch',
    'Estimated_Age',
):
    hist_by(feature)

# TODO: hist_by one hot encoded stuff
#hist_by('Title')
#hist_by('Embarked')


In [None]:
# The three splits must not share passengers: each displayed set should be empty.
train_ids = set(train_input["PassengerId"])
test_ids = set(test_input["PassengerId"])
validate_ids = set(validate_input["PassengerId"])

display(train_ids & test_ids)
display(train_ids & validate_ids)
display(test_ids & validate_ids)
train_input

In [None]:
from sklearn.datasets import make_circles, make_classification, make_moons
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, VotingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import ConstantKernel, DotProduct, Exponentiation, ExpSineSquared, Matern, RationalQuadratic, RBF
from sklearn.impute import SimpleImputer
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.metrics import balanced_accuracy_score, make_scorer
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import itertools

# Seed handed to every estimator (and CV splitter) that accepts a random_state.
rd = 42


# Candidate models keyed by a human-readable description.
# Insertion order is the order in which they are evaluated later on.
all_classifiers = {}

# Scores noted from earlier single-model experiments:
# 0.76: clf = KNeighborsClassifier(3)
# 0.77: clf = SVC(kernel="linear", C=0.025, random_state=rd)
# 0.74: clf = SVC(gamma=2, C=1, random_state=rd)
# 0.74: clf = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=rd)
# 0.79: clf = RandomForestClassifier(max_depth=3, n_estimators=3, max_features=1, random_state=rd)
# 0.78: clf = RandomForestClassifier(max_depth=5, n_estimators=100, max_features=1, random_state=rd)
# 0.83: clf = MLPClassifier(alpha=0.1, max_iter=1000, random_state=rd)
# 0.77: clf = GaussianProcessClassifier(1.0 * RBF(1.0), random_state=rd)
# 0.80: clf = DecisionTreeClassifier(max_depth=3, random_state=rd)
# 0.75: clf = AdaBoostClassifier(random_state=rd)
# 0.75: clf = GaussianNB()
# 0.76: clf = QuadraticDiscriminantAnalysis()

# k-nearest neighbours, k = 1..19
for k in range(1, 20):
    all_classifiers["k-neighbors, k=%s" % k] = KNeighborsClassifier(n_neighbors=k)

# Support vector machines: kernel x gamma x C grid
svc_kernels = ["linear", "poly", "rbf", "sigmoid"]  # linear, poly, rbf, sigmoid, precomputed
svc_gammas = ["scale", "auto", 1, 2, 3, 5]
svc_costs = [0.025, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
for k, g, c in itertools.product(svc_kernels, svc_gammas, svc_costs):
    all_classifiers["svc %s, g=%s, c=%s" % (k, g, c)] = SVC(kernel=k, gamma=g, C=c, random_state=rd)

# Random forests: depth x number of trees x features considered per split
for d, e, m in itertools.product(range(1, 10), range(1, 10), range(1, 20)):
    all_classifiers["random-forest, depth=%s, est=%s, max_features=%s" % (d, e, m)] = \
      RandomForestClassifier(max_depth=d, n_estimators=e, max_features=m, random_state=rd)

# Multi-layer perceptrons with one and with two hidden layers of the same width
mlp_grid = itertools.product(
    range(1, 10),    # hidden layer width
    ["relu"],        # identity, relu, logistic, tanh, relu
    ["lbfgs"],       # lbfgs (better for small datasets), sgd or adam
    ["adaptive"],    # constant, invscaling or adaptive
    [0.1, 0.5, 1],   # alpha
)
for s, act, solver, rate, a in mlp_grid:
    mlp_options = dict(
        activation=act,
        solver=solver,
        learning_rate=rate,
        alpha=a,
        max_iter=2000,
        random_state=rd,
    )
    all_classifiers["mlp, sz=%s, act=%s, solver=%s, learn_rate=%s, alpha=%s" % (s, act, solver, rate, a)] = \
      MLPClassifier(hidden_layer_sizes=(s,), **mlp_options)
    all_classifiers["mlp2, sz=%s, act=%s, solver=%s, learn_rate=%s, alpha=%s" % ((s, s), act, solver, rate, a)] = \
      MLPClassifier(hidden_layer_sizes=(s,s), **mlp_options)

# Gaussian processes, one per kernel
gp_kernels = [
    ConstantKernel(),
    DotProduct(),
    Exponentiation(RBF(), 2),
    1.0 * RBF(1.0),
    Matern(),
    RationalQuadratic(alpha_bounds=(1e-5, 1e10)),
]
for k in gp_kernels:
    all_classifiers["gaussian-process, %s" % k] = GaussianProcessClassifier(kernel=k, random_state=rd)

# Single decision trees, depth 1..9
for d in range(1, 10):
    all_classifiers["decision-tree, d=%s" % d] = DecisionTreeClassifier(max_depth=d, random_state=rd)

# AdaBoost over shallow trees (depth 1 and 2)
for d in range(1, 3):
    all_classifiers["ada-boost, d = %s" % d] = AdaBoostClassifier(
        estimator=DecisionTreeClassifier(max_depth=d),
        random_state=rd,
    )

all_classifiers["gaussian-nb"] = GaussianNB()
#all_classifiers["QDA"] = QuadraticDiscriminantAnalysis()

# Pre-processing steps placed in front of every candidate model.
scl = StandardScaler()
imp = SimpleImputer(strategy='mean')

# Scoring and cross-validation scheme are the same for every candidate, so
# build them once: balanced accuracy over 10 repeats of stratified 10-fold CV.
scorer = make_scorer(balanced_accuracy_score)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=rd)

max_score = 0
max_description = None
# (mean CV score, description, pipeline) for every candidate that could be evaluated.
scored_classifiers = []
for description, clf in all_classifiers.items():
    try:
        pip = make_pipeline(scl, imp, clf)

        #pip.fit(train_input, train_labels)

        #test_score = pip.score(test_input, test_labels)
        #validate_score = pip.score(validate_input, validate_labels)
        #score = min(test_score, validate_score)
        #print("%s: %s (%s, %s)" % (description, score, test_score, validate_score))

        scores = cross_val_score(pip, all_input, all_labels, scoring=scorer, cv=cv)
        score = scores.mean()
        print("%s: %s (std: %s)" % (description, score, scores.std()))

        scored_classifiers.append((score, description, pip))

        if score > max_score:
            max_score = score
            max_description = description

            # The model did good, train it on all the data we have
            pip.fit(all_input, all_labels)

            # Overwrite the submission file with the predictions of the new best model.
            challenge_output = pip.predict(challenge_input)
            challenge_result = challenge_input.copy()
            challenge_result["Survived"] = challenge_output
            challenge_result.to_csv("cpitrat_result.csv", columns=["PassengerId", "Survived"], index=False)
    except Exception as exc:
        # One failing configuration must not abort the sweep: report it and move on.
        print("Failed on '%s': %s" % (description, exc))

print("Max score for '%s': %s" % (max_description, max_score))

# Best candidates first.
scored_classifiers.sort(key=lambda x: x[0], reverse=True)

In [None]:
# Voting of the N best classifiers

N = 10

# scored_classifiers holds (score, description, pipeline) tuples; take the first N.
top_scored = scored_classifiers[:N]
for top_score, top_name, top_model in top_scored:
    print("%s: %s" % (top_score, top_name))
estimators = [(name, model) for (_score, name, model) in top_scored]

v = VotingClassifier(estimators)

# Evaluate the ensemble with balanced accuracy over repeated stratified 10-fold CV.
scores = cross_val_score(
    v,
    all_input,
    all_labels,
    scoring=make_scorer(balanced_accuracy_score),
    cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=rd),
)

print("Voting score: %s (std: %s)" % (scores.mean(), scores.std()))

# Refit on every labelled row, then write the challenge predictions.
v.fit(all_input, all_labels)

challenge_output = v.predict(challenge_input)
challenge_result = challenge_input.assign(Survived=challenge_output)
challenge_result.to_csv("cpitrat_result.csv", columns=["PassengerId", "Survived"], index=False)

In [None]:
# Voting of the classifiers above a certain score, with max N per type

from collections import defaultdict

# At most N classifiers of the same type take part in the vote.
N = 3
# Classifiers whose mean cross-validation score is below this are left out.
MIN_SCORE = 0.8095

estimators = []
# Number of classifiers already selected for each type.
types = defaultdict(int)
for s, n, c in scored_classifiers:
    # scored_classifiers is sorted best-first, so nothing after the first
    # below-threshold entry can qualify. Check BEFORE selecting so that entry
    # is not added to the ensemble.
    if s < MIN_SCORE:
        break
    # The type is the part of the description before the first comma.
    t = n.split(',')[0]
    if types[t] < N:
        types[t] += 1
        estimators.append((n, c))
        print("%s: %s: %s" % (s, t, n))

if not estimators:
    raise ValueError("No classifier scored at least %s; nothing to vote with" % MIN_SCORE)

v = VotingClassifier(estimators)

# Evaluate the ensemble with balanced accuracy over repeated stratified 10-fold CV.
scores = cross_val_score(v, all_input, all_labels,
                         scoring=make_scorer(balanced_accuracy_score),
                         cv=RepeatedStratifiedKFold(n_splits=10, n_repeats=10,
                                                    random_state=rd))

print("Voting score: %s (std: %s)" % (scores.mean(), scores.std()))

# Refit on every labelled row, then write the challenge predictions.
v.fit(all_input, all_labels)

challenge_output = v.predict(challenge_input)
challenge_result = challenge_input.copy()
challenge_result["Survived"] = challenge_output
challenge_result.to_csv("cpitrat_result.csv", columns=["PassengerId", "Survived"], index=False)