In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils.validation import check_is_fitted
import numpy as np
import re
from utils import get_names_out_from_ColumnTransformer
from sklearn import set_config
set_config(display="diagram")

In [2]:
# Load the labelled training set and preview its first rows.
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Separate the feature frame from the binary target column.
y = data["Survived"].copy()
X = data.drop(columns="Survived").copy()

In [4]:
# Hold out 20% of the rows as a dev set (seeded for reproducibility).
# NOTE(review): the split is stratified on the 'Sex' feature, not on the
# target `y`; confirm this is intended -- `stratify=y` is the usual choice
# when the label balance should match between train and dev.
X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.2, random_state=42, stratify=X["Sex"])

In [5]:
# Dtypes and non-null counts per column of the training split.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 788 to 261
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Pclass       712 non-null    int64  
 2   Name         712 non-null    object 
 3   Sex          712 non-null    object 
 4   Age          578 non-null    float64
 5   SibSp        712 non-null    int64  
 6   Parch        712 non-null    int64  
 7   Ticket       712 non-null    object 
 8   Fare         712 non-null    float64
 9   Cabin        164 non-null    object 
 10  Embarked     710 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 66.8+ KB


In [6]:
# Summary statistics of the numeric columns.
X_train.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,712.0,712.0,578.0,712.0,712.0,712.0
mean,449.63764,2.323034,29.781436,0.546348,0.373596,31.282893
std,256.778217,0.838341,14.628503,1.110283,0.803144,44.377233
min,2.0,1.0,0.42,0.0,0.0,0.0
25%,231.75,2.0,21.0,0.0,0.0,7.8958
50%,454.5,3.0,28.0,0.0,0.0,14.45625
75%,667.25,3.0,38.0,1.0,0.0,31.275
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
# Cardinality and most frequent value of the string (object) columns.
X_train.describe(include='object')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,712,712,712,164,710
unique,712,2,566,128,3
top,"Dean, Master. Bertram Vere",male,1601,C23 C25 C27,S
freq,1,461,7,4,515


In [8]:
class NumFeaturesAdder(BaseEstimator, TransformerMixin):
    """
    Append an age-group bucket and a relatives count to a numeric array.

    The transformer works on a positional 2-D array, so it has to know at
    which position the 'Age', 'SibSp' and 'Parch' columns sit.

    Parameters
    ----------
    columns : list of str, default=None
        Names of the incoming columns, in positional order. When None the
        notebook-level ``num_cols`` list is used, which keeps
        ``NumFeaturesAdder()`` working exactly as before.
    """
    def __init__(self, columns=None):
        self.columns = columns

    def _input_names(self, input_features=None):
        """Return the input column names as a new plain list."""
        if input_features is not None:
            # list() also accepts the ndarray of names scikit-learn passes around.
            return list(input_features)
        if self.columns is not None:
            return list(self.columns)
        # Backward-compatible fallback: the global defined in the notebook.
        return list(num_cols)

    def fit(self, X, y=None):
        """Record the number of input columns; nothing is learned."""
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X):
        """
        Return ``X`` with two extra trailing columns: AgeGroup and Relatives.

        AgeGroup is the ``np.digitize`` bin index of Age over the edges
        [0, 10, 20, 30, 50, 100]; Relatives is SibSp + Parch.
        """
        names = self._input_names()
        aix, six, pix = [names.index(e) for e in ['Age', 'SibSp', 'Parch']]
        # Positional slicing below needs an array, not a DataFrame.
        X = np.asarray(X)
        age_cut = np.digitize(X[:, aix], [0, 10, 20, 30, 50, 100])
        relatives = X[:, six] + X[:, pix]
        return np.c_[X, age_cut, relatives]

    def get_feature_names_out(self, input_features=None):
        """Return the input names followed by 'AgeGroup' and 'Relatives'."""
        feature_names = self._input_names(input_features) + ['AgeGroup', 'Relatives']
        return np.asarray(feature_names, dtype=object)

In [9]:
class TicketConverter(BaseEstimator, TransformerMixin):
    """
    Replace the raw 'Ticket' column with a coarse ticket-prefix category.

    The transformer works on a positional 2-D array, so it has to know at
    which position the 'Ticket' column sits. The converted column is moved
    to the last position of the output.

    Parameters
    ----------
    columns : list of str, default=None
        Names of the incoming columns, in positional order. When None the
        notebook-level ``cat_cols`` list is used, which keeps
        ``TicketConverter()`` working exactly as before.
    """
    def __init__(self, columns=None):
        self.columns = columns

    def _input_names(self, input_features=None):
        """Return the input column names as a new plain list."""
        if input_features is not None:
            # list() also accepts the ndarray of names scikit-learn passes around.
            return list(input_features)
        if self.columns is not None:
            return list(self.columns)
        # Backward-compatible fallback: the global defined in the notebook.
        return list(cat_cols)

    def fit(self, X, y=None):
        """Record the number of input columns; nothing is learned."""
        self.n_features_in_ = X.shape[1]
        return self

    def custom_ticket(self, s):
        """
        Map one ticket string to its known prefix, or 'Other'.

        Punctuation is stripped first, so 'A/5 21171' is matched as 'A5 21171'.
        Alternatives are tried left to right at the start of the string, which
        is why longer prefixes such as 'CA' are listed before 'C'.
        """
        no_punct = re.sub(r'[^\w\s]', '', s)
        match = re.match(r'(PC|CA|SOC|STON|SOTON|PP|A5|SC|C|WC|FCC)', no_punct)
        return match[0] if match else 'Other'

    def transform(self, X):
        """Drop the raw Ticket column and append its prefix category as the last column."""
        tix = self._input_names().index('Ticket')
        # Positional slicing below needs an array, not a DataFrame.
        X = np.asarray(X)
        # A plain comprehension (unlike np.vectorize) also works on zero rows.
        ticket = np.array([self.custom_ticket(t) for t in X[:, tix]], dtype=str)
        X = np.delete(X, tix, axis=1)
        return np.c_[X, ticket]

    def get_feature_names_out(self, input_features=None):
        """Return the output names, with 'Ticket' moved last to mirror ``transform``."""
        feature_names = self._input_names(input_features)
        if 'Ticket' in feature_names:
            feature_names.remove('Ticket')
            feature_names.append('Ticket')
        return np.asarray(feature_names, dtype=object)

In [10]:
class GroupImputer(BaseEstimator, TransformerMixin):
    """
    Impute missing values with a statistic computed per group.

    ``fit`` and ``transform`` expect a pandas DataFrame that contains
    ``group_col``; ``transform`` returns a NumPy array.

    Parameters
    ----------
    group_col : str
        Column whose values define the groups.
    metric : str
        Aggregation used for the replacement value; intended to be one of
        ['mean', 'median']. It is passed straight to ``DataFrame.agg``.

    Attributes
    ----------
    impute_map_ : pandas.DataFrame
        One row per group seen in ``fit``: the group value in ``group_col``
        plus the aggregated value of every other column.
    """
    def __init__(self, group_col, metric='mean'):
        self.group_col = group_col
        self.metric = metric

    def fit(self, X, y=None):
        """Compute the per-group statistic of every column other than ``group_col``."""
        self.impute_map_ = (
            X.groupby(self.group_col)
             .agg(self.metric)
             .reset_index(drop=False)
        )
        return self

    def transform(self, X, y=None):
        """
        Fill NaNs in each fitted column with the value learned for the row's group.

        Rows whose group was not seen in ``fit`` (or whose group is NaN) are
        left unchanged. The input frame is not modified.
        """
        check_is_fitted(self, 'impute_map_')

        X = X.copy()

        # Index the learned statistics by group so each column is a lookup table.
        fill_table = self.impute_map_.set_index(self.group_col)
        groups = X[self.group_col]

        for col in fill_table.columns:
            # Per-row replacement value; NaN where the group is unknown.
            fill_values = groups.map(fill_table[col])
            X[col] = X[col].fillna(fill_values)

        return X.values

In [11]:
# Columns routed to each preprocessing branch. NumFeaturesAdder and
# TicketConverter look up column positions in these lists, so their order
# must match the order the ColumnTransformer feeds the columns in.
num_cols = ['Fare', 'Age', 'Pclass', 'SibSp', 'Parch']
cat_cols = ['Sex', 'Embarked', 'Ticket']

# Numeric branch: group-wise imputation -> engineered features -> scaling.
num_pipeline = Pipeline(steps=[
    ('imputer', GroupImputer(group_col='Pclass', metric='median')),
    ('add_feats', NumFeaturesAdder()),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: mode imputation -> ticket prefix -> one-hot encoding.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('ticket_conv', TicketConverter()),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Missing-value indicator flags computed over all selected raw columns.
miss_indicator = Pipeline(steps=[
    ('add_ind', MissingIndicator(error_on_new=False)),
])

preproc = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
        ('missing', miss_indicator, num_cols + cat_cols),
    ],
    remainder='drop',
)

In [12]:
# Fit the preprocessor on the training split and preview the transformed
# matrix as a labelled frame. Note that this call fits `preproc` in place.
# Column labels come from the project helper in `utils`; presumably it
# collects the output names of each branch -- verify against utils.py.
pd.DataFrame(preproc.fit_transform(X_train), 
             columns=get_names_out_from_ColumnTransformer(preproc, X_train),
             index=X_train.index).head()

Unnamed: 0,Fare,Age,Pclass,SibSp,Parch,AgeGroup,Relatives,Sex_female,Sex_male,Embarked_C,...,Ticket_Other,Ticket_PC,Ticket_PP,Ticket_SC,Ticket_SOC,Ticket_SOTON,Ticket_STON,Ticket_WC,Age_missing,Embarked_missing
788,-0.241462,-2.100059,0.808074,0.408878,2.026471,-2.237396,1.285409,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
347,-0.342373,-0.385389,0.808074,0.408878,-0.465493,-0.23359,0.049472,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
629,-0.531041,-0.385389,0.808074,-0.492426,-0.465493,-0.23359,-0.568496,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
734,-0.412278,-0.45994,-0.385596,-0.492426,-0.465493,-0.23359,-0.568496,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106,-0.53292,-0.609042,0.808074,-0.492426,-0.465493,-0.23359,-0.568496,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# End-to-end model: preprocessing followed by a seeded random forest.
# The step names 'pre' and 'fit' are the prefixes used in the parameter grid.
forest = RandomForestClassifier(random_state=42)
full_pipeline = Pipeline(steps=[
    ('pre', preproc),
    ("fit", forest),
])

In [14]:
%%time 

# Search space: the imputation statistic of the numeric branch combined
# with the random-forest hyper-parameters.
imputer_grid = {
    'pre__num__imputer__metric': ['median', 'mean'],
}
forest_grid = {
    'fit__bootstrap': [True],
    'fit__n_estimators': [50, 70],
    'fit__max_features': [9, 10, 11],
    'fit__max_depth': [3, 5, 7],
    'fit__criterion': ['gini', 'entropy'],
}
param_grid = [{**imputer_grid, **forest_grid}]

# Shuffled, seeded stratified folds so the CV scores are reproducible.
kfolds = StratifiedKFold(shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=full_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=kfolds,
)

grid_search.fit(X_train, y_train)

CPU times: total: 1min
Wall time: 1min


In [15]:
# Widen string columns so the full parameter dicts stay readable.
pd.set_option('max_colwidth', 400)

# Five best parameter combinations by mean cross-validated accuracy.
score_cols = ['params', 'mean_test_score', 'std_test_score']
cv_table = pd.DataFrame(grid_search.cv_results_)
cv_table[score_cols].sort_values(by='mean_test_score', ascending=False).head()

Unnamed: 0,params,mean_test_score,std_test_score
2,"{'fit__bootstrap': True, 'fit__criterion': 'gini', 'fit__max_depth': 3, 'fit__max_features': 9, 'fit__n_estimators': 70, 'pre__num__imputer__metric': 'median'}",0.822998,0.032288
3,"{'fit__bootstrap': True, 'fit__criterion': 'gini', 'fit__max_depth': 3, 'fit__max_features': 9, 'fit__n_estimators': 70, 'pre__num__imputer__metric': 'mean'}",0.822998,0.032288
50,"{'fit__bootstrap': True, 'fit__criterion': 'entropy', 'fit__max_depth': 5, 'fit__max_features': 9, 'fit__n_estimators': 70, 'pre__num__imputer__metric': 'median'}",0.822988,0.021869
51,"{'fit__bootstrap': True, 'fit__criterion': 'entropy', 'fit__max_depth': 5, 'fit__max_features': 9, 'fit__n_estimators': 70, 'pre__num__imputer__metric': 'mean'}",0.822978,0.023243
38,"{'fit__bootstrap': True, 'fit__criterion': 'entropy', 'fit__max_depth': 3, 'fit__max_features': 9, 'fit__n_estimators': 70, 'pre__num__imputer__metric': 'median'}",0.8216,0.030122


In [16]:
# Predict the held-out dev split with the best pipeline found by the grid
# search and report its accuracy.
y_hat_dev = grid_search.best_estimator_.predict(X_dev)
accuracy_score(y_dev, y_hat_dev)

0.8156424581005587

In [17]:
# Load the unlabelled test set (no 'Survived' column) and preview it.
X_test = pd.read_csv('test.csv')
X_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [18]:
# Predict survival for the test set with the best pipeline.
y_hat_test = grid_search.best_estimator_.predict(X_test)
# NOTE: this adds a column to X_test in place; the submission cell reads
# 'Survived' back from this frame.
X_test['Survived'] = y_hat_test
X_test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S,1


In [19]:
# Keep only the id and the prediction, then write them to
# 'my_submission.csv' without the index column.
submission = X_test.loc[:, ['PassengerId', 'Survived']].copy()
submission.to_csv('my_submission.csv', index=False)