In [11]:
# Base.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Models.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
# Metrics.
from sklearn.metrics import roc_auc_score, average_precision_score
# Scalers.
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# Utilities.
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.base import TransformerMixin

import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

In [12]:
df_train = pd.read_csv('dataset/train.csv')
df_test = pd.read_csv('dataset/test.csv')

In [13]:
X = df_train.drop('Survived', axis=1)
y = df_train.Survived
X_test = df_test

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

### Preprocess

In [15]:
cat_features = ['PassengerId', 'Title', 'Name', 'Cabin', 'Ticket', 'Embarked']

In [16]:
class NaImputer (TransformerMixin):
    def __init__(self):
        pass

    
    def fit(self, X, y=None):
        return self

    
    def transform(self, X, y=None):
        X_new = self.impute_age(X)

        return X_new
    

    @staticmethod
    def impute_age(X):
        X_new = X.copy()
        '''Imputation for compliting missing "Age" values
        using median depending on passenger title.
        '''
        # Extract titles from passenger names.
        X_new['Title'] = X_new.Name.apply(lambda x : x.replace(',', '').split()[1])
        # Impute median.
        titles = ['Mr.', 'Miss.', 'Mrs.', 'Master.']
        medians = [X_new.loc[X_new.Title == x].Age.median() for x in titles]
        median_ages = dict(zip(titles, medians))

        for title, median in median_ages.items():
            cond1 = X_new['Age'].isna()
            cond2 = X_new['Title'] == title
            X_new.loc[cond1 & cond2, 'Age'] = median
        
        # Set median to other titles.
        X_new.Age.fillna(X_new.loc[X_new.Sex == 'male'].Age.median(), inplace=True)
        
        return X_new


In [17]:
class FeatureExtractor (TransformerMixin):
    def __init__(self):
        pass

    
    def fit(self, X, y=None):
        return self

    
    def transform(self, X, y=None):
        X_new = self.simplify_feats(X)
        X_new.drop(columns=cat_features, inplace=True)

        return X_new
    

    def simplify_feats(self, X):
        X_new = X.copy()
        X_new.loc[X_new['SibSp'] >= 1, 'SibSp'] = 1
        X_new.loc[X_new['Parch'] >= 1, 'Parch'] = 1

        return X_new

In [18]:
ct = ColumnTransformer(
    remainder='passthrough', 
    transformers=[
        ('Ordinal Enc', OrdinalEncoder(), ['Sex']),
        ('Age scale', StandardScaler(), ['Age']),
    ]
)

In [19]:
pipe = Pipeline([
    ('NaN impute', NaImputer()),
    ('Feature extraction', FeatureExtractor()),
    ('Feature transform', ct),
    ('Model', lgb.LGBMClassifier(
        reg_alpha=1.9877816846823654e-05,
        reg_lambda=1.696137968600218e-08,
        num_leaves=72,
        min_child_samples=76,
        learning_rate=0.30120206892454016,
        n_estimators=150,
        )
    )
])

In [20]:
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_valid)
roc_auc_score(y_valid, y_pred)

0.8530244530244532