# Google Colab Setup

In [130]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/DataScience/machine_learning/kaggle/Titanic

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/DataScience/machine_learning/kaggle/Titanic


In [2]:
import os
# Where to save the figures
PROJECT_ROOT_DIR = "."
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Libraries

In [110]:
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

# Data scaling
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

# Evaluation
from sklearn.metrics import mean_squared_error

In [4]:
# Libraries for custom Transformer
from sklearn.base import BaseEstimator, TransformerMixin

# Data Loading

In [134]:
import pandas as pd
PROJECT_ROOT = '/content/drive/MyDrive/DataScience/machine_learning/kaggle/Titanic'

train = pd.read_csv(os.path.join(PROJECT_ROOT, 'datasets', 'train.csv'))
test = pd.read_csv(os.path.join(PROJECT_ROOT, 'datasets', 'test.csv'))

In [6]:
train.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [None]:
train.shape

(891, 12)

In [7]:
train.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [8]:
train = train.set_index('PassengerId')
test = test.set_index('PassengerId')

# Data Type

In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [10]:
num_cols = [col for col in train.columns if train[col].dtype in ['int64', 'float64']]
cat_cols = [col for col in train.columns if train[col].dtype in ['object']]

print('num_cols:', num_cols)
print('cat_cols:', cat_cols)

num_cols: ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
cat_cols: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [11]:
card = [train[col].nunique() for col in cat_cols]
for a, b in zip(cat_cols, card):
    print(a, b)

low_card_cols = [col for col in cat_cols if train[col].nunique() <= 10]
high_card_cols = [col for col in cat_cols if train[col].nunique() > 10]
print('low_card_cols:', low_card_cols)
print('high_card_cols:', high_card_cols)

Name 891
Sex 2
Ticket 681
Cabin 147
Embarked 3
low_card_cols: ['Sex', 'Embarked']
high_card_cols: ['Name', 'Ticket', 'Cabin']


# Data Cleaning

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Name      891 non-null    object 
 3   Sex       891 non-null    object 
 4   Age       714 non-null    float64
 5   SibSp     891 non-null    int64  
 6   Parch     891 non-null    int64  
 7   Ticket    891 non-null    object 
 8   Fare      891 non-null    float64
 9   Cabin     204 non-null    object 
 10  Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [None]:
# Inpute Age column with median age of the corresponding class
pclass = train.Pclass.unique()
median_age_pclass = [train[train['Pclass'] == c]['Age'].median() for c in pclass]
print('median_age_pclass:', median_age_pclass)

# TODO: finish custom class for imputing median_age_pclass

median_age_pclass: [24.0, 37.0, 29.0]


## Filling NaN Values

In [13]:
imputer = ColumnTransformer([
    ('imputer_num', SimpleImputer(strategy='median'), num_cols),
    ('imputer_cat', SimpleImputer(strategy='most_frequent'), cat_cols)
])

In [14]:
clean_pipe = Pipeline([
    ('imputer', imputer)
])

In [15]:
train_clean = imputer.fit_transform(train)
train_clean

array([[0.0, 3.0, 22.0, ..., 'A/5 21171', 'B96 B98', 'S'],
       [1.0, 1.0, 38.0, ..., 'PC 17599', 'C85', 'C'],
       [1.0, 3.0, 26.0, ..., 'STON/O2. 3101282', 'B96 B98', 'S'],
       ...,
       [0.0, 3.0, 28.0, ..., 'W./C. 6607', 'B96 B98', 'S'],
       [1.0, 1.0, 26.0, ..., '111369', 'C148', 'C'],
       [0.0, 3.0, 32.0, ..., '370376', 'B96 B98', 'Q']], dtype=object)

In [16]:
train_clean = pd.DataFrame(train_clean)
train_clean.columns = num_cols + cat_cols
train_clean.set_index(train.index)
train_clean.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Name,Sex,Ticket,Cabin,Embarked
0,0.0,3.0,22.0,1.0,0.0,7.25,"Braund, Mr. Owen Harris",male,A/5 21171,B96 B98,S
1,1.0,1.0,38.0,1.0,0.0,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C
2,1.0,3.0,26.0,0.0,0.0,7.925,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,B96 B98,S
3,1.0,1.0,35.0,1.0,0.0,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S
4,0.0,3.0,35.0,0.0,0.0,8.05,"Allen, Mr. William Henry",male,373450,B96 B98,S


In [17]:
train_clean.shape

(891, 11)

In [18]:
# After imputing, all dtype will change to 'object'
# So need to reassign float64 type to num_cols
X_ = train_clean[num_cols].astype('float64')
train_clean = X_.join(train_clean[cat_cols])

In [19]:
train_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    891 non-null    float64
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    float64
 4   Parch     891 non-null    float64
 5   Fare      891 non-null    float64
 6   Name      891 non-null    object 
 7   Sex       891 non-null    object 
 8   Ticket    891 non-null    object 
 9   Cabin     891 non-null    object 
 10  Embarked  891 non-null    object 
dtypes: float64(6), object(5)
memory usage: 76.7+ KB


# Data Analysis

In [None]:
train_clean.head(2)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Name,Sex,Ticket,Cabin,Embarked
0,0.0,3.0,22.0,1.0,0.0,7.25,"Braund, Mr. Owen Harris",male,A/5 21171,B96 B98,S
1,1.0,1.0,38.0,1.0,0.0,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C


In [None]:
train_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    float64
 1   Pclass    891 non-null    float64
 2   Age       891 non-null    float64
 3   SibSp     891 non-null    float64
 4   Parch     891 non-null    float64
 5   Fare      891 non-null    float64
 6   Name      891 non-null    object 
 7   Sex       891 non-null    object 
 8   Ticket    891 non-null    object 
 9   Cabin     891 non-null    object 
 10  Embarked  891 non-null    object 
dtypes: float64(6), object(5)
memory usage: 76.7+ KB


## Age Bucket
Divide ages into several intervals, that each age interval will have different survival rate

In [68]:
train_clean['AgeBucket'] = train_clean['Age'] // 15 * 15
train_clean[['AgeBucket', 'Survived']].groupby(['AgeBucket']).mean()

Unnamed: 0_level_0,Survived
AgeBucket,Unnamed: 1_level_1
0.0,0.576923
15.0,0.337474
30.0,0.423256
45.0,0.404494
60.0,0.24
75.0,1.0


## Relatives On Board

In [69]:
train_clean['RelativesOnboard'] = train_clean.SibSp + train_clean.Parch
train_clean[['RelativesOnboard', 'Survived']].groupby(['RelativesOnboard']).mean()

Unnamed: 0_level_0,Survived
RelativesOnboard,Unnamed: 1_level_1
0.0,0.303538
1.0,0.552795
2.0,0.578431
3.0,0.724138
4.0,0.2
5.0,0.136364
6.0,0.333333
7.0,0.0
10.0,0.0


## Name Title

In [None]:
train_clean.Name.head(20)

0                               Braund, Mr. Owen Harris
1     Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                Heikkinen, Miss. Laina
3          Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                              Allen, Mr. William Henry
5                                      Moran, Mr. James
6                               McCarthy, Mr. Timothy J
7                        Palsson, Master. Gosta Leonard
8     Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                   Nasser, Mrs. Nicholas (Adele Achem)
10                      Sandstrom, Miss. Marguerite Rut
11                             Bonnell, Miss. Elizabeth
12                       Saundercock, Mr. William Henry
13                          Andersson, Mr. Anders Johan
14                 Vestrom, Miss. Hulda Amanda Adolfina
15                     Hewlett, Mrs. (Mary D Kingcome) 
16                                 Rice, Master. Eugene
17                         Williams, Mr. Charles

In [70]:
def extract(name, titles):
    for t in titles:
        if t in name:
            return t
    return np.nan

title_list=['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev',
                    'Dr', 'Ms', 'Mlle','Col', 'Capt', 'Mme', 'Countess',
                    'Don', 'Jonkheer']

train_clean['Title'] = train_clean['Name'].map(lambda name: extract(name, title_list))


In [71]:
for a, b in zip(train_clean.Title.head(10), train_clean.Name.head(10)):
    print(a, ',', b)


Mr , Braund, Mr. Owen Harris
Mrs , Cumings, Mrs. John Bradley (Florence Briggs Thayer)
Miss , Heikkinen, Miss. Laina
Mrs , Futrelle, Mrs. Jacques Heath (Lily May Peel)
Mr , Allen, Mr. William Henry
Mr , Moran, Mr. James
Mr , McCarthy, Mr. Timothy J
Master , Palsson, Master. Gosta Leonard
Mrs , Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
Mrs , Nasser, Mrs. Nicholas (Adele Achem)


In [72]:
train_clean[train_clean.Title.isna()]
# Or: # train_clean[train_clean.Title.isnull()]

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Name,Sex,Ticket,Cabin,Embarked,AgeBucket,RelativesOnboard,Title


In [73]:
train_clean.Title.value_counts()

Mr          518
Miss        180
Mrs         129
Master       40
Dr            7
Rev           6
Major         2
Col           2
Don           1
Mme           1
Ms            1
Mlle          1
Capt          1
Countess      1
Jonkheer      1
Name: Title, dtype: int64

In [96]:
train_clean.shape

(891, 15)

## Cabin to Deck 

In [74]:
train_clean.Cabin.head(10)

0    B96 B98
1        C85
2    B96 B98
3       C123
4    B96 B98
5    B96 B98
6        E46
7    B96 B98
8    B96 B98
9    B96 B98
Name: Cabin, dtype: object

In [75]:
#Turning cabin number into Deck
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
train_clean['Deck'] = train_clean['Cabin'].map(lambda x: extract(x, cabin_list))

In [76]:
for a, b in zip(train_clean.Deck.head(10), train_clean.Cabin.head(10)):
    print(a, ',', b)

B , B96 B98
C , C85
B , B96 B98
C , C123
B , B96 B98
B , B96 B98
E , E46
B , B96 B98
B , B96 B98
B , B96 B98


In [97]:
train_clean.Deck.value_counts()

B    734
C     59
E     33
D     33
A     15
F     12
G      4
T      1
Name: Deck, dtype: int64

In [77]:
train_clean.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Name,Sex,Ticket,Cabin,Embarked,AgeBucket,RelativesOnboard,Title,Deck
0,0.0,3.0,22.0,1.0,0.0,7.25,"Braund, Mr. Owen Harris",male,A/5 21171,B96 B98,S,15.0,1.0,Mr,B
1,1.0,1.0,38.0,1.0,0.0,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C,30.0,1.0,Mrs,C
2,1.0,3.0,26.0,0.0,0.0,7.925,"Heikkinen, Miss. Laina",female,STON/O2. 3101282,B96 B98,S,15.0,0.0,Miss,B
3,1.0,1.0,35.0,1.0,0.0,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,113803,C123,S,30.0,1.0,Mrs,C
4,0.0,3.0,35.0,0.0,0.0,8.05,"Allen, Mr. William Henry",male,373450,B96 B98,S,30.0,0.0,Mr,B


## Transformer

In [40]:
class AddFeature(BaseEstimator, TransformerMixin):
    def __init__(self, columns=num_cols+cat_cols):
        self.cols = columns

    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # X is a numpy array, the columns to with high cardinality
        # X_ = X.copy()
        X_df = pd.DataFrame(X, columns=num_cols+cat_cols)

        X_df['AgeBucket'] = X_df['Age'] // 15 * 15
        X_df['RelativesOnboard'] = X_df.SibSp + X_df.Parch
        X_df['Title'] = X_df['Name'].map(lambda name: extract(name, title_list))
        X_df['Deck'] = X_df['Cabin'].map(lambda x: extract(x, cabin_list))

        return X_df.to_numpy()

In [39]:
class DelFeature(BaseEstimator, TransformerMixin):
    def __init__(self, columns=num_cols+cat_cols):
        self.cols = columns

    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # X is a numpy array, the columns to with high cardinality
        # X_ = X.copy()
        X_df = pd.DataFrame(X, columns=num_cols+cat_cols)

        X_df['AgeBucket'] = X_df['Age'] // 15 * 15
        X_df['RelativesOnboard'] = X_df.SibSp + X_df.Parch
        X_df['Title'] = X_df['Name'].map(lambda name: extract(name, title_list))
        X_df['Deck'] = X_df['Cabin'].map(lambda x: extract(x, cabin_list))

        return X_df.to_numpy()

# Data Split

In [24]:
# Data split

y_train = train_clean['Survived']
X_train = train_clean.drop(['Survived'], axis=1)

# Pipleline

In [36]:
X_train.dtypes

Pclass      float64
Age         float64
SibSp       float64
Parch       float64
Fare        float64
Name         object
Sex          object
Ticket       object
Cabin        object
Embarked     object
dtype: object

In [35]:
# num_cols = ['Pclass', 'SibSp', 'Parch', 'Fare', 'AgeBucket', 'RelativesOnboard']
# cat_cols = ['Sex', 'Embarked', 'Title', 'Deck']

# num_cols = ['Pclass', 'Age', 'Parch', 'Fare', 'SibSp']
# cat_cols = ['Sex', 'Embarked', 'Title', 'Deck']
num_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64']]
cat_cols = [col for col in X_train.columns if X_train[col].dtype in ['object']]

print('num_cols:', num_cols)
print('cat_cols:', cat_cols)

num_cols: ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
cat_cols: ['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


In [28]:
card = [X_train[col].nunique() for col in cat_cols]
for a, b in zip(cat_cols, card):
    print(a, b)

low_card_cols = [col for col in cat_cols if X_train[col].nunique() <= 10]
high_card_cols = [col for col in cat_cols if X_train[col].nunique() > 10]
print('low_card_cols:', low_card_cols)
print('high_card_cols:', high_card_cols)

Name 891
Sex 2
Ticket 681
Cabin 147
Embarked 3
low_card_cols: ['Sex', 'Embarked']
high_card_cols: ['Name', 'Ticket', 'Cabin']


### High Card Trans

In [37]:
# High Cardinality Transformer: perform one-hot transformer for high-cardinality
# columns while avoiding generating too much dummpy features. It will only one-hot
# the classes with high proportion. A threshold can be set.

from collections import Counter

def cumulatively_categorise(column,threshold=0.75,return_categories_list=True):
    #Find the threshold value using the percentage and number of instances in the column
    threshold_value=int(threshold*len(column))
    #Initialise an empty list for our new minimised categories
    categories_list=[]
    #Initialise a variable to calculate the sum of frequencies
    s=0
    #Create a counter dictionary of the form unique_value: frequency
    counts=Counter(column)

    #Loop through the category name and its corresponding frequency after sorting the categories by descending order of frequency
    for i,j in counts.most_common():
        #Add the frequency to the global sum
        s+=dict(counts)[i]
        #Append the category name to the list
        categories_list.append(i)
        #Check if the global sum has reached the threshold value, if so break the loop
        if s>=threshold_value:
            break
    #Append the category Other to the list
    categories_list.append('Other')

    #Replace all instances not in our new categories by Other  
    new_column=column.apply(lambda x: x if x in categories_list else 'Other')

    #Return transformed column and unique values if return_categories=True
    if(return_categories_list):
        return new_column,categories_list
    #Return only the transformed column if return_categories=False
    else:
        return new_column

In [38]:
# high_card_cols = ['Regionname', 'CouncilArea']
class HighCardAggregation(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.cols = high_card_cols
        pass

    def fit(self, X, y = None):
        return self

    def transform(self, X, y = None):
        # X is a numpy array, the columns to with high cardinality
        X_ = X.copy()
        X_df = pd.DataFrame(X, columns=high_card_cols)
        
        for col in self.cols:
            # transformed_columns is a pandas dataframe
            transformed_column, trans_list = cumulatively_categorise(column=X_df[col], threshold=0.75)
#             print('transformed_column: \n', transformed_column.value_counts())
            X_ = np.c_[X_, transformed_column.to_numpy()]
            
        X_ = np.delete(X_, [i for i in range(len(self.cols))], 1)
        return X_


## Pipelines

In [83]:
class AddFeature(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.cols = columns
        self.del_cols = ['Age', 'SibSp', 'Parch', 'Name', 'Cabin', 'Ticket']
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        # X is a numpy array, the columns to with high cardinality
        # X_ = X.copy()
        X_df = pd.DataFrame(X, columns=num_cols+cat_cols)

        X_df['AgeBucket'] = X_df['Age'] // 15 * 15
        X_df['RelativesOnboard'] = X_df.SibSp + X_df.Parch
        X_df['Title'] = X_df['Name'].map(lambda name: extract(name, title_list))
        X_df['Deck'] = X_df['Cabin'].map(lambda x: extract(x, cabin_list))

        # Drop unwanted columns after creating new features
        X_df.drop(self.del_cols, axis=1, inplace=True)

        # return X_df.to_numpy()
        return X_df

In [79]:
train_clean.head(2)

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Name,Sex,Ticket,Cabin,Embarked,AgeBucket,RelativesOnboard,Title,Deck
0,0.0,3.0,22.0,1.0,0.0,7.25,"Braund, Mr. Owen Harris",male,A/5 21171,B96 B98,S,15.0,1.0,Mr,B
1,1.0,1.0,38.0,1.0,0.0,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,PC 17599,C85,C,30.0,1.0,Mrs,C


In [88]:
'''
array([[3.0, 7.25, 'male', 'S', 15.0, 1.0, 'Mr', 'B'],
       [1.0, 71.2833, 'female', 'C', 30.0, 1.0, 'Mrs', 'C'],
       [3.0, 7.925, 'female', 'S', 15.0, 0.0, 'Miss', 'B'],
       [1.0, 53.1, 'female', 'S', 30.0, 1.0, 'Mrs', 'C'],
       [3.0, 8.05, 'male', 'S', 30.0, 0.0, 'Mr', 'B']], dtype=object)
# pclass, fare, sex, embark, agebucket, RelativesOnboard, title, deck
'''

class FullPipeline:
    def __init__(self, data):
        # This is all columns except y: y is absent 
        self.num_cols = [col for col in data.columns if data[col].dtype in ['int64', 'float64']]
        self.cat_cols = [col for col in data.columns if data[col].dtype in ['object']]
        print(self.num_cols)
        print(self.cat_cols)

        self.feat_num_cols = ['Pclass', 'Fare', 'AgeBucket', 'RelativesOnboard']
        self.feat_cat_cols = ['Sex', 'Embarked', 'Title', 'Deck']

        self.low_card_cols = ['Sex', 'Embarked', 'Deck']
        self.high_card_cols = ['Title']

        self.low_card_pipe = Pipeline(steps=[
            # ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
        ])

        self.high_card_pipe = Pipeline(steps=[
            # ('imputer', SimpleImputer(strategy='most_frequent')),
            ('aggregation', HighCardAggregation()),
            ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
        ])

        # Categorical transformers
        self.cat_trans = ColumnTransformer(transformers=[
            ('low_card_pipeline', self.low_card_pipe, self.low_card_cols),
            ('high_card_pipeline', self.high_card_pipe, self.high_card_cols)
        ])

        # Numerical transformer
        self.num_trans = Pipeline(steps=[
            # ('imputer', SimpleImputer(strategy='median')),
            ('std_scaler', StandardScaler())
        ])

        self.num_cat_trans = ColumnTransformer(transformers=[
            ('num', self.num_trans, self.feat_num_cols),
            ('cat', self.cat_trans, self.feat_cat_cols)
        ])

        self.feat_eng = Pipeline([
            ('add_feat', AddFeature(self.num_cols + self.cat_cols)),
            # ('delete_feat', DelFeature())     
        ])

        self.imputer = ColumnTransformer([
            ('imputer_num', SimpleImputer(strategy='median'), self.num_cols),
            ('imputer_cat', SimpleImputer(strategy='most_frequent'), self.cat_cols)
        ])

        self.clean_pipe = Pipeline([
            ('imputer', self.imputer)
        ])

        self.preprocessor = Pipeline([
            ('clean', self.clean_pipe),
            ('feature_eng', self.feat_eng),
            ('num_cat', self.num_cat_trans)
        ])
    

In [89]:
'''
1. Input all columns
2. clean (eg. imputer for all)
3. feature engineering (eg. add new features, delete features)
4. num-cat transform (eg. onehot, standardscaler)
'''

full_pipeline = FullPipeline(train.drop(['Survived'], axis=1))

['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']


# Train

In [111]:
# Very raw train data

X_train_prepared = full_pipeline.preprocessor.fit_transform(train.drop(['Survived'], axis=1))

In [112]:
X_train_prepared.shape
# print(type(X_train_prepared))

(891, 18)

## Model

In [122]:
from sklearn.model_selection import cross_val_score

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train_prepared, y_train, cv=10)
forest_scores.mean()

0.8159800249687889

In [121]:
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto")
svm_scores = cross_val_score(svm_clf, X_train_prepared, y_train, cv=10)
svm_scores.mean()

0.8125842696629213

# Test

In [113]:
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train_prepared, y_train)

RandomForestClassifier(random_state=42)

In [114]:
X_test_prepared = full_pipeline.preprocessor.transform(test)
print(X_test_prepared.shape)
y_pred = forest_clf.predict(X_test_prepared)

(418, 18)


In [115]:
print(y_pred.shape)
y_pred[:5]

(418,)


array([0., 0., 0., 0., 1.])

In [126]:
y_pred_class = pd.DataFrame(data={'Survived': y_pred.astype(int)}, index=test.index)

In [127]:
y_pred_class[:5]

Unnamed: 0_level_0,Survived
PassengerId,Unnamed: 1_level_1
892,0
893,0
894,0
895,0
896,1


In [135]:
y_pred_class.to_csv(os.path.join(PROJECT_ROOT, 'predictions', 'predictions.csv'))