# Bonus: Decision Tree Classifier

## Import Libraries

In [1]:
import pandas as pd
import numpy as np

# Import model made from scratch
from Models.tree import DecisionTreeClassifier

## Import Dataset
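Using the Titanic dataset. Categorical columns are one-hot encoded during preprocessing, while the numeric columns (`Pclass`, `SibSp`, `Parch`, `Fare`) are kept as they are for evaluation.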

In [2]:
# Remote CSV holding the Titanic passenger records; it is downloaded each time this cell runs.
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'

titanic_df = pd.read_csv(TITANIC_CSV_URL)
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Splitting the data into train and test
# 20% of the rows are held out for testing. Stratifying on 'Survived' keeps the
# class proportions approximately equal in both splits, and the fixed
# random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(titanic_df, test_size=0.2, random_state=42, stratify=titanic_df['Survived'])

In [4]:
train_set

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
692,693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S
481,482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0.0000,,S
527,528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S
855,856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.0,0,1,392091,9.3500,,S
801,802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31.0,1,1,C.A. 31921,26.2500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
359,360,1,3,"Mockler, Miss. Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q
258,259,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C
736,737,0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",female,48.0,1,3,W./C. 6608,34.3750,,S
462,463,0,1,"Gee, Mr. Arthur H",male,47.0,0,0,111320,38.5000,E63,S


In [5]:
test_set

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
565,566,0,3,"Davies, Mr. Alfred J",male,24.0,2,0,A/4 48871,24.1500,,S
160,161,0,3,"Cribb, Mr. John Hatfield",male,44.0,0,1,371362,16.1000,,S
553,554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22.0,0,0,2620,7.2250,,C
860,861,0,3,"Hansen, Mr. Claus Peter",male,41.0,2,0,350026,14.1083,,S
241,242,1,3,"Murphy, Miss. Katherine ""Kate""",female,,1,0,367230,15.5000,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
880,881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25.0,0,1,230433,26.0000,,S
91,92,0,3,"Andreasson, Mr. Paul Edvin",male,20.0,0,0,347466,7.8542,,S
883,884,0,2,"Banfield, Mr. Frederick James",male,28.0,0,0,C.A./SOTON 34068,10.5000,,S
473,474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23.0,0,0,SC/AH Basle 541,13.7917,D,C


## Exploratory Data Analysis

In [6]:
# Show the column names, then how many columns the training frame has.
column_index = train_set.columns
print(column_index.values)
print("Features:", len(column_index))

['PassengerId' 'Survived' 'Pclass' 'Name' 'Sex' 'Age' 'SibSp' 'Parch'
 'Ticket' 'Fare' 'Cabin' 'Embarked']
Features: 12


In [7]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 692 to 507
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Name         712 non-null    object 
 4   Sex          712 non-null    object 
 5   Age          575 non-null    float64
 6   SibSp        712 non-null    int64  
 7   Parch        712 non-null    int64  
 8   Ticket       712 non-null    object 
 9   Fare         712 non-null    float64
 10  Cabin        160 non-null    object 
 11  Embarked     710 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 72.3+ KB


In [8]:
train_set.nunique()

PassengerId    712
Survived         2
Pclass           3
Name           712
Sex              2
Age             85
SibSp            7
Parch            7
Ticket         571
Fare           226
Cabin          127
Embarked         3
dtype: int64

### Examine Target Variable

In [9]:
train_set['Survived'].value_counts()

Survived
0    439
1    273
Name: count, dtype: int64

In [10]:
# Examine target variable.
# Count each class once and express it as a share of the training rows.
survival_counts = train_set['Survived'].value_counts()
print('Dead:', round(survival_counts[0]/len(train_set) * 100,2), '% of the dataset')
print('Survived:', round(survival_counts[1]/len(train_set) * 100,2), '% of the dataset')

Dead: 61.66 % of the dataset
Survived: 38.34 % of the dataset


## Data Preprocessing

### Drop ID

In [11]:
# Save PassengerId
train_passenger_id = train_set['PassengerId']
test_passenger_id = test_set['PassengerId']

# Drop columns
# Rebind to the result of drop() instead of using inplace=True, so the frames
# returned by train_test_split are not modified behind the reader's back and
# the later preprocessing steps start from fresh, independent frames.
train_set = train_set.drop(columns=['PassengerId'])
test_set = test_set.drop(columns=['PassengerId'])

### Handle Missing Values

In [12]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 692 to 507
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  712 non-null    int64  
 1   Pclass    712 non-null    int64  
 2   Name      712 non-null    object 
 3   Sex       712 non-null    object 
 4   Age       575 non-null    float64
 5   SibSp     712 non-null    int64  
 6   Parch     712 non-null    int64  
 7   Ticket    712 non-null    object 
 8   Fare      712 non-null    float64
 9   Cabin     160 non-null    object 
 10  Embarked  710 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 66.8+ KB


In [13]:
test_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 179 entries, 565 to 637
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  179 non-null    int64  
 1   Pclass    179 non-null    int64  
 2   Name      179 non-null    object 
 3   Sex       179 non-null    object 
 4   Age       139 non-null    float64
 5   SibSp     179 non-null    int64  
 6   Parch     179 non-null    int64  
 7   Ticket    179 non-null    object 
 8   Fare      179 non-null    float64
 9   Cabin     44 non-null     object 
 10  Embarked  179 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 16.8+ KB


In [14]:
# Split numerical and categorical variables
numeric_kinds = ['int', 'float']
text_kinds = ['object']

# Numeric columns of each split
num_feat = train_set.select_dtypes(include=numeric_kinds)
num_feat_test = test_set.select_dtypes(include=numeric_kinds)

# Object (string) columns of each split
cat_feat = train_set.select_dtypes(include=text_kinds)
cat_feat_test = test_set.select_dtypes(include=text_kinds)

In [15]:
def feat_null_pct(num, data):
    """Print the percentage of missing values for each column of ``num`` that has any.

    Columns are selected (and ordered by descending null count) from ``num``;
    the percentage itself is computed from the same-named column of ``data``.
    Nothing is returned.
    """
    null_counts = num.isnull().sum()
    with_nulls = null_counts[null_counts > 0].sort_values(ascending=False)

    for col in with_nulls.index:
        # value_counts() ignores NaN, so its total is the number of non-null rows.
        non_null_rows = data[col].value_counts().sum()
        null_pct = round(100 - non_null_rows/len(data) * 100, 2)
        print(f"{col} Null Values: {null_pct}% of the dataset")

In [16]:
# Missing-value report for the training split: numeric columns first, then categorical.
for feature_subset in (num_feat, cat_feat):
    feat_null_pct(feature_subset, train_set)

Age Null Values: 19.24% of the dataset
Cabin Null Values: 77.53% of the dataset
Embarked Null Values: 0.28% of the dataset


In [17]:
# Missing-value report for the test split: numeric columns first, then categorical.
for feature_subset in (num_feat_test, cat_feat_test):
    feat_null_pct(feature_subset, test_set)

Age Null Values: 22.35% of the dataset
Cabin Null Values: 75.42% of the dataset


In [18]:
# Create custom imputer class
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

class FeatureImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in the ``Embarked``, ``Age`` and ``Fare`` columns.

    ``Embarked`` is filled with a ``most_frequent`` SimpleImputer, and
    ``Age``/``Fare`` with a ``mean`` SimpleImputer. Both imputers are fitted in
    ``fit`` and only applied in ``transform``, so statistics learned from one
    frame can be reused on another.
    """

    def fit(self, X, y=None):
        """Fit the two imputers on ``X`` and return ``self``. ``y`` is ignored."""
        self.cat_imputer = SimpleImputer(strategy="most_frequent")
        self.num_imputer = SimpleImputer(strategy="mean")

        self.cat_imputer.fit(X[['Embarked']])
        self.num_imputer.fit(X[['Age', 'Fare']])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Embarked``, ``Age`` and ``Fare`` imputed.

        Also prints the null count of every column that still contains
        missing values after imputation.
        """
        # Work on a copy so the caller's DataFrame is never modified; this also
        # avoids writing into a frame that is a slice of another one.
        X = X.copy()

        X[['Embarked']] = self.cat_imputer.transform(X[['Embarked']])
        X[['Age', 'Fare']] = self.num_imputer.transform(X[['Age', 'Fare']])

        # Check which features contain null values
        null_columns = X.columns[X.isnull().any()]
        print(X[null_columns].isnull().sum())

        return X

### Feature Engineering

In [19]:
# Define a function to extract the title from a name
def extract_title(name):
    """Return the title found between the first comma and the next period of ``name``.

    For example ``"Braund, Mr. Owen Harris"`` gives ``"Mr"``. Surrounding
    whitespace is stripped. A name without a comma raises ``IndexError``.
    """
    after_surname = name.split(',')[1]
    title, _, _ = after_surname.partition('.')
    return title.strip()

In [20]:
# Define a function to determine whether the passenger is travelling alone or with a family member
def is_alone(row):
    """Return 1 when both ``row['SibSp']`` and ``row['Parch']`` are 0, otherwise 0."""
    has_family = row['SibSp'] != 0 or row['Parch'] != 0
    return 0 if has_family else 1

In [21]:
# Bin the ages into groups
def age_binner(age):
    """Map a numeric age to an age-group label.

    Upper bounds are inclusive:
    ``<= 16`` 'Child', ``<= 32`` 'Young adult', ``<= 48`` 'Adult',
    ``<= 64`` 'Middle aged', anything above 'Senior'.

    A missing age (``None`` or NaN) returns 'Unknown'. Without this guard every
    comparison against NaN is False, so a missing age would fall through to
    the final branch and be silently labelled 'Senior'.
    """
    if pd.isna(age):
        return 'Unknown'

    age_bins = (
        (16, 'Child'),
        (32, 'Young adult'),
        (48, 'Adult'),
        (64, 'Middle aged'),
    )
    for upper_bound, label in age_bins:
        if age <= upper_bound:
            return label
    return 'Senior'

In [22]:
# Create class to create new features
class FeatureCreator(BaseEstimator, TransformerMixin):
    """Derive the engineered features used by the model.

    Adds ``Title`` (from ``Name``), ``MultipleCabin`` (number of
    space-separated entries in ``Cabin``, 0 when missing) and ``isAlone``
    (1 when ``SibSp`` and ``Parch`` are both 0), and replaces the numeric
    ``Age`` column with its age-group label.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added."""
        # Work on a copy: this step adds columns and overwrites 'Age', which
        # would otherwise change the caller's DataFrame as a side effect.
        X = X.copy()

        X['Title'] = X['Name'].apply(extract_title)
        X['MultipleCabin'] = X['Cabin'].apply(lambda x: 0 if pd.isna(x) else len(x.split(' ')))
        X['isAlone'] = X.apply(is_alone, axis=1)
        X['Age'] = X['Age'].apply(age_binner)

        return X

In [23]:
# Create feature dropper class
class FeatureDropper(BaseEstimator, TransformerMixin):
    """Remove the ``Cabin``, ``Ticket`` and ``Name`` columns from a frame."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` without the three dropped columns (``X`` itself is untouched)."""
        unused_columns = ['Cabin', 'Ticket', 'Name']
        return X.drop(columns=unused_columns)

### Encoding

In [24]:
# Create custom encoder class
class FeatureEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode every object-dtype column of a frame.

    The encoder is stateless: the dummy columns are derived from the values
    present in the frame passed to ``transform``, so frames with different
    category values produce different column sets.
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self``."""
        return self

    def transform(self, X):
        """Return a new frame: non-object columns first, then the dummy columns.

        Dummy columns are named ``<column>_<value>`` and appear in the order
        of the original object columns. ``X`` itself is not modified.
        """
        # Use the column *labels*, not a sub-frame, for both the loop and the
        # drop; passing a DataFrame there only works by implicit conversion.
        cat_cols = X.select_dtypes(include=['object']).columns.tolist()

        # Perform one-hot encoding, then concatenate once instead of growing
        # the frame inside the loop.
        dummies = [pd.get_dummies(X[col], prefix=col) for col in cat_cols]

        # Drop original categorical columns from DataFrame
        return pd.concat([X.drop(columns=cat_cols), *dummies], axis=1)

In [25]:
def match_columns(train, test):
    """Return a copy of ``test`` whose columns match ``train`` exactly.

    Columns of ``test`` that are absent from ``train`` are removed, columns
    that exist only in ``train`` are added and filled with 0, and the result
    follows the column order of ``train``.

    Neither input is modified. Assigning the missing columns one by one would
    write into the caller's ``test`` frame whenever no column had been dropped
    beforehand, so a single ``reindex`` is used instead.
    """
    return test.reindex(columns=train.columns, fill_value=0)

### Handle Imbalances

In [26]:
from imblearn.over_sampling import SMOTE

def balance_classes(X_t, y_t):
    """Resample ``X_t``/``y_t`` with SMOTE (``random_state=42``).

    Returns the resampled features and targets as a 2-tuple.
    """
    oversampler = SMOTE(random_state=42)
    features_resampled, target_resampled = oversampler.fit_resample(X_t, y_t)
    return features_resampled, target_resampled

## Modelling and Fitting the Pipeline

In [27]:
from sklearn.pipeline import Pipeline

# Preprocessing stages, listed in the order they are applied.
preprocessing_steps = [
    ("imputer", FeatureImputer()),
    ("featurecreator", FeatureCreator()),
    ("dropper", FeatureDropper()),
    ("encoder", FeatureEncoder()),
]

pipeline = Pipeline(preprocessing_steps)

In [28]:
# Fit the preprocessing steps on the training split only, then reuse the
# already-fitted pipeline on the test split so nothing is learned from it.
# Both names are rebound to the processed frames, so this cell is meant to be
# run exactly once per kernel session.
train_set = pipeline.fit_transform(train_set)
test_set = pipeline.transform(test_set)

Cabin    552
dtype: int64
Cabin    135
dtype: int64


In [29]:
# The encoder derives dummy columns from whatever values each split contains,
# so align the test columns (names and order) with the training columns.
test_set = match_columns(train_set, test_set)

In [30]:
train_set.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,MultipleCabin,isAlone,Sex_female,Sex_male,Age_Adult,...,Title_Lady,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
692,1,3,0,0,56.4958,0,1,False,True,False,...,False,False,False,False,False,True,False,False,False,False
481,0,2,0,0,0.0,0,1,False,True,False,...,False,False,False,False,False,True,False,False,False,False
527,0,1,0,0,221.7792,1,1,False,True,False,...,False,False,False,False,False,True,False,False,False,False
855,1,3,0,1,9.35,0,0,True,False,False,...,False,False,False,False,False,False,True,False,False,False
801,1,2,1,1,26.25,0,0,True,False,False,...,False,False,False,False,False,False,True,False,False,False


In [31]:
test_set.head()

Unnamed: 0,Survived,Pclass,SibSp,Parch,Fare,MultipleCabin,isAlone,Sex_female,Sex_male,Age_Adult,...,Title_Lady,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
565,0,3,2,0,24.15,0,0,False,True,False,...,0,False,False,False,0,True,False,0,False,0
160,0,3,0,1,16.1,0,0,False,True,True,...,0,False,False,False,0,True,False,0,False,0
553,1,3,0,0,7.225,0,1,False,True,False,...,0,False,False,False,0,True,False,0,False,0
860,0,3,2,0,14.1083,0,0,False,True,True,...,0,False,False,False,0,True,False,0,False,0
241,1,3,1,0,15.5,0,0,True,False,False,...,0,False,False,True,0,False,False,0,False,0


# Evaluation

In [32]:
# Separate the target column from the feature columns in each split.
y_train = train_set['Survived']
X_train = train_set.drop(columns=['Survived'])

y_test = test_set['Survived']
X_test = test_set.drop(columns=['Survived'])

In [33]:
# Oversample the training split only; the test split is left untouched.
X_train, y_train = balance_classes(X_train, y_train)

In [34]:
X_train.head()

Unnamed: 0,Pclass,SibSp,Parch,Fare,MultipleCabin,isAlone,Sex_female,Sex_male,Age_Adult,Age_Child,...,Title_Lady,Title_Major,Title_Master,Title_Miss,Title_Mlle,Title_Mr,Title_Mrs,Title_Ms,Title_Rev,Title_Sir
0,3,0,0,56.4958,0,1,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
1,2,0,0,0.0,0,1,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
2,1,0,0,221.7792,1,1,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
3,3,0,1,9.35,0,0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False
4,2,1,1,26.25,0,0,True,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [36]:
# Train model
# The from-scratch classifier is given plain NumPy arrays (.values), not DataFrames.
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X_train.values, y_train.values)

# Make predictions
# Predictions are made on the test split, which was not oversampled.
y_pred = clf.predict(X_test.values)

In [37]:
# Evaluate the model
# Compares the from-scratch tree's predictions with the true test labels.
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.87      0.84       110
           1       0.77      0.68      0.72        69

    accuracy                           0.80       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.80      0.80      0.80       179



In [38]:
# Compare with sklearn's DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier as SkDecisionTreeClassifier

# Fix random_state so the reference tree is identical on every run: sklearn
# permutes the features at each split, so equally good splits can otherwise be
# resolved differently between runs and change the reported scores.
model = SkDecisionTreeClassifier(max_depth=5, random_state=42)
model.fit(X_train.values, y_train.values)

y_pred_sk = model.predict(X_test.values)

print(classification_report(y_test, y_pred_sk))

              precision    recall  f1-score   support

           0       0.80      0.87      0.83       110
           1       0.76      0.65      0.70        69

    accuracy                           0.79       179
   macro avg       0.78      0.76      0.77       179
weighted avg       0.79      0.79      0.78       179



# External Dataset Evaluation
`UNCOMMENT TO TRY YOUR OWN DATA`

In [39]:
# DATA = "ENTER PATH TO DATA HERE"
# TARGET = "ENTER TARGET COLUMN NAME HERE"
# MAX_DEPTH = 5

# df = pd.read_csv(DATA)
# X = df.drop(TARGET, axis=1)
# y = df[TARGET]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# dt = DecisionTreeClassifier(max_depth=MAX_DEPTH)
# dt.fit(X_train.values, y_train.values)

# y_pred = dt.predict(X_test.values)

# print(classification_report(y_test, y_pred))