# Machine Learning Model (Decision Tree Classifier)

In [1]:
import os

In [2]:
os.getcwd()

'C:\\Users\\Dell\\Machine Learning Model'

In [3]:
os.chdir('C:\\Users\\Dell')

In [4]:
os.getcwd()

'C:\\Users\\Dell'

In [5]:
import numpy as np
import pandas as pd

In [34]:
titanic=pd.read_csv("Titanic.csv")
titanic=titanic.drop(columns=["PassengerId","Name","Ticket","Cabin","Embarked"])         #Remove Unwanted Columns
titanic.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
0,0,3,male,22.0,1,0,7.25
1,1,1,female,38.0,1,0,71.2833
2,1,3,female,26.0,0,0,7.925
3,1,1,female,35.0,1,0,53.1
4,0,3,male,35.0,0,0,8.05


# Step 1

In [35]:
X=titanic.drop(columns=["Survived"])                                #Feature Variables

In [36]:
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,3,male,22.0,1,0,7.25
1,1,female,38.0,1,0,71.2833
2,3,female,26.0,0,0,7.925
3,1,female,35.0,1,0,53.1
4,3,male,35.0,0,0,8.05


In [37]:
y=titanic["Survived"]                                               #Target Variables

In [38]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

# Rules

In [39]:
X.dtypes

Pclass      int64
Sex        object
Age       float64
SibSp       int64
Parch       int64
Fare      float64
dtype: object

In [40]:
#To convert Sex column from dtype object to int   (Use One Hot Encoding)

In [41]:
X=pd.get_dummies(X,columns=["Sex"],prefix="OH")

In [42]:
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,OH_female,OH_male
0,3,22.0,1,0,7.25,0,1
1,1,38.0,1,0,71.2833,1,0
2,3,26.0,0,0,7.925,1,0
3,1,35.0,1,0,53.1,1,0
4,3,35.0,0,0,8.05,0,1


In [43]:
X.dtypes

Pclass         int64
Age          float64
SibSp          int64
Parch          int64
Fare         float64
OH_female      uint8
OH_male        uint8
dtype: object

In [44]:
type(X)

pandas.core.frame.DataFrame

In [45]:
X.isna().sum()

Pclass         0
Age          177
SibSp          0
Parch          0
Fare           0
OH_female      0
OH_male        0
dtype: int64

In [46]:
#Since Age column has missing values, we need to fill it (Use fillna())

In [47]:
#Since Age column is Continuous Numeric, we need to fill it by the mean of Age column

In [48]:
X.fillna(value=X["Age"].mean(),inplace=True)

In [49]:
X.isna().sum()

Pclass       0
Age          0
SibSp        0
Parch        0
Fare         0
OH_female    0
OH_male      0
dtype: int64

In [50]:
X.shape

(891, 7)

In [52]:
#Since, the scale is of different types we need to convert it to Single type (Use Standardization i.e Standard Scaler)

In [53]:
from sklearn.preprocessing import StandardScaler

In [54]:
SS_scaler=StandardScaler()

In [55]:
X=SS_scaler.fit_transform(X)

In [57]:
X                                     #It has its Mean on 0

array([[ 0.82737724, -0.5924806 ,  0.43279337, ..., -0.50244517,
        -0.73769513,  0.73769513],
       [-1.56610693,  0.63878901,  0.43279337, ...,  0.78684529,
         1.35557354, -1.35557354],
       [ 0.82737724, -0.2846632 , -0.4745452 , ..., -0.48885426,
         1.35557354, -1.35557354],
       ...,
       [ 0.82737724,  0.        ,  0.43279337, ..., -0.17626324,
         1.35557354, -1.35557354],
       [-1.56610693, -0.2846632 , -0.4745452 , ..., -0.04438104,
        -0.73769513,  0.73769513],
       [ 0.82737724,  0.17706291, -0.4745452 , ..., -0.49237783,
        -0.73769513,  0.73769513]])

In [59]:
X=pd.DataFrame(X)

In [60]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.827377,-0.592481,0.432793,-0.473674,-0.502445,-0.737695,0.737695
1,-1.566107,0.638789,0.432793,-0.473674,0.786845,1.355574,-1.355574
2,0.827377,-0.284663,-0.474545,-0.473674,-0.488854,1.355574,-1.355574
3,-1.566107,0.407926,0.432793,-0.473674,0.42073,1.355574,-1.355574
4,0.827377,0.407926,-0.474545,-0.473674,-0.486337,-0.737695,0.737695


# Step 2

In [63]:
from sklearn.model_selection import train_test_split

In [64]:
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,stratify=y)

# Step 3

In [65]:
from sklearn.tree import DecisionTreeClassifier

In [66]:
tree=DecisionTreeClassifier()

In [67]:
tree.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

# Step 4

# Evaluate

In [68]:
tree.score(X_test,y_test)

0.7533632286995515

# Predict

In [69]:
tree.predict(X_test)

array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0], dtype=int64)