In [1]:
#importing necessary libraries

import pandas as pd
import numpy as np

#data preprocessing libraries
from sklearn.preprocessing import LabelEncoder

#data train test split library
from sklearn.model_selection import train_test_split

#ML algorithm libraries
from sklearn.tree import DecisionTreeClassifier

#Model evaluation library
from sklearn.metrics import accuracy_score

In [2]:
#importing data frame
df = pd.read_excel('ktm.xlsx')
df.head()

Unnamed: 0,ID,Age,Gender,Occupation,Phone Type,Current Bike,Relationship,Response
0,1,53,Male,Professional,Average,180 to 220,Complicated,Not purchased
1,2,27,Female,Self Employed,Low End,No Bike,Single,Purchased
2,3,39,Female,Unemployed,Average,180 to 220,Married,Not purchased
3,4,20,Female,Unemployed,High End,No Bike,Married,Not purchased
4,5,29,Male,Student,Average,180 to 220,Complicated,Purchased


In [3]:
#dropping id column which is irrelevant..
df.drop('ID', axis=1, inplace= True)   #inplace = true is used when we want to delete column permanent.

In [4]:
df.head()

Unnamed: 0,Age,Gender,Occupation,Phone Type,Current Bike,Relationship,Response
0,53,Male,Professional,Average,180 to 220,Complicated,Not purchased
1,27,Female,Self Employed,Low End,No Bike,Single,Purchased
2,39,Female,Unemployed,Average,180 to 220,Married,Not purchased
3,20,Female,Unemployed,High End,No Bike,Married,Not purchased
4,29,Male,Student,Average,180 to 220,Complicated,Purchased


# Label encoding the categorical column

In [5]:
le = LabelEncoder()

In [6]:
df['Gender']= le.fit_transform(df['Gender'])

In [7]:
df.head()

Unnamed: 0,Age,Gender,Occupation,Phone Type,Current Bike,Relationship,Response
0,53,1,Professional,Average,180 to 220,Complicated,Not purchased
1,27,0,Self Employed,Low End,No Bike,Single,Purchased
2,39,0,Unemployed,Average,180 to 220,Married,Not purchased
3,20,0,Unemployed,High End,No Bike,Married,Not purchased
4,29,1,Student,Average,180 to 220,Complicated,Purchased


In [8]:
df['Occupation']= le.fit_transform(df['Occupation'])
df['Phone Type']= le.fit_transform(df['Phone Type'])
df['Current Bike']= le.fit_transform(df['Current Bike'])
df['Relationship']= le.fit_transform(df['Relationship'])
df['Response']= le.fit_transform(df['Response'])

In [9]:
df.head()

Unnamed: 0,Age,Gender,Occupation,Phone Type,Current Bike,Relationship,Response
0,53,1,0,0,1,1,0
1,27,0,1,2,4,3,1
2,39,0,3,0,1,2,0
3,20,0,3,1,4,2,0
4,29,1,2,0,1,1,1


In [10]:
#checking null values
df.isnull().sum()

Age             0
Gender          0
Occupation      0
Phone Type      0
Current Bike    0
Relationship    0
Response        0
dtype: int64

In [11]:
df.describe()

Unnamed: 0,Age,Gender,Occupation,Phone Type,Current Bike,Relationship,Response
count,1518.0,1518.0,1518.0,1518.0,1518.0,1518.0,1518.0
mean,32.731225,0.462451,1.639657,1.002635,1.816864,1.633729,0.57444
std,13.258555,0.498752,0.968826,0.670496,1.410267,1.14659,0.494591
min,18.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,22.0,0.0,1.0,1.0,1.0,1.0,0.0
50%,28.0,0.0,2.0,1.0,2.0,2.0,1.0
75%,44.0,1.0,2.0,1.0,3.0,3.0,1.0
max,60.0,1.0,3.0,2.0,4.0,3.0,1.0


In [12]:
#collecting independent features
x = df.drop('Response', axis=1)

In [13]:
x.head()

Unnamed: 0,Age,Gender,Occupation,Phone Type,Current Bike,Relationship
0,53,1,0,0,1,1
1,27,0,1,2,4,3
2,39,0,3,0,1,2
3,20,0,3,1,4,2
4,29,1,2,0,1,1


In [14]:
#collecting dependent features
y = df[['Response']]

In [15]:
y.head()

Unnamed: 0,Response
0,0
1,1
2,0
3,0
4,1


In [16]:
#train-test split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3, random_state=50)

# Model building

In [17]:
dt = DecisionTreeClassifier()

In [18]:
#Always fit the model on training data i.e x_train & y_train

dt.fit(x_train,y_train)

DecisionTreeClassifier()

In [19]:
#testing model performance by testing data
dt_pred = dt.predict(x_test)

# Model evaluation

In [20]:
#testing accuracy
accuracy_score(y_test, dt_pred)

0.7741228070175439

In [21]:
dt_pred1 = dt.predict(x_train)

In [22]:
#training accuracy
accuracy_score(y_train,dt_pred1)

0.995291902071563

In [23]:
#Decision tree is a overfitting model. Its giving good accuracy for the training data and gives poor accuracy for testing data.
#In other terms it overfits the training data and cannot generalize the test data. 
#Generic model when its able to give good accuracy on training and testing data. When diff bw both accuracies is not more than 5%

## Implementing stratified k-fold cross validation

In [24]:
from sklearn.model_selection import StratifiedKFold

In [25]:
stf = StratifiedKFold(n_splits=10)

In [26]:
stf.get_n_splits(X=x, y=y)   #its showing how many splits in x and y

10

In [27]:
dt = DecisionTreeClassifier()

In [28]:
#this loop will give accuracies for each iteration
accuracy = []
for train_index, test_index in stf.split(x,y):
#     print('Train =', train_index, 'Test =', test_index )
    x1_train, x1_test = x.iloc[train_index], x.iloc[test_index] #
    y1_train, y1_test = y.iloc[train_index], y.iloc[test_index]
    dt.fit(x1_train, y1_train)
    pred = dt.predict(x1_test)
    acc = accuracy_score(y1_test, pred)
    accuracy.append(acc)

In [29]:
accuracy  #accuracy for each iteration

[0.5197368421052632,
 0.5197368421052632,
 0.5394736842105263,
 0.6381578947368421,
 0.8421052631578947,
 0.9078947368421053,
 0.8486842105263158,
 0.8618421052631579,
 0.8543046357615894,
 0.8543046357615894]

In [30]:
np.mean(accuracy)

0.7386240850470547

## Hyperparameter tuning

In [31]:
dt_tune = DecisionTreeClassifier()

In [34]:
param_dt = {'criterion':['gini','entropy'], 'max_depth':[1,2,3,4,5,6,7,8,9,10], 'splitter': ['best', 'random']}

In [32]:
from sklearn.model_selection import GridSearchCV

In [35]:
model = GridSearchCV(dt_tune, param_dt)

In [36]:
model.fit(x,y)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'splitter': ['best', 'random']})

In [37]:
model.best_params_

{'criterion': 'entropy', 'max_depth': 1, 'splitter': 'best'}

In [38]:
dt_tune = DecisionTreeClassifier(criterion='entropy', max_depth=1, splitter='best')

In [39]:
dt_tune.fit(x_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=1)

In [40]:
pred = dt_tune.predict(x_test)

In [42]:
accuracy_score(y_test, pred)

0.7828947368421053

## Post hyperparameter tuning