![Screenshot%202021-09-25%20at%2011.50.11.png](attachment:Screenshot%202021-09-25%20at%2011.50.11.png)

![Screenshot%202021-09-25%20at%2011.51.18.png](attachment:Screenshot%202021-09-25%20at%2011.51.18.png)

![Screenshot%202021-09-25%20at%2011.51.31.png](attachment:Screenshot%202021-09-25%20at%2011.51.31.png)

In [3]:
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import seaborn as sns
import re
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_curve,auc


class ExplainatoryDataAnalysis:
    def __init__(self, csv_file):
        self.df = pd.read_csv(csv_file)
    
    def frame(self):
        return self.df
    
    def info(self):
        self.df.info()
        
    def head(self, n):
        return self.df.head(n)
    
    def value_counts(self, columns):
        for col in columns:
            print("Value Counts for ", col)
            print(self.df[col].value_counts())
            print("\n")
            
    def drop(self, columns, inplace=False):
        self.df.drop(columns=columns, inplace=inplace)
        print("Dropped the columns!")
    
    def encode(self, columns, columns_renaming, encode_map):
        index = 0
        for col in columns:
            self.df[columns_renaming[index]] = self.df[col].apply(lambda x: encode_map[x])
            if col!=columns_renaming[index]:
                self.df.drop(columns=[col], inplace=True)
            index = index+1
        print("Encoding done!")
        
    def fill_underscore(self, columns):
        for col in columns:
            self.df[col] = self.df[col].apply(lambda x: re.sub("[\s\-]+", "_", x))
            self.df[col] = self.df[col].apply(lambda x: re.sub("[\(\)]+", "", x))
            self.df[col] = self.df[col].apply(lambda x: re.sub("[\<]+", "lt", x))
            self.df[col] = self.df[col].apply(lambda x: re.sub("[\&]+", "", x))
            self.df[col] = self.df[col].apply(lambda x: re.sub("[\~]+", "_", x))
            print("Space replaced with underscore for column and braces removed", col)
        print("Space replaced with underscore for all columns")
    
    def get_dummies(self, columns, drop_columns):
        category_dummies = pd.get_dummies(self.df[columns])
        self.df.drop(columns=drop_columns, inplace=True)
        self.df = pd.concat([self.df, category_dummies], axis=1)
        print("1. Created dummies for the columns", columns)
        print("2. Dropped columns", drop_columns)
        print("3. Updated data frame with dummy columns")
        return self.head(10)
   
    def corr(self, reference_column, ascending):
        return self.df.corr()[reference_column].sort_values(ascending=ascending)
    
    def select(self, columns):
        return self.df[columns]
    
    def heatmap(self, columns):
        df_selected = pd.DataFrame()
        if len(columns)>0:
            df_selected = self.df[columns]
        else:
            df_selected = self.df
            
        fig, ax = plt.subplots(figsize=(25,10))
        sns.heatmap(df_selected.corr(), cmap="YlGnBu", annot=False, fmt = '.4g', cbar=False, ax=ax)
        

In [4]:
edai = ExplainatoryDataAnalysis("./datasets/train.csv")

In [5]:
edai.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10147 entries, 0 to 10146
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   customer_id           10147 non-null  int64 
 1   destination           10147 non-null  object
 2   passanger             10147 non-null  object
 3   weather               10147 non-null  object
 4   temperature           10147 non-null  int64 
 5   time                  10147 non-null  object
 6   coupon                10147 non-null  object
 7   expiration            10147 non-null  object
 8   gender                10147 non-null  object
 9   age                   10147 non-null  object
 10  maritalStatus         10147 non-null  object
 11  has_children          10147 non-null  int64 
 12  education             10147 non-null  object
 13  occupation            10147 non-null  object
 14  income                10147 non-null  object
 15  car                   84 non-null   

In [6]:
edai.frame().describe()

Unnamed: 0,customer_id,temperature,has_children,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
count,10147.0,10147.0,10147.0,10147.0,10147.0,10147.0,10147.0,10147.0,10147.0
mean,311272.276831,63.172366,0.412634,1.0,0.563024,0.119838,0.212181,0.787819,0.568444
std,106781.701016,19.232595,0.492332,0.0,0.496037,0.324788,0.408872,0.408872,0.495318
min,123472.0,30.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,221439.0,55.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
50%,310062.0,80.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
75%,401537.0,80.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0
max,499988.0,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
edai.head(12)

Unnamed: 0,customer_id,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,258868,No Urgent Place,Friend(s),Sunny,80,6PM,Restaurant(<20),1d,Male,21,...,1~3,4~8,4~8,never,1,1,0,0,1,1
1,318369,Work,Alone,Sunny,80,7AM,Restaurant(<20),2h,Male,21,...,1~3,4~8,1~3,less1,1,0,0,1,0,0
2,320906,No Urgent Place,Alone,Sunny,80,10AM,Coffee House,2h,Female,21,...,gt8,4~8,1~3,1~3,1,1,0,0,1,0
3,412393,Work,Alone,Rainy,55,7AM,Restaurant(<20),2h,Female,26,...,less1,4~8,1~3,never,1,1,1,0,1,0
4,290854,Home,Alone,Snowy,30,6PM,Coffee House,1d,Male,31,...,less1,4~8,less1,never,1,1,0,0,1,0
5,438743,Work,Alone,Sunny,80,7AM,Restaurant(20-50),1d,Female,50plus,...,1~3,4~8,4~8,1~3,1,1,0,0,1,1
6,152741,Home,Alone,Sunny,80,6PM,Coffee House,1d,Female,50plus,...,never,4~8,1~3,never,1,0,0,1,0,0
7,268067,No Urgent Place,Alone,Sunny,55,6PM,Coffee House,1d,Female,21,...,1~3,1~3,1~3,less1,1,0,0,0,1,1
8,420352,Home,Alone,Sunny,80,6PM,Restaurant(<20),1d,Male,26,...,1~3,1~3,1~3,1~3,1,0,0,1,0,1
9,425469,No Urgent Place,Friend(s),Sunny,80,10PM,Bar,1d,Female,36,...,less1,1~3,less1,less1,1,1,0,0,1,1


In [8]:
edai.frame().columns

Index(['customer_id', 'destination', 'passanger', 'weather', 'temperature',
       'time', 'coupon', 'expiration', 'gender', 'age', 'maritalStatus',
       'has_children', 'education', 'occupation', 'income', 'car', 'Bar',
       'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50',
       'toCoupon_GEQ5min', 'toCoupon_GEQ15min', 'toCoupon_GEQ25min',
       'direction_same', 'direction_opp', 'Y'],
      dtype='object')

In [9]:
edai.value_counts(columns=['destination', 'passanger', 'weather', 'temperature',
       'time', 'coupon', 'expiration', 'gender', 'age', 'maritalStatus',
       'has_children', 'education', 'occupation', 'income', 'car', 'Bar',
       'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50',
       'toCoupon_GEQ5min', 'toCoupon_GEQ15min', 'toCoupon_GEQ25min',
       'direction_same', 'direction_opp'])

Value Counts for  destination
No Urgent Place    5045
Home               2572
Work               2530
Name: destination, dtype: int64


Value Counts for  passanger
Alone        5802
Friend(s)    2676
Partner       855
Kid(s)        814
Name: passanger, dtype: int64


Value Counts for  weather
Sunny    8015
Snowy    1142
Rainy     990
Name: weather, dtype: int64


Value Counts for  temperature
80    5203
55    3058
30    1886
Name: temperature, dtype: int64


Value Counts for  time
6PM     2576
7AM     2530
10AM    1833
2PM     1616
10PM    1592
Name: time, dtype: int64


Value Counts for  coupon
Coffee House             3191
Restaurant(<20)          2233
Carry out & Take away    1923
Bar                      1623
Restaurant(20-50)        1177
Name: coupon, dtype: int64


Value Counts for  expiration
1d    5643
2h    4504
Name: expiration, dtype: int64


Value Counts for  gender
Female    5204
Male      4943
Name: gender, dtype: int64


Value Counts for  age
21         2133
26         2

#### Based on the above analysis we can conclude floowing points:

- Total 10147 entries available.

- There are only 8 numerical features customer_id, temperature, has_children, toCoupon_GEQ5min, toCoupon_GEQ15min, toCoupon_GEQ25min, direction_same, direction_opp, Y

- customer_id, toCoupon_GEQ5min can be removed as they all have same values

- Following columns need missing values treatments:
    - car (84 non-null object), 
    - Bar (10059 non-null object), 
    - CoffeeHouse (9975 non-null object), 
    - CarryAway (10025 non-null object), 
    - RestaurantLessThan20 (10050 non-null object), 
    - Restaurant20To50 (9999 non-null object)
    
- Following approach to be taken to fill the missing values:
    - car (84 non-null object): Can be removed, mostly null vaues  
    - Bar (10059 non-null object): filling with the ffill  
    - CoffeeHouse (9975 non-null object): filling with ffill 
    - CarryAway (10025 non-null object): filling with ffill 
    - RestaurantLessThan20 (10050 non-null object): filling with ffill  
    - Restaurant20To50 (9999 non-null object): filling with ffill

In [10]:
edai.drop(columns=["customer_id", "toCoupon_GEQ5min", "car"], inplace=True)

Dropped the columns!


In [11]:
df = edai.frame()

In [12]:
df["Bar"].fillna(method='ffill',inplace=True)
df["CoffeeHouse"].fillna(method='ffill',inplace=True)
df["CarryAway"].fillna(method='ffill',inplace=True)
df["RestaurantLessThan20"].fillna(method='ffill',inplace=True)
df["Restaurant20To50"].fillna(method='ffill',inplace=True)

In [13]:
edai.value_counts(columns=['destination', 'passanger', 'weather', 'temperature',
       'time', 'coupon', 'expiration', 'gender', 'age', 'maritalStatus',
       'has_children', 'education', 'occupation', 'income', 'Bar',
       'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50', 'toCoupon_GEQ15min', 'toCoupon_GEQ25min',
       'direction_same', 'direction_opp'])

Value Counts for  destination
No Urgent Place    5045
Home               2572
Work               2530
Name: destination, dtype: int64


Value Counts for  passanger
Alone        5802
Friend(s)    2676
Partner       855
Kid(s)        814
Name: passanger, dtype: int64


Value Counts for  weather
Sunny    8015
Snowy    1142
Rainy     990
Name: weather, dtype: int64


Value Counts for  temperature
80    5203
55    3058
30    1886
Name: temperature, dtype: int64


Value Counts for  time
6PM     2576
7AM     2530
10AM    1833
2PM     1616
10PM    1592
Name: time, dtype: int64


Value Counts for  coupon
Coffee House             3191
Restaurant(<20)          2233
Carry out & Take away    1923
Bar                      1623
Restaurant(20-50)        1177
Name: coupon, dtype: int64


Value Counts for  expiration
1d    5643
2h    4504
Name: expiration, dtype: int64


Value Counts for  gender
Female    5204
Male      4943
Name: gender, dtype: int64


Value Counts for  age
21         2133
26         2

- destination, passanger, coupon, occupation, education, maritalStatus, Bar, CoffeeHouse, CarryAway, RestaurantLessThan20, Restaurant20To50 need cleaning

In [14]:
edai.fill_underscore(columns=["destination", "passanger", "coupon", "occupation", "education", "maritalStatus", "Bar", "CoffeeHouse", "CarryAway", "RestaurantLessThan20", "Restaurant20To50"])

Space replaced with underscore for column and braces removed destination
Space replaced with underscore for column and braces removed passanger
Space replaced with underscore for column and braces removed coupon
Space replaced with underscore for column and braces removed occupation
Space replaced with underscore for column and braces removed education
Space replaced with underscore for column and braces removed maritalStatus
Space replaced with underscore for column and braces removed Bar
Space replaced with underscore for column and braces removed CoffeeHouse
Space replaced with underscore for column and braces removed CarryAway
Space replaced with underscore for column and braces removed RestaurantLessThan20
Space replaced with underscore for column and braces removed Restaurant20To50
Space replaced with underscore for all columns


In [15]:
edai.value_counts(columns=['destination', 'passanger', 'weather', 'temperature',
       'time', 'coupon', 'expiration', 'gender', 'age', 'maritalStatus',
       'has_children', 'education', 'occupation', 'income', 'Bar',
       'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50',
       'toCoupon_GEQ15min', 'toCoupon_GEQ25min',
       'direction_same', 'direction_opp'])

Value Counts for  destination
No_Urgent_Place    5045
Home               2572
Work               2530
Name: destination, dtype: int64


Value Counts for  passanger
Alone      5802
Friends    2676
Partner     855
Kids        814
Name: passanger, dtype: int64


Value Counts for  weather
Sunny    8015
Snowy    1142
Rainy     990
Name: weather, dtype: int64


Value Counts for  temperature
80    5203
55    3058
30    1886
Name: temperature, dtype: int64


Value Counts for  time
6PM     2576
7AM     2530
10AM    1833
2PM     1616
10PM    1592
Name: time, dtype: int64


Value Counts for  coupon
Coffee_House            3191
Restaurantlt20          2233
Carry_out__Take_away    1923
Bar                     1623
Restaurant20_50         1177
Name: coupon, dtype: int64


Value Counts for  expiration
1d    5643
2h    4504
Name: expiration, dtype: int64


Value Counts for  gender
Female    5204
Male      4943
Name: gender, dtype: int64


Value Counts for  age
21         2133
26         2033
31       

- time 6PM to 24 hours
- expiration 1d, 2h to hours format
- age 50plus, below21 to 51 and 20
- income, remove \$, min_income and max_income.
- income, or more should have 0 in max_income
- income, Less than should be 0 in min_income

In [18]:
def hours24(x):
    regexampm = "[0-9]+([AM|PM|am|pm]+)"
    regexhour = "([0-9]+)[AM|PM|am|pm]+"
    ampm = re.findall(regexampm, x)[0]
    hour = int(re.findall(regexhour, x)[0])
    
    if ampm=='PM':
        hour = int(hour)+12
    return hour

In [19]:
df["time"] = df["time"].apply(hours24)

In [20]:
df["time"].value_counts()

18    2576
7     2530
10    1833
14    1616
22    1592
Name: time, dtype: int64

In [21]:
toHours = {'1d':24,'2h':2}
df["expiration"] = df["expiration"].apply(lambda x: toHours[x])

In [22]:
df["expiration"].value_counts()

24    5643
2     4504
Name: expiration, dtype: int64

In [23]:
def agemanipulation(age):
    correctage = 0
    if age == '50plus':
        correctage = 51
    elif age == 'below21':
        correctage = 20
    else:
        correctage = int(age)
    return correctage

In [24]:
df["age"] = df["age"].apply(agemanipulation)

In [25]:
df["age"].value_counts()

21    2133
26    2033
31    1636
51    1431
36    1065
41     879
46     538
20     432
Name: age, dtype: int64

In [26]:
def income_manipulation(x, income_end):
    income = 0
    if x == '$100000 or More':
        x = '$100000 - $0'
    if x =='Less than $12500':
        x = '$0 - $12500'
    incomes = x.split('-')
    incomes[0] = int(re.sub("[\$\s]+", "", incomes[0]))
    incomes[1] = int(re.sub("[\$\s]+", "", incomes[1]))
    if income_end=='min':
        income = incomes[0]
    else:
        income = incomes[1]
    return income

In [27]:
df["income_min"] = df["income"].apply(income_manipulation, income_end='min')
df["income_max"] = df["income"].apply(income_manipulation, income_end='max')

In [28]:
df.drop(columns=["income"], inplace=True)

In [29]:
edai.value_counts(columns=['destination', 'passanger', 'weather', 'temperature',
       'time', 'coupon', 'expiration', 'gender', 'age', 'maritalStatus',
       'has_children', 'education', 'occupation', 'income_min', 'income_max', 'Bar',
       'CoffeeHouse', 'CarryAway', 'RestaurantLessThan20', 'Restaurant20To50',
       'toCoupon_GEQ15min', 'toCoupon_GEQ25min',
       'direction_same', 'direction_opp'])

Value Counts for  destination
No_Urgent_Place    5045
Home               2572
Work               2530
Name: destination, dtype: int64


Value Counts for  passanger
Alone      5802
Friends    2676
Partner     855
Kids        814
Name: passanger, dtype: int64


Value Counts for  weather
Sunny    8015
Snowy    1142
Rainy     990
Name: weather, dtype: int64


Value Counts for  temperature
80    5203
55    3058
30    1886
Name: temperature, dtype: int64


Value Counts for  time
18    2576
7     2530
10    1833
14    1616
22    1592
Name: time, dtype: int64


Value Counts for  coupon
Coffee_House            3191
Restaurantlt20          2233
Carry_out__Take_away    1923
Bar                     1623
Restaurant20_50         1177
Name: coupon, dtype: int64


Value Counts for  expiration
24    5643
2     4504
Name: expiration, dtype: int64


Value Counts for  gender
Female    5204
Male      4943
Name: gender, dtype: int64


Value Counts for  age
21    2133
26    2033
31    1636
51    1431
36    1

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10147 entries, 0 to 10146
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   destination           10147 non-null  object
 1   passanger             10147 non-null  object
 2   weather               10147 non-null  object
 3   temperature           10147 non-null  int64 
 4   time                  10147 non-null  int64 
 5   coupon                10147 non-null  object
 6   expiration            10147 non-null  int64 
 7   gender                10147 non-null  object
 8   age                   10147 non-null  int64 
 9   maritalStatus         10147 non-null  object
 10  has_children          10147 non-null  int64 
 11  education             10147 non-null  object
 12  occupation            10147 non-null  object
 13  Bar                   10147 non-null  object
 14  CoffeeHouse           10147 non-null  object
 15  CarryAway             10147 non-null

- features eligible for dummies are destination, passanger, weather, coupon, gender, maritalStatus, education,
occupation, Bar, CoffeeHouse, CarryAway, RestaurantLessThan20, Restaurant20To50, 

- in this we still will remove Female after encoding

In [31]:
dummies = pd.get_dummies(df[["destination", "passanger", "weather", "coupon", "gender", "maritalStatus",
                            "education", "occupation", "Bar", "CoffeeHouse", "CarryAway", "RestaurantLessThan20", "Restaurant20To50"]])

In [32]:
dummies.drop(columns=["gender_Female"], inplace=True)

In [33]:
dummies.columns

Index(['destination_Home', 'destination_No_Urgent_Place', 'destination_Work',
       'passanger_Alone', 'passanger_Friends', 'passanger_Kids',
       'passanger_Partner', 'weather_Rainy', 'weather_Snowy', 'weather_Sunny',
       'coupon_Bar', 'coupon_Carry_out__Take_away', 'coupon_Coffee_House',
       'coupon_Restaurant20_50', 'coupon_Restaurantlt20', 'gender_Male',
       'maritalStatus_Divorced', 'maritalStatus_Married_partner',
       'maritalStatus_Single', 'maritalStatus_Unmarried_partner',
       'maritalStatus_Widowed', 'education_Associates_degree',
       'education_Bachelors_degree',
       'education_Graduate_degree_Masters_or_Doctorate',
       'education_High_School_Graduate', 'education_Some_High_School',
       'education_Some_college_no_degree',
       'occupation_Architecture__Engineering',
       'occupation_Arts_Design_Entertainment_Sports__Media',
       'occupation_Building__Grounds_Cleaning__Maintenance',
       'occupation_Business__Financial',
       'occupatio

In [34]:
df.drop(columns=["destination", "passanger", "weather", "coupon", "gender", "maritalStatus",
                            "education", "occupation", "Bar", "CoffeeHouse", "CarryAway", "RestaurantLessThan20", "Restaurant20To50"], inplace=True)

In [35]:
df

Unnamed: 0,temperature,time,expiration,age,has_children,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y,income_min,income_max
0,80,18,24,21,0,1,0,0,1,1,0,12500
1,80,7,2,21,0,0,0,1,0,0,37500,49999
2,80,10,2,21,0,1,0,0,1,0,0,12500
3,55,7,2,26,0,1,1,0,1,0,25000,37499
4,30,18,24,31,0,1,0,0,1,0,12500,24999
...,...,...,...,...,...,...,...,...,...,...,...,...
10142,80,18,2,20,0,1,0,1,0,0,0,12500
10143,80,18,24,26,0,0,0,1,0,1,37500,49999
10144,80,10,2,21,0,0,0,0,1,1,25000,37499
10145,30,22,2,46,1,0,0,0,1,0,100000,0


In [36]:
df = pd.concat([df, dummies], axis=1)

In [37]:
df.columns

Index(['temperature', 'time', 'expiration', 'age', 'has_children',
       'toCoupon_GEQ15min', 'toCoupon_GEQ25min', 'direction_same',
       'direction_opp', 'Y', 'income_min', 'income_max', 'destination_Home',
       'destination_No_Urgent_Place', 'destination_Work', 'passanger_Alone',
       'passanger_Friends', 'passanger_Kids', 'passanger_Partner',
       'weather_Rainy', 'weather_Snowy', 'weather_Sunny', 'coupon_Bar',
       'coupon_Carry_out__Take_away', 'coupon_Coffee_House',
       'coupon_Restaurant20_50', 'coupon_Restaurantlt20', 'gender_Male',
       'maritalStatus_Divorced', 'maritalStatus_Married_partner',
       'maritalStatus_Single', 'maritalStatus_Unmarried_partner',
       'maritalStatus_Widowed', 'education_Associates_degree',
       'education_Bachelors_degree',
       'education_Graduate_degree_Masters_or_Doctorate',
       'education_High_School_Graduate', 'education_Some_High_School',
       'education_Some_college_no_degree',
       'occupation_Architecture__Eng

In [41]:
cor = df.corr()["Y"].sort_values(ascending=False)

In [57]:
cor.tail(20)

Restaurant20To50_never          -0.035691
passanger_Kids                  -0.039346
maritalStatus_Married_partner   -0.040028
has_children                    -0.040851
occupation_Retired              -0.041164
income_max                      -0.045708
age                             -0.051105
CarryAway_less1                 -0.052814
weather_Snowy                   -0.064951
weather_Rainy                   -0.065554
Bar_never                       -0.067526
destination_Home                -0.074576
destination_Work                -0.076882
toCoupon_GEQ15min               -0.081642
coupon_Coffee_House             -0.091667
coupon_Restaurant20_50          -0.095728
toCoupon_GEQ25min               -0.098778
passanger_Alone                 -0.100172
CoffeeHouse_never               -0.127925
coupon_Bar                      -0.140365
Name: Y, dtype: float64

In [58]:
class Modals:
    def __init__(self, dataframe):
        self.df = dataframe
        
    def train_test_split(self, target_column, test_size, random_state):
        self.X = self.df.drop(columns=[target_column]).copy()
        self.y = self.df[target_column]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size=test_size, random_state=random_state)
        
    def get_best_parameters_dt(self, params_grid):
        clf = DecisionTreeClassifier(random_state=80)
        cv_model = GridSearchCV(clf, param_grid = params_grid)
        cv_model.fit(self.X_train, self.y_train)
        print(cv_model.best_params_)
        self.dtree_best_params = cv_model.best_params_
        return cv_model.best_params_
        
    def dtree(self, depths):
        ascore = []
        dectreearr = []
        for depth in range(1, depths):
            dectree = DecisionTreeClassifier(max_depth=depth, criterion='gini')
            dectree_train = dectree.fit(self.X_train, self.y_train)
            y_pred = dectree_train.predict(self.X_test)
            acc_score = accuracy_score(self.y_test,y_pred)*100
            ascore.append(acc_score)
            dectreearr.append(dectree_train)
            print("#", depth, "Decision tree accuracy Score=",accuracy_score(self.y_test,y_pred)*100)
        self.dtree_scores = {'ascore':ascore, 'dectree':dectreearr}
        return {'ascore':ascore, 'dectree':dectreearr}
    
    def scale(self, arr):
        arr_scaled = np.array(arr)-np.array(arr).min()
        return arr_scaled
    
    def bar(self, arr) :
        fig, ax = plt.subplots(figsize=(15,10))
        ax.bar(range(1,len(arr)+1), arr)
        return arr
    
    def decision_tree_classification(self, max_depth, criterion):
        dtc = DecisionTreeClassifier(max_depth=max_depth, criterion=criterion)
        dtc = dtc.fit(self.X_train, self.y_train)
        self.y_pred = dtc.predict(self.X_test)
        self.accurecy_score_pt = accuracy_score(self.y_test,self.y_pred)*100
        self.cm = confusion_matrix(self.y_test,self.y_pred)
        self.probability = dtc.predict_proba(self.X_test)
        self.positive_probability = self.probability[:,1]
        print("Accurecy Score is:", self.accurecy_score_pt)
        print("Following is the Confusion Matrix.")
        print("-------------------------------------")
        fig, ax = plt.subplots(figsize=(5,3))
        sns.heatmap(self.cm, cmap="YlGnBu", annot=True, fmt = '.4g', cbar=False, ax=ax)
    
    def measurements(self):
        TN = self.cm[0,0] #Actually 0, but predicted as 0: TN
        FP = self.cm[0,1] #Actually 0, but predicted as 1 : FP
        FN = self.cm[1,0] #Actually 1, but predicted as 0 : FN
        TP = self.cm[1,1] #Actually 1, but predicted as 1 : TP
        N = len(self.y_test)
        
        accuracy = (TN+TP)/N
        recall = TP/(TP+FN)
        specificity = TN/(TN+FP)
        fpr = 1-specificity
        precision = TP/(TP+FP)
        f1_score_value = f1_score(self.y_test, self.y_pred)
        frame_contents = {"Measurement":["Accuracy", "Recall or TPR or Sensitivity", "Specificity", "FPR or 1-Specificity", "Precision", "F1 Score"], "Values": [accuracy, recall, specificity, fpr, precision, f1_score_value]}
        df_measurements = pd.DataFrame(frame_contents)
        return {"measurements":df_measurements}
        

In [60]:
mod = Modals(edai.frame())

In [62]:
mod.train_test_split(target_column="Y", test_size=0.2, random_state=25)

In [63]:
mod.get_best_parameters_dt(params_grid = {'max_depth':list(range(1,500)), 'criterion':['gini', 'entropy']})

{'criterion': 'gini', 'max_depth': 4}


{'criterion': 'gini', 'max_depth': 4}

In [64]:
mod.dtree(depths=20)

# 1 Decision tree accuracy Score= 57.04433497536946
# 2 Decision tree accuracy Score= 58.522167487684726
# 3 Decision tree accuracy Score= 61.62561576354679
# 4 Decision tree accuracy Score= 63.05418719211823
# 5 Decision tree accuracy Score= 62.70935960591133
# 6 Decision tree accuracy Score= 61.92118226600986
# 7 Decision tree accuracy Score= 62.758620689655174
# 8 Decision tree accuracy Score= 62.5615763546798
# 9 Decision tree accuracy Score= 62.51231527093596
# 10 Decision tree accuracy Score= 60.78817733990147
# 11 Decision tree accuracy Score= 58.91625615763547
# 12 Decision tree accuracy Score= 57.83251231527093
# 13 Decision tree accuracy Score= 57.931034482758626
# 14 Decision tree accuracy Score= 57.33990147783251
# 15 Decision tree accuracy Score= 57.487684729064036
# 16 Decision tree accuracy Score= 57.24137931034483
# 17 Decision tree accuracy Score= 57.19211822660098
# 18 Decision tree accuracy Score= 57.0935960591133
# 19 Decision tree accuracy Score= 56.79802955665024


{'ascore': [57.04433497536946,
  58.522167487684726,
  61.62561576354679,
  63.05418719211823,
  62.70935960591133,
  61.92118226600986,
  62.758620689655174,
  62.5615763546798,
  62.51231527093596,
  60.78817733990147,
  58.91625615763547,
  57.83251231527093,
  57.931034482758626,
  57.33990147783251,
  57.487684729064036,
  57.24137931034483,
  57.19211822660098,
  57.0935960591133,
  56.79802955665024],
 'dectree': [DecisionTreeClassifier(max_depth=1),
  DecisionTreeClassifier(max_depth=2),
  DecisionTreeClassifier(max_depth=3),
  DecisionTreeClassifier(max_depth=4),
  DecisionTreeClassifier(max_depth=5),
  DecisionTreeClassifier(max_depth=6),
  DecisionTreeClassifier(max_depth=7),
  DecisionTreeClassifier(max_depth=8),
  DecisionTreeClassifier(max_depth=9),
  DecisionTreeClassifier(max_depth=10),
  DecisionTreeClassifier(max_depth=11),
  DecisionTreeClassifier(max_depth=12),
  DecisionTreeClassifier(max_depth=13),
  DecisionTreeClassifier(max_depth=14),
  DecisionTreeClassifier(m