In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics
%matplotlib inline

# Bank Marketing Data -- A Decision Tree Approach

## Goal
The goal is to predict if the client will subscribe(yes/no) to a term deposit by building a classification model using Decision Tree

In [6]:
from IPython.display import display, HTML

def show_examples_of_data(dataframe, data_information, catgeogry_cutoff):
    '''
       purpose To show the data to provide the data scientist an understand of the data
       
       input:
          dataframe          The data frame that contains the dataset
          data_information   Information about the categorical, missing values etc..
          descriptons        A series that contains a description for each filename
    '''
    
    data_dictionary = pd.DataFrame(columns=["Field", "Value"])
    
    for index, row in data_information.iterrows():
        
         values = ""
         column_cnt = data_information.loc[index, '# of Categories']
         if column_cnt <= catgeogry_cutoff:
                value = dataframe[index].unique()
         else: 
                value = str(dataframe[index].min()) + " to "  + str(dataframe[index].max())             
         row_data = []
         row_data.append(index)
         #row_data.append(description.loc[index])
         row_data.append(value)
    
         data_dictionary.loc[len(data_dictionary.index)] = row_data
                
    return data_dictionary


In [7]:
def info_about_columns(dataframe, data_science_descriptions):
    '''
        A reusable function that will create a dataframe to contain in another 
        dataframe the following : dataypes, Number of Unique Categories, Categories 
        per sample and the type of variable missing values and missing values %
        
        input : A dataframe where data and categories will be retrieved
        series : The data science explamation for each data type
    '''
        
    if data_science_descriptions == None:
        data_science_descriptions = dataframe.copy().dtypes
        data_science_descriptions = \
            data_science_descriptions.replace(data_science_descriptions.to_list(), "NA")
                 
    dataframe_info_about_columns = pd.concat([
          dataframe.dtypes, 
          dataframe.nunique(), 
          round(dataframe.nunique()*100/len(dataframe)),
          data_science_descriptions,
          dataframe.isna().sum(),
          dataframe.isna().sum() * 100 / len(dataframe)], axis=1)
    

    dataframe_info_about_columns.columns=[
                                     'DataType', 
                                     '# of Categories', 
                                     'categories/sample ratio', 
                                     'Data Science Type',
                                     'missing values',
                                     'missing values %']
    
    return dataframe_info_about_columns

## Step 1 Load the Data
> * Load `bank.csv' data
> * Check the first five observations
> * Check if there are any null values

In [8]:
bank_data_df = pd.read_csv("bank.csv")
bank_data_df.head(5)

In [9]:
# Always display the shape
print(f"Number of rows {bank_data_df.shape[0]} \
        Number of cols {bank_data_df.shape[1]}")

Number of rows 11162         Number of cols 17


In [11]:
# Check for Null Values
print("Number of missing elements is ", bank_data_df.isna().sum().sum())

Number of missing elements is  0


In [12]:
info = info_about_columns(bank_data_df, None)
info

Unnamed: 0,DataType,# of Categories,categories/sample ratio,Data Science Type,missing values,missing values %
age,int64,76,1.0,,0,0.0
job,object,12,0.0,,0,0.0
marital,object,3,0.0,,0,0.0
education,object,4,0.0,,0,0.0
default,object,2,0.0,,0,0.0
balance,int64,3805,34.0,,0,0.0
housing,object,2,0.0,,0,0.0
loan,object,2,0.0,,0,0.0
contact,object,3,0.0,,0,0.0
day,int64,31,0.0,,0,0.0


In [13]:
display(HTML(show_examples_of_data(bank_data_df, info, 100).to_html()))

Unnamed: 0,Field,Value
0,age,"[59, 56, 41, 55, 54, 42, 60, 37, 28, 38, 30, 29, 46, 31, 35, 32, 49, 43, 26, 40, 33, 23, 48, 45, 36, 52, 53, 39, 57, 51, 44, 24, 50, 27, 34, 47, 25, 58, 61, 68, 75, 22, 69, 66, 85, 72, 90, 67, 71, 21, 74, 65, 62, 83, 70, 76, 77, 19, 73, 63, 20, 78, 95, 64, 79, 82, 18, 86, 84, 87, 92, 81, 80, 93, 88, 89]"
1,job,"[admin., technician, services, management, retired, blue-collar, unemployed, entrepreneur, housemaid, unknown, self-employed, student]"
2,marital,"[married, single, divorced]"
3,education,"[secondary, tertiary, primary, unknown]"
4,default,"[no, yes]"
5,balance,-6847 to 81204
6,housing,"[yes, no]"
7,loan,"[no, yes]"
8,contact,"[unknown, cellular, telephone]"
9,day,"[5, 6, 7, 8, 9, 12, 13, 14, 15, 16, 19, 20, 21, 23, 26, 27, 28, 29, 30, 2, 3, 4, 11, 17, 18, 24, 1, 10, 22, 25, 31]"


Step 2: Transfromer<br>
> * Create a trasnformer pipeline for numeric and categorical features.<br>
> * numerical features will be imputed and scaled.<br>
> * Categorical features will be imputed and encoded<br>
Create a Column transformer

In [20]:
# Create a Numerical Transformer
numerical_transformer = Pipeline(steps=[ 
    ('imputer', SimpleImputer(strategy='median')), 
    ('scaler', StandardScaler()) ])
numerical_transformer

In [21]:
# Create a Categorical Transformer
categorical_transformer = Pipeline(steps =[
    ('imputer', SimpleImputer(strategy='constant', fill_value="unknown")),
    ('onehsot', OneHotEncoder(handle_unknown='ignore'))
])
categorical_transformer

In [22]:
numerical_only =  ['age', 'balance', 'day', 'duration', 'pdays', 'previous']
categorical_only = [ 'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
# Create a ColumnTransformer
transformer = ColumnTransformer(
    transformers=[
        ('numericalDataProcessing', numerical_transformer, numerical_only),
        ('categoricalDataProcessing', categorical_transformer, categorical_only)
    ]
)

## Step 3: Classifier
> * Create a pipeline for the decision tree classifier as well as the transformer.<br>  
> * Encode the target variable using LabelEncoder


In [39]:
# Create a Numerical Transformer
ml_pipeline = Pipeline(steps=[ 
    ('preprocesing', transformer), 
    ('ml', DecisionTreeClassifier(random_state=1, max_depth=2)) 
])
ml_pipeline

### Step 4: Model
> * Create a pipeline for the decision tree classifier as well as the transformer
> *  Encode the target variable using `LabelEncoder`

In [40]:
transformer = LabelEncoder()
bank_data_df['deposit_category'] = \
    transformer.fit_transform(bank_data_df.deposit)
bank_data_df.drop('deposit', axis=1)

X = bank_data_df.drop('deposit_category', axis=1)
y = bank_data_df['deposit_category']
(Xtrain,Xtest,ytrain,ytest) = train_test_split(X,y)
print("Shape of Xtrain, ytrain", Xtrain.shape, ytrain.shape)
print("Shape of Xtest, ytest", Xtest.shape, ytest.shape)



Shape of Xtrain, ytrain (8371, 17) (8371,)
Shape of Xtest, ytest (2791, 17) (2791,)


In [38]:
# Run the workflow
ml_pipeline.fit(Xtrain,ytrain)

TypeError: fit_transform() takes 2 positional arguments but 3 were given