In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import metrics

# Bank Marketing Data - A Decision Tree Approach

## Aim:
The aim of this attempt is to predict if the client will subscribe (yes/no) to a term deposit, by building a classification model using Decision Tree.
### Step 1: Load the data
- Load the `bank.csv` data
- Check the first five observations
- Check if there are any null values

In [16]:
# Location of the bank marketing CSV; point this at your own copy if it lives elsewhere
file_path = 'bank.csv'

# Read the whole file into a DataFrame
bank_data = pd.read_csv(file_path)

# Preview the first five observations
print(bank_data.head(5))

# Count the missing values in each column
print(bank_data.isna().sum())


   age         job  marital  education default  balance housing loan  contact  \
0   59      admin.  married  secondary      no     2343     yes   no  unknown   
1   56      admin.  married  secondary      no       45      no   no  unknown   
2   41  technician  married  secondary      no     1270     yes   no  unknown   
3   55    services  married  secondary      no     2476     yes   no  unknown   
4   54      admin.  married   tertiary      no      184      no   no  unknown   

   day month  duration  campaign  pdays  previous poutcome deposit  
0    5   may      1042         1     -1         0  unknown     yes  
1    5   may      1467         1     -1         0  unknown     yes  
2    5   may      1389         1     -1         0  unknown     yes  
3    5   may       579         1     -1         0  unknown     yes  
4    5   may       673         2     -1         0  unknown     yes  
age          0
job          0
marital      0
education    0
default      0
balance      0
housing  

## Summary of data

### Categorical Variables :
**[1] job      :** admin., technician, services, management, retired, blue-collar, unemployed, entrepreneur,
               housemaid, unknown, self-employed, student
<br>**[2] marital  :** married, single, divorced
<br>**[3] education:** secondary, tertiary, primary, unknown
<br>**[4] default  :** yes, no
<br>**[5] housing  :** yes, no
<br>**[6] loan     :** yes, no 
<br>**[7] deposit  :** yes, no **(Dependent Variable)**
<br>**[8] contact  :** unknown, cellular, telephone
<br>**[9] month    :** jan, feb, mar, apr, may, jun, jul, aug, sep, oct, nov, dec
<br>**[10] poutcome:** unknown, other, failure, success

### Numerical Variables:
**[1] age**
<br>**[2] balance**
<br>**[3] day**
<br>**[4] duration**
<br>**[5] campaign**
<br>**[6] pdays**
<br>**[7] previous**

### Step 2: Transformer
- Create a transformer pipeline for numeric and categorical features. Numerical features will be imputed and scaled; categorical features will be imputed and encoded.
- Create a Column transformer

In [17]:
# Column groups: which raw columns are handled as categorical vs. numerical
categorical_features = [
    'job', 'marital', 'education', 'default', 'housing',
    'loan', 'contact', 'month', 'poutcome',
]
numerical_features = [
    'age', 'balance', 'day', 'duration',
    'campaign', 'pdays', 'previous',
]

# Numerical branch: fill gaps with the column mean, then scale
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent category, then one-hot encode.
# handle_unknown='ignore' keeps transform from failing on categories not seen during fit.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])

# `preprocessor` is now ready to be used as the first step of a model pipeline,
# for example in front of a Decision Tree


### Step 3: Classifier
- Create a pipeline for the decision tree classifier as well as the transformer
- Encode the target variable using `LabelEncoder`

In [18]:
# Feature matrix: every column except the 'deposit' label
X = bank_data.drop('deposit', axis=1)

# Target: the 'deposit' column, converted to integer class codes
y = bank_data['deposit']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # 'no' -> 0, 'yes' -> 1

# Full model: the preprocessing transformer followed by a decision tree
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42
)

# Fit the preprocessing and the tree on the training rows only
model_pipeline.fit(X_train, y_train)

# Accuracy on the rows the model was fitted on and on the held-out rows
train_accuracy = model_pipeline.score(X_train, y_train)
test_accuracy = model_pipeline.score(X_test, y_test)

train_accuracy, test_accuracy


(1.0, 0.7847118542848611)

### Step 4: Model
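- Create a pipeline for the decision tree classifier as well as the transformer
- Encode the target variable using `LabelEncoder`
- Split the data into training (70%) and testing (30%) sets
- Train the model and report the training and testing accuracy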

In [19]:
# Final model cell.
# Requires `bank_data` (loaded in Step 1) and `preprocessor` (the ColumnTransformer
# built in Step 2). The transformer is reused rather than re-declared here, so any
# change made to the Step 2 preprocessing is automatically picked up by this model
# instead of being silently overridden by a stale copy.

# Separate features (X) and target (y)
X = bank_data.drop(columns=['deposit'])  # Features (all columns except 'deposit')
y = bank_data['deposit']  # Target variable ('deposit')

# Encode the target variable using LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)  # 'yes' -> 1, 'no' -> 0

# Create a pipeline for the Decision Tree Classifier and Preprocessing steps
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Apply the Step 2 preprocessing to the data
    ('classifier', DecisionTreeClassifier(random_state=42))  # Decision Tree classifier
])

# Split the data into training (70%) and testing (30%) sets; fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# Train the model (preprocessing statistics are learned from the training rows only)
model_pipeline.fit(X_train, y_train)

# Evaluate the model: accuracy on the training rows and on the held-out rows
train_accuracy = model_pipeline.score(X_train, y_train)
test_accuracy = model_pipeline.score(X_test, y_test)

# Print the results
# NOTE: the tree is grown with no depth or leaf-size limit, and the recorded run shows
# training accuracy 1.0 against roughly 0.78 on the test set, i.e. the model overfits.
# Constraining the tree (e.g. max_depth / min_samples_leaf) is a sensible next step.
print("Training Accuracy: ", train_accuracy)
print("Testing Accuracy: ", test_accuracy)


Training Accuracy:  1.0
Testing Accuracy:  0.7847118542848611
