In [199]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

In [193]:
data = pd.read_csv('bank_data.csv')

# convert to binary values of housing, loan, default
data['housing'] = data['housing'].map({'yes' : 1, 'no' : 0})
data['default'] = data['default'].map({'yes': 1, 'no' :0})
data['loan'] = data['loan'].map({'yes' :1, 'no' :0})


# encode martial type
data['marital_single'] = data['marital'].apply(lambda x:1 if 'single' in x else 0)
data['marital_married'] = data['marital'].apply(lambda x:1 if 'married' in x else 0)
data['marital_divorced'] = data['marital'].apply(lambda x:1 if 'divorced' in x else 0)

# encode education type
data['education_primary'] = data['education'].apply(lambda x:1 if 'primary' in x else 0 )
data['education_secondary'] = data['education'].apply(lambda x:1 if 'secondary' in x else 0 )
data['education_tertiary'] = data['education'].apply(lambda x:1 if 'tertiary' in x else 0 )
data['education_others'] = data['education'].apply(lambda x:1 if 'others' in x else 0 )

# encode contact type
data['contact_cellular'] = data['contact'].apply(lambda x:1 if 'cellular' in x else 0)
data['contact_telephone'] = data['contact'].apply(lambda x:1 if 'telephone' in x else 0)
data['contact_unknown'] = data['contact'].apply(lambda x:1 if 'unknown' in x else 0)

# one hot encode of job type
def job_type(input):
    if input == 'self-employed' or input == 'entrepreneur' or input == 'unemployed' or input == 'housemaid' or input == 'student' or input == 'unknown' :
        return 'other_job'
    else:
        return input
data['job'] = data['job'].apply(job_type)

data['blue-collar'] = data['job'].apply(lambda x:1 if 'blue-collar' in x else 0)
data['management'] = data['job'].apply(lambda x:1 if 'management' in x else 0)
data['technician'] = data['job'].apply(lambda x:1 if 'technician' in x else 0)
data['admin'] = data['job'].apply(lambda x:1 if 'admin' in x else 0)
data['services'] = data['job'].apply(lambda x:1 if 'services' in x else 0)
data['retired'] = data['job'].apply(lambda x:1 if 'retired' in x else 0)
data['other_job'] = data['job'].apply(lambda x:1 if 'other_job' in x else 0)

# one hot encoding of poutcome
data['poutcome_unknown'] = data['poutcome'].apply(lambda x:1 if 'unknown' in x else 0)
data['poutcome_failure'] = data['poutcome'].apply(lambda x:1 if 'failure' in x else 0)
data['poutcome_other'] = data['poutcome'].apply(lambda x:1 if 'other' in x else 0)
data['poutcome_success'] = data['poutcome'].apply(lambda x:1 if 'success' in x else 0)

# one hot encoding of y values
data['y'] = data['y'].map({'yes' :1, 'no' :0})

# one hot encoding of months
data['jan_month'] = data['month'].apply(lambda x:1 if 'jan' in x else 0)
data['feb_month'] = data['month'].apply(lambda x:1 if 'feb' in x else 0)
data['mar_month'] = data['month'].apply(lambda x:1 if 'mar' in x else 0)
data['apr_month'] = data['month'].apply(lambda x:1 if 'apr' in x else 0)
data['may_month'] = data['month'].apply(lambda x:1 if 'may' in x else 0)
data['jun_month'] = data['month'].apply(lambda x:1 if 'jun' in x else 0)
data['jul_month'] = data['month'].apply(lambda x:1 if 'jul' in x else 0)
data['aug_month'] = data['month'].apply(lambda x:1 if 'aug' in x else 0)
data['sep_month'] = data['month'].apply(lambda x:1 if 'sep' in x else 0)
data['oct_month'] = data['month'].apply(lambda x:1 if 'oct' in x else 0)
data['nov_month'] = data['month'].apply(lambda x:1 if 'nov' in x else 0)
data['dec_month'] = data['month'].apply(lambda x:1 if 'dec' in x else 0)


# remove columns
data = data.drop(columns=['education', 'marital', 'contact', 'job', 'month', 'poutcome'])

# one hot encoding of data set
data = pd.get_dummies(data)




In [212]:
# split the dataset into features and target
X = data.drop(columns=['y'])
y = data['y']

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=40 )

# Hyperparameter grid
param_grid = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Grid Search
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters and evaluation
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", grid_search.best_score_)







Best Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 0.9069835903509303
