In [1]:
# Import the data
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from ml_utils import train_test_split_marketing,\
    fill_missing,\
    build_encoders,\
    encode_categorical,\
    build_target_encoder,\
    encode_target

# Load the bank marketing dataset directly from its hosted CSV file
df = pd.read_csv(
    'https://static.bc-edx.com/ai/ail-v-1-0/m14/datasets/bank_marketing.csv'
)
# Preview the first five rows
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,83,retired,divorced,primary,no,0,no,no,telephone,31,may,664,1,77.0,3,success,no
1,32,technician,married,secondary,no,1242,yes,no,,2,jun,183,3,,0,,no
2,38,blue-collar,single,secondary,no,68,no,no,,5,jun,90,2,,0,,no
3,30,services,single,secondary,no,677,yes,no,cellular,21,nov,108,1,,0,,no
4,66,retired,married,primary,no,2173,no,no,cellular,15,jul,178,1,181.0,5,failure,no


In [2]:
# Split the data into training and testing sets
# The split is delegated to the project helper imported from ml_utils; its
# split ratio and seeding are not visible in this notebook.
X_train, X_test, y_train, y_test = train_test_split_marketing(df)
# Summarize the training features. In the count row, pdays has far fewer
# non-null entries than the other columns, i.e. it contains missing values.
X_train.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,25431.0,25431.0,25431.0,25431.0,25431.0,4655.0,25431.0
mean,40.913806,1353.540128,15.836381,257.774684,2.780937,226.843824,0.581534
std,10.61723,3018.184488,8.345164,258.556998,3.115439,117.716936,1.928374
min,18.0,-8019.0,1.0,0.0,1.0,1.0,0.0
25%,33.0,71.0,8.0,103.0,1.0,133.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,196.0,0.0
75%,48.0,1403.0,21.0,318.0,3.0,330.5,0.0
max,95.0,98417.0,31.0,3785.0,63.0,871.0,51.0


# Missing Values

In [3]:
# Fill the missing values using the imported function
# fill_missing is applied to the training features first, then the test features
X_train_filled, X_test_filled = map(fill_missing, (X_train, X_test))
# Preview the filled training features
X_train_filled.head(n=5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
2544,34,blue-collar,married,secondary,no,328,yes,yes,cellular,21,nov,18,7,-1.0,0,nonexistent
3891,30,technician,married,secondary,no,484,yes,no,unknown,6,may,703,1,-1.0,0,nonexistent
19464,42,entrepreneur,divorced,secondary,no,31,no,no,unknown,18,jun,120,1,-1.0,0,nonexistent
31170,27,management,single,tertiary,no,3,yes,yes,cellular,22,jul,506,1,-1.0,0,nonexistent
22216,30,management,single,secondary,no,19,no,no,cellular,26,aug,191,2,-1.0,0,nonexistent


In [4]:
# Create the encoders for categorical variables (use X_train_filled)
# The encoders are built from the filled training features only; the same
# `encoders` object is then reused to transform every other data set.
encoders = build_encoders(X_train_filled)
# Displayed as a list of dicts, each holding a 'column' name, a
# 'multi_col_output' flag, and the 'encoder' object for that column.
encoders

[{'column': 'job',
  'multi_col_output': True,
  'encoder': OneHotEncoder(handle_unknown='infrequent_if_exist', max_categories=5,
                sparse_output=False)},
 {'column': 'marital',
  'multi_col_output': True,
  'encoder': OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)},
 {'column': 'education',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['primary', 'secondary', 'tertiary']],
                 handle_unknown='use_encoded_value', unknown_value=-1)},
 {'column': 'default',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['no', 'yes']], handle_unknown='use_encoded_value',
                 unknown_value=-1)},
 {'column': 'housing',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['no', 'yes']], handle_unknown='use_encoded_value',
                 unknown_value=-1)},
 {'column': 'loan',
  'multi_col_output': False,
  'encoder': OrdinalEncoder(categories=[['no', 'yes']], handle_unknown='us

In [5]:
# Encode X_train_filled and X_test_filled
# Both splits are transformed with the same `encoders` built from the training data
X_train_encoded, X_test_encoded = (
    encode_categorical(filled, encoders)
    for filled in (X_train_filled, X_test_filled)
)

# Preview the encoded training features
X_train_encoded.head(n=5)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,x0_admin.,x0_blue-collar,x0_management,...,x0_telephone,x0_unknown,x0_nonexistent,x0_other,x0_success,education,default,housing,loan,month
0,34,328,21,18,7,-1.0,0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,10.0
1,30,484,6,703,1,-1.0,0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,4.0
2,42,31,18,120,1,-1.0,0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0
3,27,3,22,506,1,-1.0,0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,1.0,6.0
4,30,19,26,191,2,-1.0,0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,7.0


In [6]:
# Encode y_train and y_test
# The target encoder is built from the training labels only ...
y_encoder = build_target_encoder(y_train)
# ... and then applied to the training labels and the test labels, in that order
y_train_encoded, y_test_encoded = [
    encode_target(labels, y_encoder) for labels in (y_train, y_test)
]

In [7]:
# Train a random forest on the encoded training split and score it on the
# held-out test split. RandomForestClassifier is imported with the other
# dependencies in the notebook's top import cell rather than here, so that
# every dependency is visible in one place.
# random_state is pinned so that re-running the notebook rebuilds the same forest.
model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=13)
model.fit(X_train_encoded, y_train_encoded)
y_pred = model.predict(X_test_encoded)
# Balanced accuracy is the average of per-class recall, so it is not inflated
# by simply predicting the majority class.
print(balanced_accuracy_score(y_test_encoded, y_pred))

0.5753911268989494


In [8]:
# Import new data and test with the model
# This CSV is a separate file from the one loaded into `df`; it is read into
# its own frame so the original `df` is left untouched.

new_df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m14/lesson_3/datasets/bank_marketing_new_data.csv')

In [9]:
# Split the new data into X and y


In [10]:
# Apply preprocessing to the X and y data


In [11]:
# Make predictions and check the balanced accuracy score


0.5769138944243607
