In [32]:
import pandas as pd
import numpy as np
import catboost

In [33]:
#loading the cleaned dataframe for preprocessing

df = pd.read_csv('Peruvian_Bank_Data/clean_df.csv', header = 0)
df.head()

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,5,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,5,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,5,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,5,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,5,198,1,-1,0,unknown,no


I believe the best way to preprocess this is to provide dummy labels to the categorical data and then use a RobustScaler for avg_yearly_balance, and duration because of the outliers. Prev_days will also get a robustscaling, but this will be done last because it is subject to change -- I'm concerned about the negative one being abrasive to the model. But we will see.  MinMax scaler for campaign_contacts and previous_contacts and age. Let's begin and see how it goes. I will do the scaling first and then the dummy variables -- the dummy variables make the dataframe a bit unbearable to scroll through. 

In [34]:
from sklearn.preprocessing import RobustScaler, MinMaxScaler

In [52]:
#applying robustscaler to avg_yearly_balance, duration, prev_days
X = pd.DataFrame(df['avg_yearly_balance'])
RobSca = RobustScaler().fit_transform(X)

In [62]:
df2 = df #creating a copy just in case something goes wrong
#going off to replace the avg_yearly_balance values
df2['avg_yearly_balance'] = RobSca

In [65]:
df2.drop(columns = ['avg_yearly_values'], inplace = True) #made a mistake in the nomenclature

In [67]:
df.head() #looks successful, let's continue. As practice let's create a function for the remaining required preprocessing

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,58,management,married,tertiary,no,1.247241,yes,no,unknown,5,5,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,-0.308315,yes,no,unknown,5,5,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,-0.328182,yes,yes,unknown,5,5,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,0.778514,yes,no,unknown,5,5,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,-0.328918,no,no,unknown,5,5,198,1,-1,0,unknown,no


In [69]:
#creating a function to make the rest of the preprocessing simpler
def preproc(dataframe, column, scalertype):
    X = pd.DataFrame(dataframe[column])
    scaler = scalertype.fit_transform(X)
    dataframe[column] = scaler
    

In [70]:
preproc(df2, 'duration', RobustScaler())

In [72]:
preproc(df2, 'prev_days', RobustScaler())

In [73]:
#minmax for campaign_contacts and previous_contacts and age
preproc(df2, 'campaign_contacts', MinMaxScaler())
preproc(df2, 'previous_contacts', MinMaxScaler())
preproc(df2, 'age', MinMaxScaler())
df2.head()

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,0.519481,management,married,tertiary,no,1.247241,yes,no,unknown,5,5,0.373272,0.0,0.0,0.0,unknown,no
1,0.337662,technician,single,secondary,no,-0.308315,yes,no,unknown,5,5,-0.133641,0.0,0.0,0.0,unknown,no
2,0.194805,entrepreneur,married,secondary,no,-0.328182,yes,yes,unknown,5,5,-0.479263,0.0,0.0,0.0,unknown,no
3,0.376623,blue-collar,married,unknown,no,0.778514,yes,no,unknown,5,5,-0.40553,0.0,0.0,0.0,unknown,no
4,0.194805,unknown,single,unknown,no,-0.328918,no,no,unknown,5,5,0.082949,0.0,0.0,0.0,unknown,no


In [75]:
#decided also here that I should do campaign contacts
preproc(df2, 'campaign_contacts', MinMaxScaler())
df2.head()

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,0.519481,management,married,tertiary,no,1.247241,yes,no,unknown,5,5,0.373272,0.0,0.0,0.0,unknown,no
1,0.337662,technician,single,secondary,no,-0.308315,yes,no,unknown,5,5,-0.133641,0.0,0.0,0.0,unknown,no
2,0.194805,entrepreneur,married,secondary,no,-0.328182,yes,yes,unknown,5,5,-0.479263,0.0,0.0,0.0,unknown,no
3,0.376623,blue-collar,married,unknown,no,0.778514,yes,no,unknown,5,5,-0.40553,0.0,0.0,0.0,unknown,no
4,0.194805,unknown,single,unknown,no,-0.328918,no,no,unknown,5,5,0.082949,0.0,0.0,0.0,unknown,no


In [80]:
#Getting dummies now for the categorical data
#df_ready = this dataframe is ready for modeling
df_ready = pd.get_dummies(df2, columns = ['job','marital','education','in_default','housing_loan','personal_loan','contact_method','prev_outcome'])


In [81]:
df_ready.head()

Unnamed: 0,age,avg_yearly_balance,day,month,duration,campaign_contacts,prev_days,previous_contacts,term_deposit,job_admin.,...,housing_loan_yes,personal_loan_no,personal_loan_yes,contact_method_cellular,contact_method_telephone,contact_method_unknown,prev_outcome_failure,prev_outcome_other,prev_outcome_success,prev_outcome_unknown
0,0.519481,1.247241,5,5,0.373272,0.0,0.0,0.0,no,0,...,1,1,0,0,0,1,0,0,0,1
1,0.337662,-0.308315,5,5,-0.133641,0.0,0.0,0.0,no,0,...,1,1,0,0,0,1,0,0,0,1
2,0.194805,-0.328182,5,5,-0.479263,0.0,0.0,0.0,no,0,...,1,0,1,0,0,1,0,0,0,1
3,0.376623,0.778514,5,5,-0.40553,0.0,0.0,0.0,no,0,...,1,1,0,0,0,1,0,0,0,1
4,0.194805,-0.328918,5,5,0.082949,0.0,0.0,0.0,no,0,...,0,1,0,0,0,1,0,0,0,1
