In [28]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split


In [52]:
df_ori = pd.DataFrame(pd.read_csv('Peruvian_Bank_Data/clean_df.csv'))
df_ori.head()

Unnamed: 0,age,job,marital,education,in_default,avg_yearly_balance,housing_loan,personal_loan,contact_method,day,month,duration,campaign_contacts,prev_days,previous_contacts,prev_outcome,term_deposit
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,5,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,5,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,5,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,5,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,5,198,1,-1,0,unknown,no


In [71]:
#looking to get a coefficient reading so we are going to process the data with normalization and throw into logreg
df = df_ori.copy()
df = pd.get_dummies(data = df, columns = ['in_default', 'job', 'marital', 'education', 'contact_method', 'prev_outcome', 'housing_loan', 'personal_loan']) 
df_X = df.drop(columns = ['term_deposit'])
df_y = df['term_deposit']
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size = 0.3, random_state = 44)

In [93]:
#normalizing the X sets
X_train = normalize(X_train, axis = 0)
X_test = normalize(X_test, axis = 0)

In [94]:
X_train = pd.DataFrame(data = X_train, columns = df_X.columns)
X_test = pd.DataFrame(data = X_test, columns = df_X.columns)

In [95]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
logregcoefs = abs(logreg.coef_)

In [102]:
df_coefs = pd.DataFrame(data = logregcoefs, columns = df_X.columns)
df_coefs = normalize(df_coefs)
df_coefs = pd.DataFrame(data = df_coefs, columns = df_X.columns)
df_coefs.head()

Unnamed: 0,age,avg_yearly_balance,day,month,duration,campaign_contacts,prev_days,previous_contacts,in_default_no,in_default_yes,...,contact_method_telephone,contact_method_unknown,prev_outcome_failure,prev_outcome_other,prev_outcome_success,prev_outcome_unknown,housing_loan_no,housing_loan_yes,personal_loan_no,personal_loan_yes
0,0.235849,0.018822,0.214702,0.212125,0.196326,0.1458,0.057282,0.078252,0.228117,0.054877,...,0.048727,0.241073,0.066801,0.02002,0.496396,0.271081,0.066663,0.265154,0.186027,0.157168


In [103]:
#preprocessing the data more appropriately for Random Forest
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import classification_report

def preproc(dataframe, column, scalertype):
    X = pd.DataFrame(dataframe[column])
    scaler = scalertype.fit_transform(X)
    dataframe[column] = scaler
    
preproc(X_train, 'avg_yearly_balance', RobustScaler())
preproc(X_train, 'duration', RobustScaler())
preproc(X_train, 'prev_days', RobustScaler())
preproc(X_train, 'campaign_contacts', MinMaxScaler())
preproc(X_train, 'previous_contacts', MinMaxScaler())
preproc(X_train, 'age', MinMaxScaler())
preproc(X_train, 'campaign_contacts', MinMaxScaler())

preproc(X_test, 'avg_yearly_balance', RobustScaler())
preproc(X_test, 'duration', RobustScaler())
preproc(X_test, 'prev_days', RobustScaler())
preproc(X_test, 'campaign_contacts', MinMaxScaler())
preproc(X_test, 'previous_contacts', MinMaxScaler())
preproc(X_test, 'age', MinMaxScaler())
preproc(X_test, 'campaign_contacts', MinMaxScaler())

print(X_train.head())
print(X_test.head())

        age  avg_yearly_balance       day     month  duration  \
0  0.139152           -0.087583  0.002148  0.004187  0.326425   
1  0.068924            0.050377  0.002548  0.002534  0.065524   
2  0.213228           -0.044310  0.008599  0.005779  0.189399   
3  0.022207            0.153695  0.000608  0.001096 -0.460518   
4  0.074394           -0.482000  0.002474  0.003444  0.764050   

   campaign_contacts  prev_days  previous_contacts  in_default_no  \
0           0.004456  -0.666693           0.000000       0.004090   
1           0.001877  -0.031032           0.000000       0.001733   
2           0.012334  -1.086023           0.000000       0.005645   
3           0.000727  24.492991           0.001346       0.000681   
4           0.033110  -0.471003           0.000000       0.003364   

   in_default_yes  ...  contact_method_telephone  contact_method_unknown  \
0             0.0  ...                  0.000000                     0.0   
1             0.0  ...                  0.

In [108]:
#Grid Search
parameter_grid = {'n_estimators' : [100,1000,5000], 'max_depth' : [10, 20, 30, 'None'], 'max_features' : ['auto', 'log2']}
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
grid = GridSearchCV(estimator = rfc, param_grid = parameter_grid, scoring = ['recall', 'accuracy'])
grid.cv_results_


AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'