In [1]:
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from time import time

import sys
import sklearn
from sklearn.model_selection import train_test_split,cross_val_score,KFold,StratifiedKFold,ShuffleSplit,StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import PolynomialFeatures,LabelEncoder,Imputer,RobustScaler, StandardScaler, MinMaxScaler,FunctionTransformer
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.feature_selection import RFE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import Pipeline,make_pipeline

from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix,roc_auc_score, make_scorer
from sklearn_pandas import DataFrameMapper
from pandas.api.types import is_string_dtype, is_numeric_dtype
from scipy import stats
from scipy.stats import skew,randint
from scipy.special import boxcox1p
from scipy.stats import randint as sp_randint
%matplotlib inline

In [2]:
def print_feature_importances(model,X):
    important_features = pd.Series(data=rf_model.feature_importances_,index=X.columns)
    important_features.sort_values(ascending=False,inplace=True)
    print(important_features.head(50))
    
def get_cat_columns_by_type(df):
    out = []
    for colname,col_values in df.items():
        if is_string_dtype(col_values):
            out.append((colname,'string') )
        elif not is_numeric_dtype(col_values):
            out.append((colname,'categorical') )
    return out       

def get_numeric_columns(df):
    out = []
    for colname,col_values in df.items():
        if is_numeric_dtype(col_values):
            out.append(colname)
    return out       
    
def get_missing_values_percentage(df):
    missing_values_counts_list = df.isnull().sum()
    total_values = np.product(df.shape)
    total_missing = missing_values_counts_list.sum()
    # percent of data that is missing
    return (total_missing/total_values) * 100


def convert_to_str_type(df_in,columns,inplace=False):
    if(inplace):
        df = df_in
    else:
        df = df_in.copy()
        
    for col in columns:
        df[col] = df[col].astype(str)
    return df

def extract_and_drop_target_column(df_in, y_name, inplace=False):
    if(inplace):
        df = df_in
    else:
        df = df_in.copy()
    if not is_numeric_dtype(df[y_name]):
        df[y_name] = df[y_name].cat.codes
        y = df[y_name].values
    else:
        y = df[y_name].copy()
    df.drop([y_name], axis=1, inplace=True)
    return (df,y)

def get_cat_and_numerical_cols(df):
    cat_cols = get_cat_columns_by_type(df)
    cat_cols = [cat_cols[i][0] for i in range(len(cat_cols))]
    num_cols = [col for col in df.columns if col not in cat_cols]
    return cat_cols,num_cols

    
def clean_df(df,pipelines):
    cat_cols, num_cols = get_cat_and_numerical_cols(df)
    dfs = []
    if len(num_cols):
        df1 = df[num_cols]
        print(df1.shape)
        pipelines['pipe_df_missing_num'].fit(df1)
        data1 = pipelines['pipe_df_missing_num'].transform(df1)
        df1 = pd.DataFrame(data1,columns=num_cols)
        dfs.append(df1)
    
    if len(cat_cols):
        df2 = df[cat_cols]
        print(df2.shape)
        pipelines['pipe_df_missing_cat'].fit(df2)
        data2 = pipelines['pipe_df_missing_cat'].transform(df2)     
        df2 = pd.DataFrame(data2,columns=cat_cols)
        dfs.append(df2)
        
        
    return pd.concat(dfs,axis=1)


def handle_encoding(df,one_hot=False):
    lbl = LabelEncoder()
    cat_cols,_ = get_cat_and_numerical_cols(df)
    print('len of cat cols = {}'.format(len(cat_cols)))
    for colname in cat_cols:
        lbl.fit(list(df[colname].values)) 
        df[colname] = lbl.transform(list(df[colname].values))
        
    if one_hot:
        return pd.get_dummies(df,columns=cat_cols,dummy_na=True)
    else:
        return df
    

def get_iqr_min_max(df,cols):
    out = {}
    for colname, col_values in df.items():
        if colname not in cols:
            continue
        quartile75, quartile25 = np.percentile(col_values, [75 ,25])
        ## Inter Quartile Range ##
        IQR = quartile75 - quartile25
        min_value = quartile25 - (IQR*1.5)
        max_value = quartile75 + (IQR*1.5)
        out[colname] = (min_value,max_value)
    return out

def remove_skew(df,threshold=0.75,lambda_in=0.15):
    cat_cols, num_cols = get_cat_and_numerical_cols(df)
    skewed_cols = df[num_cols].apply(lambda x: skew(x)).sort_values(ascending=False)
    skewness = pd.DataFrame({'Skew' :skewed_cols})
    skewness_log = skewness[abs(skewness) > threshold]
    skewness_other = skewness[abs(skewness) <= threshold]
    skewed_features_log = skewness_log.index
    skewed_features_other = skewness_other.index
    lambda_ = 0.0
    for feature in skewed_features_log:
        df[feature] = boxcox1p(df[feature],lambda_)
        lambda_ = lambda_in
    for feature in skewed_features_other:
        df[feature] = boxcox1p(df[feature],lambda_)
    return df

def bin_numerical_columns(df_in,cols,inplace=False):
    if(inplace):
        df = df_in
    else:
        df = df_in.copy()
        
    for col in cols.keys():
        bins = cols[col]
        buckets_ = np.linspace(bins[0],bins[1],bins[2])
        df[col] = pd.cut(df[col],buckets_,include_lowest=True)
    return df

# Utility function to report best scores
def report_best_scores(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [3]:

def preprocess_df2(df,id_col= None,df_test=None,test_id=None,
                   new_features_func=None,
                   date_col=None,
                   convert_to_cat_cols=None,
                   bin_columns_dict=None,
                   remove_skewness=False,
                   skew_threshold=0.75,
                   boxcox_lambda=0.15
                  ):
    
            
    if id_col is not None:
        combined.drop(id_col, axis=1,inplace=True)
        if df_test is not None and test_id is not None:
            test_id = df_test[id_col].copy()
        else: test_id = None
           
   
    if new_features_func is not None:
        df = new_features_func(df)
    
        
    if convert_to_cat_cols is not None:
        df = convert_to_str_type(df,convert_to_cat_cols,inplace=True)
        
    if bin_columns_dict is not None:
        df = bin_numerical_columns(df,bin_columns_dict,inplace=True)
    
    return df,test_id

def create_cleaning_pipelines(log_y=False,one_hot=False):
    def log_of_y(y):
        if log_y:
            return np.log1p(y)
        else: 
            return y
        
    pipeline_y = make_pipeline(FunctionTransformer(log_of_y))
    pipeline_df_missing_num = make_pipeline(Imputer(strategy='median',axis=0))
    pipeline_df_missing_cat = make_pipeline(Imputer(strategy='most_frequent',axis=0)
                                            
                                            )
    
    return {'pipe_y':pipeline_y,
            'pipe_df_missing_num':pipeline_df_missing_num,
            'pipe_df_missing_cat':pipeline_df_missing_cat
           }



In [4]:
def add_new_features1(df):
    print('In add new features 1')
    print(df.head())
    df['DepsIncomeComined'] = df['NumberOfDependents'] * df['MonthlyIncome']
    df['Times90DaysLateDebtRatio'] = df['NumberOfTimes90DaysLate'] * df['DebtRatio']
    df['Times90DaysLateRevolving'] = df['NumberOfTimes90DaysLate'] * df['RevolvingUtilizationOfUnsecuredLines']
    return df
def add_new_features2(df):
    df['DepsIncomeComined'] = df['NumberOfDependents'] * df['MonthlyIncome']
    df['Times90DaysLateDebtRatio'] = df['NumberOfTimes90DaysLate'] * df['DebtRatio']
    df['Times90DaysLateRevolving'] = df['NumberOfTimes90DaysLate'] * df['RevolvingUtilizationOfUnsecuredLines']
    df['RevolvingUtilizationOfUnsecuredLines-2'] = df['RevolvingUtilizationOfUnsecuredLines'] ** 2
    df['RevolvingUtilizationOfUnsecuredLines-3'] = df['RevolvingUtilizationOfUnsecuredLines'] ** 3
    df['RevolvingUtilizationOfUnsecuredLines-sqrt'] = np.sqrt(df['RevolvingUtilizationOfUnsecuredLines'])
    
    return df


In [5]:
PATH = "data/give_me_credit/"
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)
columns = ['Id', 'SeriousDlqin2yrs','RevolvingUtilizationOfUnsecuredLines', 'age',
                 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
                 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
                 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
                 'NumberOfDependents']

columns = [col.replace('-','') for col in columns]
    
df_raw.columns= columns
df_test.columns = columns
df_test.drop(['SeriousDlqin2yrs'], axis=1, inplace=True)
df = df_raw.copy()
df,y = extract_and_drop_target_column(df,'SeriousDlqin2yrs',inplace=True)

n_train = df.shape[0]
n_test = df_test.shape[0]
    
pipelines = create_cleaning_pipelines(log_y=True)
combined = pd.concat((df, df_test)).reset_index(drop=True)
combined,test_id = preprocess_df2(combined,id_col='Id',
                                    df_test=df_test,test_id='Id',
                                    new_features_func=add_new_features2,
                                    convert_to_cat_cols=[
                                            'NumberOfTime3059DaysPastDueNotWorse',
                                            'NumberOfTime6089DaysPastDueNotWorse'
                                     ]
                                
                                   )
print(combined.shape)
combined = handle_encoding(combined,one_hot=True)
print('combined shape = {}'.format(combined.shape) )
combined = clean_df(combined,pipelines)

combined = remove_skew(combined,threshold=0.75,lambda_in=0.15)

get_missing_values_percentage(combined)

y = clean_df(pd.DataFrame(y),pipelines)

print(get_missing_values_percentage(y))
print(y.head())
y = y.values.ravel()
print(y.shape)




(251503, 16)
len of cat cols = 2
combined shape = (251503, 46)
(251503, 46)
(150000, 1)
0.0
   SeriousDlqin2yrs
0               1.0
1               0.0
2               0.0
3               0.0
4               0.0
(150000,)


In [6]:
pipeline1 = Pipeline([('robust',RobustScaler()),
                      ('GBClassifier',GradientBoostingClassifier())])




In [7]:
pipeline2 = Pipeline([('Robust',RobustScaler()),
                           ('Standard',StandardScaler()),
                           ('GBClassifier',GradientBoostingClassifier())])

In [8]:
pipeline1.named_steps

{'robust': RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
        with_scaling=True),
 'GBClassifier': GradientBoostingClassifier(criterion='friedman_mse', init=None,
               learning_rate=0.1, loss='deviance', max_depth=3,
               max_features=None, max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=100,
               n_iter_no_change=None, presort='auto', random_state=None,
               subsample=1.0, tol=0.0001, validation_fraction=0.1,
               verbose=0, warm_start=False)}

In [9]:
pipeline1.steps

[('robust',
  RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
         with_scaling=True)),
 ('GBClassifier',
  GradientBoostingClassifier(criterion='friedman_mse', init=None,
                learning_rate=0.1, loss='deviance', max_depth=3,
                max_features=None, max_leaf_nodes=None,
                min_impurity_decrease=0.0, min_impurity_split=None,
                min_samples_leaf=1, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=100,
                n_iter_no_change=None, presort='auto', random_state=None,
                subsample=1.0, tol=0.0001, validation_fraction=0.1,
                verbose=0, warm_start=False))]

In [10]:
pipeline1.named_steps['GBClassifier'].criterion

'friedman_mse'

In [11]:
def pipe_line_fit(self, X, y):
    X_transformed = X
    for name, estimator in self.steps[:-1]:
        X_transformed = estimator.fit_transform(X_transformed, y)
    self.steps[-1][1].fit(X_transformed, y)
    return self

In [12]:
processing_pipeline1 = make_pipeline(RobustScaler(),
                                     GradientBoostingClassifier())

processing_pipeline2 = make_pipeline(RobustScaler(),
                                     StandardScaler(),
                                     GradientBoostingClassifier())

In [13]:
processing_pipeline3 = make_pipeline(RobustScaler(),
                                     StandardScaler(),
                                     PolynomialFeatures(),
                                     GradientBoostingClassifier())

In [14]:
df = combined[:n_train]
df_test = combined[n_train:]
stratify_col = y

X_train,X_test,y_train,y_test = train_test_split(df,y,test_size=0.10,
                                  stratify=y,shuffle = True,random_state=20)

stratify_X_train = stratify_col[:X_train.shape[0]].copy()
X_train.shape,X_test.shape,y_train.shape,y_test.shape, stratify_X_train.shape
X_train,X_valid,y_train,y_valid = train_test_split(X_train,y_train,test_size=0.2,
                                  stratify=stratify_X_train,shuffle = True,random_state=20)
X_train.shape,X_valid.shape,y_train.shape,y_valid.shape

((108000, 46), (27000, 46), (108000,), (27000,))

In [15]:
pipe_model1 = processing_pipeline1.fit(X_train,y_train)
pipe1_auc = roc_auc_score(y_valid,pipe_model1.predict_proba(X_valid)[:, 1])
print("AUC for Gradient Boost Pipeline1: {:.6f}".format(pipe1_auc))

AUC for Gradient Boost Pipeline1: 0.859772


In [16]:
pipe_model2 = processing_pipeline2.fit(X_train,y_train)
pipe2_auc = roc_auc_score(y_valid,pipe_model2.predict_proba(X_valid)[:, 1])
print("AUC for Gradient Boost Pipeline2: {:.6f}".format(pipe2_auc))

AUC for Gradient Boost Pipeline2: 0.859774


In [17]:
preds_model1 = pipe_model1.predict(X_valid)
confusion = confusion_matrix(y_valid, preds_model1)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[24893   246]
 [ 1505   356]]


In [18]:
preds_model2 = pipe_model2.predict(X_valid)
confusion = confusion_matrix(y_valid, preds_model2)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[24893   246]
 [ 1505   356]]


In [19]:
params = {'gradientboostingclassifier__n_estimators':[200,300,500,800,1000,1500],
          "gradientboostingclassifier__max_features": randint(10,45),
          "gradientboostingclassifier__min_samples_split": randint(2, 11),
          "gradientboostingclassifier__min_samples_leaf": randint(1, 11),
          "gradientboostingclassifier__subsample":[0.6,0.7,0.75,0.8,0.9]
         }

kfold = KFold(n_splits=5,shuffle=True,random_state=0)
start = time()
randomSearch_p2 = RandomizedSearchCV(processing_pipeline2,
                                     param_distributions=params,
                                     n_iter=20,n_jobs=6,
                                     scoring='roc_auc',
                                     cv=kfold).fit(X_train,y_train)

print('training took {} mins'.format((time() - start)/60.))
randomSearch_p2_auc = roc_auc_score(y_valid,randomSearch_p2.predict_proba(X_valid)[:, 1])
print("AUC for Randomized Search Gradient Boost: {:.6f}".format(randomSearch_p2_auc))


training took 43.96106515328089 mins
AUC for Randomized Search Gradient Boost: 0.861863


In [20]:
joblib.dump(randomSearch_p2,'randomSearch_pipe2_credit.pkl')
report_best_scores(randomSearch_p2.cv_results_)

Model with rank: 1
Mean validation score: 0.864 (std: 0.005)
Parameters: {'gradientboostingclassifier__max_features': 17, 'gradientboostingclassifier__min_samples_leaf': 7, 'gradientboostingclassifier__min_samples_split': 7, 'gradientboostingclassifier__n_estimators': 300, 'gradientboostingclassifier__subsample': 0.75}

Model with rank: 2
Mean validation score: 0.864 (std: 0.005)
Parameters: {'gradientboostingclassifier__max_features': 24, 'gradientboostingclassifier__min_samples_leaf': 6, 'gradientboostingclassifier__min_samples_split': 5, 'gradientboostingclassifier__n_estimators': 300, 'gradientboostingclassifier__subsample': 0.7}

Model with rank: 3
Mean validation score: 0.864 (std: 0.005)
Parameters: {'gradientboostingclassifier__max_features': 10, 'gradientboostingclassifier__min_samples_leaf': 8, 'gradientboostingclassifier__min_samples_split': 4, 'gradientboostingclassifier__n_estimators': 800, 'gradientboostingclassifier__subsample': 0.75}



In [21]:
randomSearch_p2 = joblib.load('randomSearch_pipe2_credit.pkl')
report_best_scores(randomSearch_p2.cv_results_)

Model with rank: 1
Mean validation score: 0.864 (std: 0.005)
Parameters: {'gradientboostingclassifier__max_features': 17, 'gradientboostingclassifier__min_samples_leaf': 7, 'gradientboostingclassifier__min_samples_split': 7, 'gradientboostingclassifier__n_estimators': 300, 'gradientboostingclassifier__subsample': 0.75}

Model with rank: 2
Mean validation score: 0.864 (std: 0.005)
Parameters: {'gradientboostingclassifier__max_features': 24, 'gradientboostingclassifier__min_samples_leaf': 6, 'gradientboostingclassifier__min_samples_split': 5, 'gradientboostingclassifier__n_estimators': 300, 'gradientboostingclassifier__subsample': 0.7}

Model with rank: 3
Mean validation score: 0.864 (std: 0.005)
Parameters: {'gradientboostingclassifier__max_features': 10, 'gradientboostingclassifier__min_samples_leaf': 8, 'gradientboostingclassifier__min_samples_split': 4, 'gradientboostingclassifier__n_estimators': 800, 'gradientboostingclassifier__subsample': 0.75}



In [22]:
randomSearch_p2 = joblib.load('randomSearch_pipe2_credit.pkl')
randomSearch_p2_auc = roc_auc_score(y_valid,randomSearch_p2.predict_proba(X_valid)[:, 1])
print("AUC for Randomized Search Gradient Boost: {:.6f}".format(randomSearch_p2_auc))

AUC for Randomized Search Gradient Boost: 0.861863


In [23]:
rand_p2_auc = roc_auc_score(y_test,randomSearch_p2.predict_proba(X_test)[:, 1])
print("AUC on Test for P2: {:.6f}".format(rand_p2_auc))

AUC on Test for P2: 0.867854


In [24]:
preds_rand_p2 = randomSearch_p2.predict(X_valid)
confusion = confusion_matrix(y_valid, preds_rand_p2)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[24881   258]
 [ 1514   347]]


In [25]:
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)
columns = ['Id', 'SeriousDlqin2yrs','RevolvingUtilizationOfUnsecuredLines', 'age',
                 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
                 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
                 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
                 'NumberOfDependents']

columns = [col.replace('-','') for col in columns]
    
df_raw.columns= columns
df_test.columns = columns
df_test.drop(['SeriousDlqin2yrs'], axis=1, inplace=True)
df = df_raw.copy()
df,y = extract_and_drop_target_column(df,'SeriousDlqin2yrs',inplace=True)

n_train = df.shape[0]
n_test = df_test.shape[0]
    
pipelines = create_cleaning_pipelines(log_y=True)
combined = pd.concat((df, df_test)).reset_index(drop=True)
combined,test_id = preprocess_df2(combined,id_col='Id',
                                    df_test=df_test,
                                    test_id='Id'
                                
                                   )
print(combined.shape)
combined = handle_encoding(combined,one_hot=True)
print('combined shape = {}'.format(combined.shape) )
combined = clean_df(combined,pipelines)

combined = remove_skew(combined,threshold=0.75,lambda_in=0.2)

get_missing_values_percentage(combined)

y = clean_df(pd.DataFrame(y),pipelines)

print(get_missing_values_percentage(y))
print(y.head())
y = y.values.ravel()
print(y.shape)




(251503, 10)
len of cat cols = 0
combined shape = (251503, 10)
(251503, 10)
(150000, 1)
0.0
   SeriousDlqin2yrs
0               1.0
1               0.0
2               0.0
3               0.0
4               0.0
(150000,)


In [26]:
df = combined[:n_train]
df_test = combined[n_train:]
stratify_col = y

X_train,X_test,y_train,y_test = train_test_split(df,y,test_size=0.10,
                                  stratify=y,shuffle = True,random_state=20)

stratify_X_train = stratify_col[:X_train.shape[0]].copy()
X_train.shape,X_test.shape,y_train.shape,y_test.shape, stratify_X_train.shape
X_train,X_valid,y_train,y_valid = train_test_split(X_train,y_train,test_size=0.2,
                                  stratify=stratify_X_train,shuffle = True,random_state=20)
X_train.shape,X_valid.shape,y_train.shape,y_valid.shape

((108000, 10), (27000, 10), (108000,), (27000,))

In [27]:
pipe_model3 = processing_pipeline3.fit(X_train,y_train)
pipe3_auc = roc_auc_score(y_valid,pipe_model3.predict_proba(X_valid)[:, 1])
print("AUC for Gradient Boost Pipeline3: {:.6f}".format(pipe3_auc))

AUC for Gradient Boost Pipeline3: 0.859008


In [28]:
params = {'gradientboostingclassifier__n_estimators':[200,300,500,800,1000,1500],
          'gradientboostingclassifier__max_features': randint(1,11),
          'gradientboostingclassifier__min_samples_split': randint(2, 11),
          'gradientboostingclassifier__min_samples_leaf': randint(1, 11),
          'gradientboostingclassifier__subsample':[0.6,0.7,0.8,0.9],
          'polynomialfeatures__degree': [1,2,3],
          
         }

kfold = KFold(n_splits=5,shuffle=True,random_state=0)
start = time()
randomSearch_p3 = RandomizedSearchCV(processing_pipeline3,
                                     param_distributions=params,
                                     n_iter=20,n_jobs=6,
                                     scoring='roc_auc',
                                     cv=kfold).fit(X_train,y_train)

print('training took {} mins'.format((time() - start)/60.))
randomSearch_p3_auc = roc_auc_score(y_valid,randomSearch_p3.predict_proba(X_valid)[:, 1])
print("AUC on Randomized Search for P3: {:.6f}".format(randomSearch_p3_auc))


training took 35.36313624779383 mins
AUC on Randomized Search for P3: 0.861847


In [32]:
joblib.dump(randomSearch_p3,'./randomSearch_pipe3_credit.pkl')
report_best_scores(randomSearch_p3.cv_results_)

Model with rank: 1
Mean validation score: 0.865 (std: 0.005)
Parameters: {'gradientboostingclassifier__max_features': 1, 'gradientboostingclassifier__min_samples_leaf': 2, 'gradientboostingclassifier__min_samples_split': 8, 'gradientboostingclassifier__n_estimators': 500, 'gradientboostingclassifier__subsample': 0.9, 'polynomialfeatures__degree': 1}

Model with rank: 2
Mean validation score: 0.865 (std: 0.005)
Parameters: {'gradientboostingclassifier__max_features': 2, 'gradientboostingclassifier__min_samples_leaf': 9, 'gradientboostingclassifier__min_samples_split': 4, 'gradientboostingclassifier__n_estimators': 500, 'gradientboostingclassifier__subsample': 0.6, 'polynomialfeatures__degree': 1}

Model with rank: 3
Mean validation score: 0.865 (std: 0.005)
Parameters: {'gradientboostingclassifier__max_features': 10, 'gradientboostingclassifier__min_samples_leaf': 4, 'gradientboostingclassifier__min_samples_split': 2, 'gradientboostingclassifier__n_estimators': 300, 'gradientboostingcla

In [33]:
randomSearch_p3 = joblib.load('./randomSearch_pipe3_credit.pkl')
randomSearch_p3_auc = roc_auc_score(y_valid,randomSearch_p3.predict_proba(X_valid)[:, 1])
print("AUC for Randomized Search Gradient Boost: {:.6f}".format(randomSearch_p3_auc))

AUC for Randomized Search Gradient Boost: 0.861847


In [31]:
rand_p3_auc = roc_auc_score(y_test,randomSearch_p3.predict_proba(X_test)[:, 1])
print("AUC on Test for P3: {:.6f}".format(rand_p3_auc))

AUC on Test for P3: 0.867939
