In [1]:
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from time import time
import sklearn
from sklearn.model_selection import train_test_split,cross_val_score,KFold,StratifiedKFold,ShuffleSplit,StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.feature_selection import RFE

from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix,roc_auc_score
from sklearn_pandas import DataFrameMapper
from pandas.api.types import is_string_dtype, is_numeric_dtype
from scipy import stats
from scipy.stats import skew
from scipy.special import boxcox1p
from scipy.stats import randint as sp_randint
%matplotlib inline

In [2]:
def print_feature_importances(model,X):
    important_features = pd.Series(data=rf_model.feature_importances_,index=X.columns)
    important_features.sort_values(ascending=False,inplace=True)
    print(important_features.head(50))
    
def get_cat_columns_by_type(df):
    out = []
    for colname,col_values in df.items():
        if is_string_dtype(col_values):
            out.append((colname,'string') )
        elif not is_numeric_dtype(col_values):
            out.append((colname,'categorical') )
    return out       

def get_numeric_columns(df):
    out = []
    for colname,col_values in df.items():
        if is_numeric_dtype(col_values):
            out.append(colname)
    return out       
    
def get_missing_values_percentage(df):
    missing_values_counts_list = df.isnull().sum()
    total_values = np.product(df.shape)
    total_missing = missing_values_counts_list.sum()
    # percent of data that is missing
    return (total_missing/total_values) * 100

def get_missing_columns(df1,df2):
    missing1 = []
    missing2 = []
    for colname in df1.columns:
        if colname not in df2.columns:
            missing2.append(colname)
    for colname in df2.columns:
        if colname not in df1.columns:
            missing1.append(colname)        
    return (missing1,missing2)


def convert_to_str_type(df_in,columns,inplace=False):
    if(inplace):
        df = df_in
    else:
        df = df_in.copy()
        
    for col in columns:
        df[col] = df[col].astype(str)
    return df

    
def handle_missing_values(df_in,cat_cols=[], num_cols=[],na_dict=None,add_nan_col=True,inplace=False):
    if(inplace):
        df = df_in
    else:
        df = df_in.copy()
 
    if na_dict is None:
        na_dict = {}

    for colname, col_values in df.items():   
        if colname not in num_cols:
            continue
        if pd.isnull(col_values).sum():
            df[colname+'_na'] = pd.isnull(col_values)
            filler = na_dict[colname] if colname in na_dict else col_values.median()
            df[colname] = col_values.fillna(filler)
            na_dict[colname] = filler
    for colname in cat_cols:
        if colname not in df.columns:
            continue
        df[colname].fillna(df[colname].mode()[0], inplace=True)
        lbl = LabelEncoder() 
        lbl.fit(list(df[colname].values)) 
        df[colname] = lbl.transform(list(df[colname].values))
    
    return (df,na_dict)



def scale_num_cols(df_in, mapper, inplace=False):
    if(inplace):
        df = df_in
    else:
        df = df_in.copy()
        
    if mapper is None:
        map_f = [([c],StandardScaler()) for c in df.columns if is_numeric_dtype(df[c])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return (df,mapper)



def extract_and_drop_target_column(df_in, y_name, inplace=False):
    if(inplace):
        df = df_in
    else:
        df = df_in.copy()
    if not is_numeric_dtype(df[y_name]):
        df[y_name] = df[y_name].cat.codes
        y = df[y_name].values
    else:
        y = df[y_name]
    df.drop([y_name], axis=1, inplace=True)
    return (df,y)

def print_mse(m,X_train, X_valid, y_train, y_valid):
    res = [mean_squared_error(y_train,m.predict(X_train)),
                mean_squared_error(y_valid,m.predict(X_valid)),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    print('MSE Training set = {}, MSE Validation set = {}, score Training Set = {}, score on Validation Set = {}'.format(res[0],res[1],res[2], res[3]))
    if hasattr(m, 'oob_score_'):
          print('OOB Score = {}'.format(m.oob_score_))      

def get_iqr_min_max(df,cols):
    out = {}
    for colname, col_values in df.items():
        if colname not in cols:
            continue
        quartile75, quartile25 = np.percentile(col_values, [75 ,25])
        ## Inter Quartile Range ##
        IQR = quartile75 - quartile25
        min_value = quartile25 - (IQR*1.5)
        max_value = quartile75 + (IQR*1.5)
        out[colname] = (min_value,max_value)
    return out


def bin_numerical_columns(df_in,cols,inplace=False):
    if(inplace):
        df = df_in
    else:
        df = df_in.copy()
        
    for col in cols.keys():
        bins = cols[col]
        buckets_ = np.linspace(bins[0],bins[1],bins[2])
        df[col] = pd.cut(df[col],buckets_,include_lowest=True)
    return df

In [3]:
def preprocess_df(df_train,df_test=None,
                  log_y=False,
                  id_col= None,
                  target_col=None,
                  convert_to_cat_cols=None,
                  remove_skewness=False,
                  skew_threshold=0.75,
                  boxcox_lambda=0.15,
                  scale_mapper=None,
                  bin_columns_dict=None,
                  new_features_func=None):
    
    if target_col is not None:
        df,y = extract_and_drop_target_column(df_train,target_col,inplace=True)
        print(y.head())
        if log_y:
            y = np.log1p(y)
            print('222')
    else:
        y = None
        print('333')
        
    combined = pd.concat((df, df_test)).reset_index(drop=True)
    
    
    if df_test is not None and id_col is not None:
        test_id = df_test['Id'].copy()
        combined.drop('Id', axis=1,inplace=True)
    else: test_id = None
   
    if new_features_func is not None:
        combined = new_features_func(combined)
    
    
    if convert_to_cat_cols is not None:
        combined = convert_to_str_type(combined,convert_to_cat_cols,inplace=True)
    
        
    if bin_columns_dict is not None:
        combined = bin_numerical_columns(combined,bin_columns_dict,inplace=True)
    
    
    cat_cols = get_cat_columns_by_type(combined)
    cat_cols = [cat_cols[i][0] for i in range(len(cat_cols))]
    num_cols = [col for col in combined.columns if col not in cat_cols]
    
    combined = pd.get_dummies(combined,columns=cat_cols, dummy_na=True)
    
    n_train = df.shape[0]
    n_test = df_test.shape[0]
      
    
    combined,d = handle_missing_values(combined,cat_cols=cat_cols,
                                       num_cols=num_cols,inplace=True)
    
    print(d)
    if remove_skewness:
        skewed_cols = combined[num_cols].apply(lambda x: skew(x)).sort_values(ascending=False)
        skewness = pd.DataFrame({'Skew' :skewed_cols})
        skewness_log = skewness[abs(skewness) > skew_threshold]
        skewness_other = skewness[abs(skewness) <= skew_threshold]
        skewed_features_log = skewness_log.index
        skewed_features_other = skewness_other.index
        lambda_ = 0.0
        for feature in skewed_features_log:
            combined[feature] = boxcox1p(combined[feature],lambda_)
        lambda_ = boxcox_lambda
        for feature in skewed_features_other:
            combined[feature] = boxcox1p(combined[feature],lambda_)
    
    if scale_mapper is not None:
        map_f = [([c],scale_mapper) for c in num_cols]
        mapper = DataFrameMapper(map_f).fit(combined)
    else:
        mapper = None
        
    combined,_ = scale_num_cols(combined,mapper,inplace=True) 
    
    print(get_missing_values_percentage(combined))
    
    return combined,df,y,cat_cols,num_cols,test_id,n_train,n_test


In [4]:
def add_new_features1(df):
    df['DepsIncomeComined'] = df['NumberOfDependents'] * df['MonthlyIncome']
    df['Times90DaysLateDebtRatio'] = df['NumberOfTimes90DaysLate'] * df['DebtRatio']
    df['Times90DaysLateRevolving'] = df['NumberOfTimes90DaysLate'] * df['RevolvingUtilizationOfUnsecuredLines']
    return df
def add_new_features2(df):
    df['DepsIncomeComined'] = df['NumberOfDependents'] * df['MonthlyIncome']
    df['Times90DaysLateDebtRatio'] = df['NumberOfTimes90DaysLate'] * df['DebtRatio']
    df['Times90DaysLateRevolving'] = df['NumberOfTimes90DaysLate'] * df['RevolvingUtilizationOfUnsecuredLines']
    df['RevolvingUtilizationOfUnsecuredLines-2'] = df['RevolvingUtilizationOfUnsecuredLines'] ** 2
    df['RevolvingUtilizationOfUnsecuredLines-3'] = df['RevolvingUtilizationOfUnsecuredLines'] ** 3
    df['RevolvingUtilizationOfUnsecuredLines-sqrt'] = np.sqrt(df['RevolvingUtilizationOfUnsecuredLines'])
    
    return df
def add_new_features3(df):
    return df

def add_new_features4(df):
    return df

    

In [7]:
PATH = "./data/give_me_credit/"
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)

FileNotFoundError: [Errno 2] File b'data/give_me_credit/train.csv' does not exist: b'data/give_me_credit/train.csv'

In [6]:
df_raw.info()

NameError: name 'df_raw' is not defined

In [9]:
df_raw.describe()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,75000.5,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,43301.414527,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,37500.75,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,75000.5,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,112500.25,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,150000.0,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [10]:
df_raw.head()

Unnamed: 0.1,Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,2,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,3,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,4,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,5,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [12]:
columns = ['Id', 'SeriousDlqin2yrs','RevolvingUtilizationOfUnsecuredLines', 'age',
                 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
                 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
                 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
                 'NumberOfDependents']
df_raw.columns= columns
df_test.columns = columns
df_test.drop(['SeriousDlqin2yrs'], axis=1, inplace=True)
df = df_raw.copy()

In [13]:
#Find most important features relative to target
corr = df.corr()
corr.sort_values(['SeriousDlqin2yrs'], ascending = False, inplace = True)
print(corr.SeriousDlqin2yrs)


SeriousDlqin2yrs                        1.000000
NumberOfTime30-59DaysPastDueNotWorse    0.125587
NumberOfTimes90DaysLate                 0.117175
NumberOfTime60-89DaysPastDueNotWorse    0.102261
NumberOfDependents                      0.046048
Id                                      0.002801
RevolvingUtilizationOfUnsecuredLines   -0.001802
NumberRealEstateLoansOrLines           -0.007038
DebtRatio                              -0.007602
MonthlyIncome                          -0.019746
NumberOfOpenCreditLinesAndLoans        -0.029669
age                                    -0.115386
Name: SeriousDlqin2yrs, dtype: float64


In [14]:

combined,df,y,cat_cols,num_cols,test_id,n_train,n_test = preprocess_df(
                                       df_train=df,df_test=df_test,
                                       target_col='SeriousDlqin2yrs',
                                       id_col='Id',
                                       remove_skewness=True,
                                       skew_threshold=0.75,
                                       boxcox_lambda=0.2,
                                       scale_mapper=RobustScaler()
                                       )



0    1
1    0
2    0
3    0
4    0
Name: SeriousDlqin2yrs, dtype: int64
{'MonthlyIncome': 5400.0, 'NumberOfDependents': 0.0}
0.0


In [None]:
df.info()

In [None]:
df_raw['SeriousDlqin2yrs'].head()

In [None]:
y.head()

In [15]:
iqr_ranges = get_iqr_min_max(combined,num_cols)
iqr_ranges

{'DebtRatio': (-0.6898509855776909, 1.8761315545181199),
 'MonthlyIncome': (4.768564869602322, 5.522452016090394),
 'NumberOfDependents': (-1.5, 2.5),
 'NumberOfOpenCreditLinesAndLoans': (1.300166838962975, 3.3058475995996432),
 'NumberOfTime30-59DaysPastDueNotWorse': (0.0, 0.0),
 'NumberOfTime60-89DaysPastDueNotWorse': (0.0, 0.0),
 'NumberOfTimes90DaysLate': (0.0, 0.0),
 'NumberRealEstateLoansOrLines': (-2.1583421225545587, 3.5972368709242644),
 'RevolvingUtilizationOfUnsecuredLines': (-0.9014455427324771,
  1.6425972222125693),
 'age': (2.9691248157200487, 3.8137150538445965)}

In [None]:
combined.shape,df.shape,df_test.shape

In [16]:
df = combined[:n_train]
df_test = combined[n_train:]
stratify_col = y

X_train,X_test,y_train,y_test = train_test_split(df,y,test_size=0.10,
                                  stratify=y,shuffle = True,random_state=20)

stratify_X_train = stratify_col[:X_train.shape[0]].copy()
X_train.shape,X_test.shape,y_train.shape,y_test.shape, stratify_X_train.shape

((135000, 12), (15000, 12), (135000,), (15000,), (135000,))

In [17]:
X_train,X_valid,y_train,y_valid = train_test_split(X_train,y_train,test_size=0.2,
                                  stratify=stratify_X_train,shuffle = True,random_state=20)
X_train.shape,X_valid.shape,y_train.shape,y_valid.shape

((108000, 12), (27000, 12), (108000,), (27000,))

In [18]:
rf_model = RandomForestClassifier(n_estimators=300,random_state=10, n_jobs=-1).fit(X_train,y_train)
rf_auc = roc_auc_score(y_valid,rf_model.predict_proba(X_valid)[:, 1])
print("AUC for Random Forest: {:.6f}".format(rf_auc))

AUC for Random Forest: 0.842684


In [20]:
preds_rf = rf_model.predict(X_valid)
confusion = confusion_matrix(y_valid, preds_rf)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[24836   303]
 [ 1524   337]]


In [19]:
gb_model = GradientBoostingClassifier(n_estimators=300,random_state=10).fit(X_train,y_train)
gb_auc = roc_auc_score(y_valid,gb_model.predict_proba(X_valid)[:, 1])
print("AUC for Gradient Boost: {:.6f}".format(gb_auc))

AUC for Gradient Boost: 0.862961


In [21]:
preds_gb = gb_model.predict(X_valid)
confusion = confusion_matrix(y_valid, preds_gb)
print("Confusion matrix:\n{}".format(confusion))

Confusion matrix:
[[24874   265]
 [ 1508   353]]


In [22]:
print_feature_importances(rf_model,X_train)

RevolvingUtilizationOfUnsecuredLines    0.191734
DebtRatio                               0.174215
MonthlyIncome                           0.140418
age                                     0.129085
NumberOfOpenCreditLinesAndLoans         0.090661
NumberOfTimes90DaysLate                 0.088518
NumberOfTime30-59DaysPastDueNotWorse    0.053856
NumberOfTime60-89DaysPastDueNotWorse    0.048158
NumberOfDependents                      0.041042
NumberRealEstateLoansOrLines            0.034271
MonthlyIncome_na                        0.005344
NumberOfDependents_na                   0.002700
dtype: float64


In [23]:
print_feature_importances(gb_model,X_train)

RevolvingUtilizationOfUnsecuredLines    0.191734
DebtRatio                               0.174215
MonthlyIncome                           0.140418
age                                     0.129085
NumberOfOpenCreditLinesAndLoans         0.090661
NumberOfTimes90DaysLate                 0.088518
NumberOfTime30-59DaysPastDueNotWorse    0.053856
NumberOfTime60-89DaysPastDueNotWorse    0.048158
NumberOfDependents                      0.041042
NumberRealEstateLoansOrLines            0.034271
MonthlyIncome_na                        0.005344
NumberOfDependents_na                   0.002700
dtype: float64


In [25]:
df_raw['NumberOfTime30-59DaysPastDueNotWorse'].value_counts()

0     126018
1      16033
2       4598
3       1754
4        747
5        342
98       264
6        140
7         54
8         25
9         12
96         5
10         4
12         2
13         1
11         1
Name: NumberOfTime30-59DaysPastDueNotWorse, dtype: int64

In [28]:
PATH = "data/give_me_credit/"
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)
columns = ['Id', 'SeriousDlqin2yrs','RevolvingUtilizationOfUnsecuredLines', 'age',
                 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
                 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
                 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
                 'NumberOfDependents']
df_raw.columns= columns
df_test.columns = columns
df_test.drop(['SeriousDlqin2yrs'], axis=1, inplace=True)
df = df_raw.copy()
combined,df,y,cat_cols,num_cols,test_id,n_train,n_test = preprocess_df(
                                       df_train=df,df_test=df_test,
                                       target_col='SeriousDlqin2yrs',
                                       id_col='Id',
                                       convert_to_cat_cols=[
                                       'NumberOfTime30-59DaysPastDueNotWorse',
                                       'NumberOfTime60-89DaysPastDueNotWorse'
                                       ],
                                       new_features_func=add_new_features1,
                                       remove_skewness=True,
                                       skew_threshold=0.75,
                                       boxcox_lambda=0.2,
                                       scale_mapper=RobustScaler()
                                       )

df = combined[:n_train]
df_test = combined[n_train:]
stratify_col = y

X_train,X_test,y_train,y_test = train_test_split(df,y,test_size=0.10,
                                  stratify=y,shuffle = True,random_state=20)

stratify_X_train = stratify_col[:X_train.shape[0]].copy()
X_train.shape,X_test.shape,y_train.shape,y_test.shape, stratify_X_train.shape
X_train,X_valid,y_train,y_valid = train_test_split(X_train,y_train,test_size=0.2,
                                  stratify=stratify_X_train,shuffle = True,random_state=20)
X_train.shape,X_valid.shape,y_train.shape,y_valid.shape

0    1
1    0
2    0
3    0
4    0
Name: SeriousDlqin2yrs, dtype: int64
{'MonthlyIncome': 5400.0, 'NumberOfDependents': 0.0, 'DepsIncomeComined': 0.0}
0.0


((108000, 46), (27000, 46), (108000,), (27000,))

In [29]:
rf_model = RandomForestClassifier(n_estimators=300,random_state=10, n_jobs=-1).fit(X_train,y_train)
rf_auc = roc_auc_score(y_valid,rf_model.predict_proba(X_valid)[:, 1])
print("AUC for Random Forest: {:.6f}".format(rf_auc))
preds_rf = rf_model.predict(X_valid)
confusion = confusion_matrix(y_valid, preds_rf)
print("Confusion matrix:\n{}".format(confusion))

AUC for Random Forest: 0.839244
Confusion matrix:
[[24844   295]
 [ 1521   340]]


In [30]:
gb_model = GradientBoostingClassifier(n_estimators=300,random_state=10).fit(X_train,y_train)
gb_auc = roc_auc_score(y_valid,gb_model.predict_proba(X_valid)[:, 1])
print("AUC for Gradient Boost: {:.6f}".format(gb_auc))

preds_gb = gb_model.predict(X_valid)
confusion = confusion_matrix(y_valid, preds_gb)
print("Confusion matrix:\n{}".format(confusion))

AUC for Gradient Boost: 0.862075
Confusion matrix:
[[24869   270]
 [ 1496   365]]


In [31]:
PATH = "data/give_me_credit/"
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)
columns = ['Id', 'SeriousDlqin2yrs','RevolvingUtilizationOfUnsecuredLines', 'age',
                 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
                 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
                 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
                 'NumberOfDependents']
df_raw.columns= columns
df_test.columns = columns
df_test.drop(['SeriousDlqin2yrs'], axis=1, inplace=True)
df = df_raw.copy()
combined,df,y,cat_cols,num_cols,test_id,n_train,n_test = preprocess_df(
                                       df_train=df,df_test=df_test,
                                       target_col='SeriousDlqin2yrs',
                                       id_col='Id',
                                       convert_to_cat_cols=[
                                       'NumberOfTime30-59DaysPastDueNotWorse',
                                       'NumberOfTime60-89DaysPastDueNotWorse'
                                       ],
                                       new_features_func=add_new_features2,
                                       remove_skewness=True,
                                       skew_threshold=0.75,
                                       boxcox_lambda=0.2,
                                       scale_mapper=RobustScaler()
                                       )

df = combined[:n_train]
df_test = combined[n_train:]
stratify_col = y

X_train,X_test,y_train,y_test = train_test_split(df,y,test_size=0.10,
                                  stratify=y,shuffle = True,random_state=20)

stratify_X_train = stratify_col[:X_train.shape[0]].copy()
X_train.shape,X_test.shape,y_train.shape,y_test.shape, stratify_X_train.shape
X_train,X_valid,y_train,y_valid = train_test_split(X_train,y_train,test_size=0.2,
                                  stratify=stratify_X_train,shuffle = True,random_state=20)
X_train.shape,X_valid.shape,y_train.shape,y_valid.shape

0    1
1    0
2    0
3    0
4    0
Name: SeriousDlqin2yrs, dtype: int64
{'MonthlyIncome': 5400.0, 'NumberOfDependents': 0.0, 'DepsIncomeComined': 0.0}
0.0


((108000, 49), (27000, 49), (108000,), (27000,))

In [27]:
rf_model = RandomForestClassifier(n_estimators=300,random_state=10, n_jobs=-1).fit(X_train,y_train)
rf_auc = roc_auc_score(y_valid,rf_model.predict_proba(X_valid)[:, 1])
print("AUC for Random Forest: {:.6f}".format(rf_auc))
preds_rf = rf_model.predict(X_valid)
confusion = confusion_matrix(y_valid, preds_rf)
print("Confusion matrix:\n{}".format(confusion))

AUC for Random Forest: 0.837253
Confusion matrix:
[[24837   302]
 [ 1521   340]]


In [32]:
gb_model = GradientBoostingClassifier(n_estimators=300,random_state=10).fit(X_train,y_train)
gb_auc = roc_auc_score(y_valid,gb_model.predict_proba(X_valid)[:, 1])
print("AUC for Gradient Boost: {:.6f}".format(gb_auc))

preds_gb = gb_model.predict(X_valid)
confusion = confusion_matrix(y_valid, preds_gb)
print("Confusion matrix:\n{}".format(confusion))

AUC for Gradient Boost: 0.861906
Confusion matrix:
[[24876   263]
 [ 1490   371]]
