In [20]:
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import datetime
from time import time
import sklearn
from sklearn.model_selection import train_test_split,cross_val_score,KFold,StratifiedKFold,ShuffleSplit,StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.feature_selection import RFE,SelectPercentile,f_regression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import ElasticNet

from sklearn.pipeline import Pipeline,make_pipeline

from sklearn.externals import joblib
from sklearn.metrics import confusion_matrix,roc_auc_score, make_scorer
from sklearn_pandas import DataFrameMapper
from pandas.api.types import is_string_dtype, is_numeric_dtype
from scipy import stats
from scipy.stats import skew,randint
from scipy.special import boxcox1p
from scipy.stats import randint as sp_randint
%matplotlib inline

In [21]:
def print_feature_importances(model,X):
    """Print the 50 largest feature importances of a fitted model.

    Parameters
    ----------
    model : object exposing ``feature_importances_``, one value per column of ``X``.
    X : DataFrame whose column names label the importances.

    The importances are sorted in descending order before printing.
    """
    # Use the model that was passed in; the previous version read a global
    # named ``rf_model`` and ignored the ``model`` argument.
    important_features = pd.Series(data=model.feature_importances_, index=X.columns)
    important_features = important_features.sort_values(ascending=False)
    print(important_features.head(50))
    
def get_cat_columns_by_type(df):
    """Return ``(column_name, kind)`` pairs for every non-numeric column.

    ``kind`` is ``'string'`` when ``is_string_dtype`` accepts the column and
    ``'categorical'`` for any other column that is not numeric. Numeric
    columns are left out. Pairs follow the column order of ``df``.
    """
    def _kind_of(values):
        # The string check comes first, exactly as in the if/elif it replaces.
        if is_string_dtype(values):
            return 'string'
        if not is_numeric_dtype(values):
            return 'categorical'
        return None

    labelled = ((name, _kind_of(values)) for name, values in df.items())
    return [(name, kind) for name, kind in labelled if kind is not None]

def get_numeric_columns(df):
    """Return the names of the columns of ``df`` that have a numeric dtype, in column order."""
    return [name for name, values in df.items() if is_numeric_dtype(values)]
    
def get_missing_values_percentage(df):
    """Return the percentage (0-100) of cells in ``df`` that are null.

    An empty frame has no cells, so it reports ``0.0`` instead of dividing by zero.
    """
    # df.size is rows * columns; it replaces np.product(df.shape), which was
    # removed in NumPy 2.0.
    total_values = df.size
    if total_values == 0:
        return 0.0
    total_missing = df.isnull().sum().sum()
    # percent of data that is missing
    return (total_missing / total_values) * 100

def get_missing_columns(df1,df2):
    """Compare the column sets of two frames.

    Returns ``(missing1, missing2)``: ``missing1`` lists the columns of ``df2``
    that ``df1`` lacks, and ``missing2`` lists the columns of ``df1`` that
    ``df2`` lacks. Each list keeps the order of the frame it was read from.
    """
    absent_from_df2 = [name for name in df1.columns if name not in df2.columns]
    absent_from_df1 = [name for name in df2.columns if name not in df1.columns]
    return (absent_from_df1, absent_from_df2)


def convert_to_str_type(df_in,columns,inplace=False):
    """Cast the given columns to ``str``.

    Works on ``df_in`` itself when ``inplace`` is true, otherwise on a copy.
    The frame that was modified is returned.
    """
    target = df_in if inplace else df_in.copy()
    for name in columns:
        as_text = target[name].astype(str)
        target[name] = as_text
    return target

    
def handle_missing_values(df_in,cat_cols=None, num_cols=None,na_dict=None,add_nan_col=True,inplace=False):
    """Impute missing values and integer-encode categorical columns.

    Parameters
    ----------
    df_in : DataFrame to process.
    cat_cols : names of categorical columns. Names not present in the frame
        are skipped. Present columns have nulls replaced by the column mode
        and are then replaced by integer codes assigned in sorted order of
        the distinct values.
    num_cols : names of numeric columns to impute. A column is only touched
        when it actually contains nulls.
    na_dict : optional ``{column: fill_value}`` mapping. A column found here
        is filled with the stored value, otherwise with its median. The
        mapping is updated in place with every fill value used.
    add_nan_col : when true (default), a boolean ``<column>_na`` indicator is
        added for each imputed numeric column.
    inplace : modify ``df_in`` itself instead of a copy.

    Returns
    -------
    ``(df, na_dict)``: the processed frame and the fill values used.
    """
    df = df_in if inplace else df_in.copy()
    # None defaults replace the mutable list defaults of the old signature.
    cat_cols = [] if cat_cols is None else cat_cols
    num_cols = [] if num_cols is None else num_cols
    if na_dict is None:
        na_dict = {}

    # Snapshot the column names: the loop adds "<col>_na" columns as it goes.
    for colname in list(df.columns):
        if colname not in num_cols:
            continue
        col_values = df[colname]
        missing_mask = pd.isnull(col_values)
        if not missing_mask.any():
            continue
        if add_nan_col:
            df[colname + '_na'] = missing_mask
        filler = na_dict[colname] if colname in na_dict else col_values.median()
        df[colname] = col_values.fillna(filler)
        na_dict[colname] = filler

    for colname in cat_cols:
        if colname not in df.columns:
            continue
        col_values = df[colname]
        modes = col_values.mode()
        # An all-null column has no mode; its values are then left unfilled
        # and factorize marks them with the code -1.
        if len(modes):
            # Assign the result back; an in-place fillna on df[colname] may
            # act on a temporary and leave the frame untouched.
            col_values = col_values.fillna(modes.iloc[0])
        # Sorted factorization replaces the LabelEncoder the old code called
        # without importing it.
        codes, _ = pd.factorize(col_values, sort=True)
        df[colname] = codes

    return (df,na_dict)



def scale_num_cols(df_in, mapper, inplace=False):
    """Apply a fitted column mapper to the frame and write the result back.

    When ``mapper`` is ``None`` a ``DataFrameMapper`` with one
    ``StandardScaler`` per numeric column is built and fitted on the frame.
    The mapper output is stored under ``mapper.transformed_names_``.

    Returns ``(df, mapper)``; ``df`` is ``df_in`` itself when ``inplace`` is
    true, otherwise a copy.
    """
    frame = df_in if inplace else df_in.copy()

    if mapper is None:
        per_column_scalers = []
        for name in frame.columns:
            if is_numeric_dtype(frame[name]):
                per_column_scalers.append(([name], StandardScaler()))
        mapper = DataFrameMapper(per_column_scalers).fit(frame)

    # transform() runs before transformed_names_ is read, which is the
    # evaluation order of the single assignment statement this replaces.
    scaled = mapper.transform(frame)
    frame[mapper.transformed_names_] = scaled
    return (frame, mapper)



def extract_and_drop_target_column(df_in, y_name, inplace=False):
    """Split the target column off a frame.

    Parameters
    ----------
    df_in : DataFrame containing the target column.
    y_name : name of the target column.
    inplace : drop the column from ``df_in`` itself instead of from a copy.

    Returns
    -------
    ``(df, y)``: the frame without the target column, and the target. A
    numeric target is returned unchanged as a Series. Any other target is
    returned as a NumPy array of integer category codes.
    """
    df = df_in if inplace else df_in.copy()
    target = df[y_name]
    if is_numeric_dtype(target):
        y = target
    else:
        # Cast to category first so plain string/object targets work too;
        # ``.cat`` is only available on categorical data. The cast leaves an
        # already-categorical column as it is.
        y = target.astype('category').cat.codes.values
    df.drop([y_name], axis=1, inplace=True)
    return (df,y)

def print_mse(m,X_train, X_valid, y_train, y_valid):
    """Print mean squared error and ``m.score`` for the training and validation sets.

    ``m`` must provide ``predict`` and ``score``. If it also has an
    ``oob_score_`` attribute, that value is printed on a second line.
    """
    # mean_squared_error is reached through the imported ``metrics`` module;
    # the bare name used before was never imported in this notebook.
    mse_train = metrics.mean_squared_error(y_train, m.predict(X_train))
    mse_valid = metrics.mean_squared_error(y_valid, m.predict(X_valid))
    score_train = m.score(X_train, y_train)
    score_valid = m.score(X_valid, y_valid)
    print('MSE Training set = {}, MSE Validation set = {}, score Training Set = {}, score on Validation Set = {}'.format(mse_train,mse_valid,score_train,score_valid))
    if hasattr(m, 'oob_score_'):
        print('OOB Score = {}'.format(m.oob_score_))

def get_iqr_min_max(df,cols):
    """Compute 1.5 * IQR bounds for selected columns.

    Returns ``{column: (lower, upper)}`` for every column of ``df`` whose name
    is in ``cols``, where ``lower = Q1 - 1.5 * IQR`` and
    ``upper = Q3 + 1.5 * IQR``.
    """
    bounds = {}
    for name, values in df.items():
        if name in cols:
            q1, q3 = np.percentile(values, [25, 75])
            # Inter-quartile range
            spread = q3 - q1
            bounds[name] = (q1 - (spread * 1.5), q3 + (spread * 1.5))
    return bounds


def bin_numerical_columns(df_in,cols,inplace=False):
    """Replace numeric columns by equal-width interval bins.

    ``cols`` maps a column name to a ``(start, stop, num)`` spec. The bin edges
    are ``np.linspace(start, stop, num)`` and the column is replaced by the
    result of ``pd.cut`` with ``include_lowest=True``.

    Works on ``df_in`` itself when ``inplace`` is true, otherwise on a copy,
    and returns the modified frame.
    """
    frame = df_in if inplace else df_in.copy()
    for name, spec in cols.items():
        start, stop, n_edges = spec[0], spec[1], spec[2]
        edges = np.linspace(start, stop, n_edges)
        frame[name] = pd.cut(frame[name], edges, include_lowest=True)
    return frame

In [22]:
def preprocess_df(df_train,df_test=None,
                  log_y=False,
                  id_col= None,test_id=None,
                  target_col=None,
                  convert_to_cat_cols=None,
                  remove_skewness=False,
                  skew_threshold=0.75,
                  boxcox_lambda=0.15,
                  scale_mapper=None,
                  bin_columns_dict=None,
                  new_features_func=None):
    """Run the shared preprocessing pipeline on the train (and optional test) frame.

    Steps, in order: split off the target, stack train and test, drop the id
    column, add engineered features, cast/bin selected columns, one-hot encode
    the non-numeric columns, impute missing numeric values, optionally reduce
    skewness, and scale.

    Parameters
    ----------
    df_train : training frame. When ``target_col`` is given, that column is
        dropped from this frame in place.
    df_test : optional test frame, stacked below the training rows.
    log_y : apply ``np.log1p`` to the target.
    id_col : identifier column to drop from the combined frame.
    test_id : when not ``None`` (and ``id_col`` and ``df_test`` are given),
        the test ids are returned; with ``id_col`` set and ``test_id`` left
        as ``None``, ``None`` is returned.
    target_col : name of the target column, or ``None`` for no target.
    convert_to_cat_cols : columns to cast to ``str`` so they get one-hot encoded.
    remove_skewness : apply ``boxcox1p`` to the numeric columns, with lambda 0
        where ``|skew| > skew_threshold`` and ``boxcox_lambda`` where
        ``|skew| <= skew_threshold``.
    scale_mapper : scaler prototype applied to every numeric column. With
        ``None``, ``scale_num_cols`` falls back to its own default.
    bin_columns_dict : spec passed to ``bin_numerical_columns``.
    new_features_func : callable that takes and returns the combined frame.

    Returns
    -------
    ``(combined, df, y, cat_cols, num_cols, test_id, n_train, n_test)``
    """
    if target_col is not None:
        df,y = extract_and_drop_target_column(df_train,target_col,inplace=True)
        # Slicing works for both Series and ndarray targets.
        print(y[:5])
        if log_y:
            y = np.log1p(y)
    else:
        # Without a target the training frame is used as is; ``df`` was
        # previously left undefined on this path.
        df = df_train
        y = None

    frames = [df] if df_test is None else [df, df_test]
    combined = pd.concat(frames).reset_index(drop=True)

    if id_col is not None:
        combined.drop(id_col, axis=1,inplace=True)
        if test_id is not None and df_test is not None:
            test_id = df_test[id_col].copy()
        else:
            test_id = None

    if new_features_func is not None:
        combined = new_features_func(combined)

    if convert_to_cat_cols is not None:
        combined = convert_to_str_type(combined,convert_to_cat_cols,inplace=True)

    if bin_columns_dict is not None:
        combined = bin_numerical_columns(combined,bin_columns_dict,inplace=True)

    cat_cols = [name for name, _ in get_cat_columns_by_type(combined)]
    num_cols = [col for col in combined.columns if col not in cat_cols]

    combined = pd.get_dummies(combined,columns=cat_cols, dummy_na=True)

    n_train = df.shape[0]
    n_test = 0 if df_test is None else df_test.shape[0]

    combined,d = handle_missing_values(combined,cat_cols=cat_cols,
                                       num_cols=num_cols,inplace=True)

    print(d)
    if remove_skewness:
        skewness = combined[num_cols].apply(lambda x: skew(x)).sort_values(ascending=False)
        # Select on the Series itself. Masking a one-column DataFrame keeps
        # every row (as NaN), which previously sent each feature through BOTH
        # transforms below.
        skewed_features_log = skewness.index[skewness.abs() > skew_threshold]
        skewed_features_other = skewness.index[skewness.abs() <= skew_threshold]
        for feature in skewed_features_log:
            # lambda 0 makes boxcox1p equal to log1p
            combined[feature] = boxcox1p(combined[feature],0.0)
        for feature in skewed_features_other:
            combined[feature] = boxcox1p(combined[feature],boxcox_lambda)

    if scale_mapper is not None:
        import copy
        # Give each column its own copy of the scaler.
        # NOTE(review): presumably a shared instance is re-fitted per column,
        # so only the last column's statistics would be used - confirm
        # against the sklearn_pandas version in use.
        map_f = [([c],copy.deepcopy(scale_mapper)) for c in num_cols]
        mapper = DataFrameMapper(map_f).fit(combined)
    else:
        mapper = None

    combined,_ = scale_num_cols(combined,mapper,inplace=True)

    print(get_missing_values_percentage(combined))

    return combined,df,y,cat_cols,num_cols,test_id,n_train,n_test


In [23]:
def add_new_features1(df):
    """Add three pairwise-product features to ``df`` in place and return it."""
    # (new column, left factor, right factor)
    products = (
        ('DepsIncomeComined', 'NumberOfDependents', 'MonthlyIncome'),
        ('Times90DaysLateDebtRatio', 'NumberOfTimes90DaysLate', 'DebtRatio'),
        ('Times90DaysLateRevolving', 'NumberOfTimes90DaysLate', 'RevolvingUtilizationOfUnsecuredLines'),
    )
    for new_name, left, right in products:
        df[new_name] = df[left] * df[right]
    return df
def add_new_features2(df):
    """Add product features plus square, cube and square-root of revolving utilization.

    The columns are added to ``df`` in place and the same frame is returned.
    """
    # (new column, left factor, right factor)
    products = (
        ('DepsIncomeComined', 'NumberOfDependents', 'MonthlyIncome'),
        ('Times90DaysLateDebtRatio', 'NumberOfTimes90DaysLate', 'DebtRatio'),
        ('Times90DaysLateRevolving', 'NumberOfTimes90DaysLate', 'RevolvingUtilizationOfUnsecuredLines'),
    )
    for new_name, left, right in products:
        df[new_name] = df[left] * df[right]

    revolving = df['RevolvingUtilizationOfUnsecuredLines']
    for exponent in (2, 3):
        df['RevolvingUtilizationOfUnsecuredLines-{}'.format(exponent)] = revolving ** exponent
    df['RevolvingUtilizationOfUnsecuredLines-sqrt'] = np.sqrt(revolving)

    return df


In [24]:
# Load the raw train/test CSVs and give both the same column names.
PATH = "data/give_me_credit/"
df_raw = pd.read_csv(f'{PATH}train.csv', low_memory=False)
df_test = pd.read_csv(f'{PATH}test.csv', low_memory=False)
columns = ['Id', 'SeriousDlqin2yrs','RevolvingUtilizationOfUnsecuredLines', 'age',
                 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio', 'MonthlyIncome',
                 'NumberOfOpenCreditLinesAndLoans', 'NumberOfTimes90DaysLate',
                 'NumberRealEstateLoansOrLines', 'NumberOfTime60-89DaysPastDueNotWorse',
                 'NumberOfDependents']
df_raw.columns= columns
df_test.columns = columns
# The target column is removed from the test frame before preprocessing.
df_test.drop(['SeriousDlqin2yrs'], axis=1, inplace=True)

# preprocess_df drops the target from the frame it receives, so hand it a copy
# and keep df_raw intact.
df = df_raw.copy()
combined,df,y,cat_cols,num_cols,test_id,n_train,n_test = preprocess_df(
                                       df_train=df,df_test=df_test,
                                       target_col='SeriousDlqin2yrs',
                                       id_col='Id',test_id='Id',
                                       convert_to_cat_cols=[
                                       'NumberOfTime30-59DaysPastDueNotWorse',
                                       'NumberOfTime60-89DaysPastDueNotWorse'
                                       ],
                                       new_features_func=add_new_features2,
                                       remove_skewness=True,
                                       skew_threshold=0.75,
                                       boxcox_lambda=0.2,
                                       scale_mapper=RobustScaler()
                                       )

# The first n_train rows of the combined frame are the training rows.
df = combined[:n_train]
df_test = combined[n_train:]
stratify_col = y

# Hold out 10% of the labelled rows as a test split.
X_train,X_test,y_train,y_test = train_test_split(df,y,test_size=0.10,
                                  stratify=y,shuffle = True,random_state=20)

# Stratify the second split on the labels of the rows actually in X_train.
# The first len(X_train) entries of y belong to different rows once the first
# split has shuffled the data.
stratify_X_train = y_train.copy()
assert len(stratify_X_train) == X_train.shape[0]
X_train,X_valid,y_train,y_valid = train_test_split(X_train,y_train,test_size=0.2,
                                  stratify=stratify_X_train,shuffle = True,random_state=20)
X_train.shape,X_valid.shape,y_train.shape,y_valid.shape

0    1
1    0
2    0
3    0
4    0
Name: SeriousDlqin2yrs, dtype: int64
{'MonthlyIncome': 5400.0, 'NumberOfDependents': 0.0, 'DepsIncomeComined': 0.0}
0.0


((108000, 49), (27000, 49), (108000,), (27000,))

In [25]:
# Restore the previously saved base models from the working directory.
# NOTE(review): joblib.load unpickles the file contents, so only load files
# from a trusted source. The recorded run of this cell stopped with
# FileNotFoundError on the first file.

# Random-forest search object and the models saved as rank1..rank3.
rf_randomSearch = joblib.load('randomSearch_rf_credit.pkl')
rf_model_rank1 =  joblib.load('rf_model_credit_rank1.pkl')
rf_model_rank2 =  joblib.load('rf_model_credit_rank2.pkl')
rf_model_rank3 =  joblib.load('rf_model_credit_rank3.pkl')

# Gradient-boosting search object, rank1..rank3 models and the RFE model.
gb_randomSearch = joblib.load('randomSearch_gb_credit.pkl')
gb_model_rank1 =  joblib.load('gb_model_credit_rank1.pkl')
gb_model_rank2 =  joblib.load('gb_model_credit_rank2.pkl')
gb_model_rank3 =  joblib.load('gb_model_credit_rank3.pkl')
gb_rfe_model =    joblib.load('rfe_model_credit.pkl')

# k-nearest-neighbours search object and rank1..rank3 models.
knn_randomSearch = joblib.load('randomSearch_knn.pkl')
knn_model_rank1 =  joblib.load('knn_model_rank1.pkl')
knn_model_rank2 =  joblib.load('knn_model_rank2.pkl')
knn_model_rank3 =  joblib.load('knn_model_rank3.pkl')

# Naive Bayes model.
nb_model = joblib.load('nb_model.pkl')

FileNotFoundError: [Errno 2] No such file or directory: 'randomSearch_rf_credit.pkl'

In [None]:
# Second column of predict_proba from every base model on the validation split,
# keyed by model name.
preds_X_valid = {}
preds_X_valid.update({
    model_name: fitted_model.predict_proba(X_valid)[:, 1]
    for model_name, fitted_model in (
        ('rf_randomSearch', rf_randomSearch),
        ('rf_model_rank1', rf_model_rank1),
        ('rf_model_rank2', rf_model_rank2),
        ('rf_model_rank3', rf_model_rank3),
        ('gb_randomSearch', gb_randomSearch),
        ('gb_model_rank1', gb_model_rank1),
        ('gb_model_rank2', gb_model_rank2),
        ('gb_model_rank3', gb_model_rank3),
        ('gb_rfe_model', gb_rfe_model),
        ('knn_randomSearch', knn_randomSearch),
        ('knn_model_rank1', knn_model_rank1),
        ('knn_model_rank2', knn_model_rank2),
        ('knn_model_rank3', knn_model_rank3),
        ('nb_model', nb_model),
    )
})

In [None]:
# One column per base model, in the order the predictions were collected.
valid_model_names = preds_X_valid.keys()
preds_df_valid = pd.DataFrame(preds_X_valid, columns=valid_model_names)
preds_df_valid.head()

In [None]:
# (rows, columns) of the validation prediction frame.
preds_df_valid.shape

In [None]:
# Second column of predict_proba from every base model on the hold-out split
# (X_test), keyed by model name.
preds_X_test = {}
preds_X_test.update({
    model_name: fitted_model.predict_proba(X_test)[:, 1]
    for model_name, fitted_model in (
        ('rf_randomSearch', rf_randomSearch),
        ('rf_model_rank1', rf_model_rank1),
        ('rf_model_rank2', rf_model_rank2),
        ('rf_model_rank3', rf_model_rank3),
        ('gb_randomSearch', gb_randomSearch),
        ('gb_model_rank1', gb_model_rank1),
        ('gb_model_rank2', gb_model_rank2),
        ('gb_model_rank3', gb_model_rank3),
        ('gb_rfe_model', gb_rfe_model),
        ('knn_randomSearch', knn_randomSearch),
        ('knn_model_rank1', knn_model_rank1),
        ('knn_model_rank2', knn_model_rank2),
        ('knn_model_rank3', knn_model_rank3),
        ('nb_model', nb_model),
    )
})

In [None]:
# One column per base model, in the order the predictions were collected.
holdout_model_names = preds_X_test.keys()
preds_df_test = pd.DataFrame(preds_X_test, columns=holdout_model_names)
preds_df_test.head()

In [None]:
# Meta-model: standardise the base-model predictions, then fit an ElasticNet.
# The step names are written out because the search grids address the
# regressor's parameters through the 'elasticnet__' prefix.
meta_pipeline = Pipeline(steps=[
    ('standardscaler', StandardScaler()),
    ('elasticnet', ElasticNet(warm_start=True)),
])


In [26]:
# Candidate values for the ElasticNet step of meta_pipeline; the
# 'elasticnet__' prefix routes each entry to that pipeline step.
params = {'elasticnet__alpha':[0.001,0.01,0.1,1.],
          'elasticnet__l1_ratio': [0.4,0.5,0.6,0.7,0.8,0.9],
          'elasticnet__max_iter':[1000,2000,5000,10000],
          'elasticnet__selection':['cyclic','random']
         }

# Randomised search (20 sampled settings, 6 parallel jobs) fitted on the
# base-model predictions for the validation split.
# NOTE(review): no random_state is passed, so the sampled settings can differ
# between runs.
rs_meta_elastic = RandomizedSearchCV(meta_pipeline,param_distributions=params,
                                          n_jobs=6, n_iter=20).fit(preds_df_valid,y_valid)


NameError: name 'preds_df_valid' is not defined

In [27]:
# Same grid as the search fitted on the validation predictions.
params = {'elasticnet__alpha':[0.001,0.01,0.1,1.],
          'elasticnet__l1_ratio': [0.4,0.5,0.6,0.7,0.8,0.9],
          'elasticnet__max_iter':[1000,2000,5000,10000],
          'elasticnet__selection':['cyclic','random']
         }

# NOTE(review): this rebinds rs_meta_elastic, replacing the search fitted on
# the validation predictions with one fitted on the hold-out split
# (preds_df_test, y_test). After this cell no labelled data remains unseen by
# the meta-model - confirm that this is intended.
rs_meta_elastic = RandomizedSearchCV(meta_pipeline,param_distributions=params,
                                          n_jobs=6, n_iter=20).fit(preds_df_test,y_test)




ValueError: Found input variables with inconsistent numbers of samples: [0, 15000]

In [28]:
# Second column of predict_proba from every base model on the preprocessed
# test frame (df_test), keyed by model name.
preds_test = {}
preds_test.update({
    model_name: fitted_model.predict_proba(df_test)[:, 1]
    for model_name, fitted_model in (
        ('rf_randomSearch', rf_randomSearch),
        ('rf_model_rank1', rf_model_rank1),
        ('rf_model_rank2', rf_model_rank2),
        ('rf_model_rank3', rf_model_rank3),
        ('gb_randomSearch', gb_randomSearch),
        ('gb_model_rank1', gb_model_rank1),
        ('gb_model_rank2', gb_model_rank2),
        ('gb_model_rank3', gb_model_rank3),
        ('gb_rfe_model', gb_rfe_model),
        ('knn_randomSearch', knn_randomSearch),
        ('knn_model_rank1', knn_model_rank1),
        ('knn_model_rank2', knn_model_rank2),
        ('knn_model_rank3', knn_model_rank3),
        ('nb_model', nb_model),
    )
})

NameError: name 'rf_randomSearch' is not defined

In [29]:
# One column per base model, in the order the predictions were collected.
test_model_names = preds_test.keys()
preds_test_df = pd.DataFrame(preds_test, columns=test_model_names)

In [30]:
# Meta-model output for the test frame, computed from the base-model predictions.
# NOTE(review): the meta-model is an ElasticNet regressor, so these values are
# presumably not constrained to [0, 1] - confirm before treating them as probabilities.
meta_preds_test = rs_meta_elastic.predict(preds_test_df)

NameError: name 'rs_meta_elastic' is not defined

In [31]:
# Submission frame: test ids next to the meta-model output.
# NOTE(review): the two columns are joined on their indexes; this assumes
# test_id carries a default 0..n-1 index like the new Series - confirm.
meta_df_submit = pd.DataFrame({'Id':test_id, 'Probability': pd.Series(meta_preds_test)},
              columns=['Id', 'Probability'])
meta_df_submit.head()

NameError: name 'meta_preds_test' is not defined

In [32]:
# Last rows of the submission frame.
meta_df_submit.tail()

NameError: name 'meta_df_submit' is not defined