# Imported modules

In [144]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from tpot import TPOTClassifier
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
import eli5
from eli5.sklearn import PermutationImportance
%config InlineBackend.figure_format='retina'

In [78]:
X_train = pd.read_csv('data/train_features.csv')
X_test = pd.read_csv('data/test_features.csv')
y_train = pd.read_csv('data/train_labels.csv')['charged_off']
sample_submission = pd.read_csv('data/sample_submission.csv')

X_train.shape, X_test.shape, y_train.shape

((1309457, 103), (26724, 103), (1309457,))

In [79]:
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, train_size=40000, 
    random_state=42, stratify=y_train)

X_train.shape, X_val.shape, y_train.shape, y_val.shape



((40000, 103), (1269457, 103), (40000,), (1269457,))

# Basic Feature Engineering

In [80]:
def wrangle(X):
    X = X.copy()
    
    # Drop some columns
    X = X.drop(columns='id')  # id is random
    X = X.drop(columns=['member_id', 'url', 'desc'])  # All null
    X = X.drop(columns='title')  # Duplicative of purpose
    X = X.drop(columns='grade')  # Duplicative of sub_grade
    
    # Transform sub_grade from "A1" - "G5" to 1.1 - 7.5
    def wrangle_sub_grade(x):
        first_digit = ord(x[0]) - 64
        second_digit = int(x[1])
        return first_digit + second_digit/10
    
    X['sub_grade'] = X['sub_grade'].apply(wrangle_sub_grade)

    # Convert percentages from strings to floats
    X['int_rate'] = X['int_rate'].str.strip('%').astype(float)
    X['revol_util'] = X['revol_util'].str.strip('%').astype(float)
        
    # Transform earliest_cr_line to an integer: how many days it's been open
    X['earliest_cr_line'] = pd.to_datetime(X['earliest_cr_line'], infer_datetime_format=True)
    X['earliest_cr_line'] = pd.Timestamp.today() - X['earliest_cr_line']
    X['earliest_cr_line'] = X['earliest_cr_line'].dt.days
    
    # Create features for three employee titles: teacher, manager, owner
    X['emp_title'] = X['emp_title'].str.lower()
    X['emp_title_teacher'] = X['emp_title'].str.contains('teacher', na=False)
    X['emp_title_manager'] = X['emp_title'].str.contains('manager', na=False)
    X['emp_title_owner']   = X['emp_title'].str.contains('owner', na=False)
    
    # Drop categoricals with high cardinality
    X = X.drop(columns=['emp_title', 'zip_code'])
    
    # Transform features with many nulls to binary flags
    many_nulls = ['sec_app_mths_since_last_major_derog',
                  'sec_app_revol_util',
                  'sec_app_earliest_cr_line',
                  'sec_app_mort_acc',
                  'dti_joint',
                  'sec_app_collections_12_mths_ex_med',
                  'sec_app_chargeoff_within_12_mths',
                  'sec_app_num_rev_accts',
                  'sec_app_open_act_il',
                  'sec_app_open_acc',
                  'revol_bal_joint',
                  'annual_inc_joint',
                  'sec_app_inq_last_6mths',
                  'mths_since_last_record',
                  'mths_since_recent_bc_dlq',
                  'mths_since_last_major_derog',
                  'mths_since_recent_revol_delinq',
                  'mths_since_last_delinq',
                  'il_util',
                  'emp_length',
                  'mths_since_recent_inq',
                  'mo_sin_old_il_acct',
                  'mths_since_rcnt_il',
                  'num_tl_120dpd_2m',
                  'bc_util',
                  'percent_bc_gt_75',
                  'bc_open_to_buy',
                  'mths_since_recent_bc']

    for col in many_nulls:
        X[col] = X[col].isnull()
    
    # For features with few nulls, do mean imputation
    for col in X:
        if X[col].isnull().sum() > 0:
            X[col] = X[col].fillna(X[col].mean())
    
    # Return the wrangled dataframe
    return X

In [81]:
X_train = wrangle(X_train)
X_val = wrangle(X_val)
X_test = wrangle(X_test)
X_train.shape, X_val.shape, X_test.shape

((40000, 98), (1269457, 98), (26724, 98))

# Random Forest

In [89]:
pipe = make_pipeline(
    ce.OrdinalEncoder(), 
    RandomForestClassifier(
        n_estimators=100, 
        class_weight='balanced', 
        min_samples_leaf=0.005, 
        oob_score=True, 
        n_jobs=-1)
)

cross_val_score(pipe, X_train, y_train, cv=5, scoring='roc_auc')

array([0.72166073, 0.71818529, 0.70690291, 0.72988441, 0.71853243])

# OOB Random Forest

In [90]:
pipe.fit(X_train, y_train)
y_pred_proba = pipe.named_steps['randomforestclassifier'].oob_decision_function_[:, 1]
print('ROC AUC, Out-of-Bag estimate:', roc_auc_score(y_train, y_pred_proba))

ROC AUC, Out-of-Bag estimate: 0.7171838390078721


### Tune some hyperparams...

In [91]:
max_depths = list(range(2, 12, 2)) + [None]

for max_depth in max_depths:
    
    pipe = make_pipeline(
        ce.OrdinalEncoder(), 
        RandomForestClassifier(
            n_estimators=100, 
            class_weight='balanced', 
            max_depth=max_depth, 
            oob_score=True, 
            n_jobs=-1
        )
    )
        
    pipe.fit(X_train, y_train)
    y_pred_proba = pipe.named_steps['randomforestclassifier'].oob_decision_function_[:, 1]
    print('Max Depth:', max_depth)
    print('ROC AUC, OOB:', roc_auc_score(y_train, y_pred_proba))

Max Depth: 2
ROC AUC, OOB: 0.704247609936399
Max Depth: 4
ROC AUC, OOB: 0.7110098305233249
Max Depth: 6
ROC AUC, OOB: 0.7152228828813233
Max Depth: 8
ROC AUC, OOB: 0.7147043861681475
Max Depth: 10
ROC AUC, OOB: 0.712440469192982
Max Depth: None
ROC AUC, OOB: 0.6921191473369269


### eli5 permutation importance

In [149]:
import eli5
from eli5.sklearn import PermutationImportance

encoder = ce.OrdinalEncoder()
X_train_transformed = encoder.fit_transform(X_train)

model = RandomForestClassifier(
    n_estimators=100, 
    class_weight='balanced', 
    min_samples_leaf=0.005, 
    n_jobs=-1)

model.fit(X_train_transformed, y_train)
permuter = PermutationImportance(model, scoring='roc_auc', n_iter=1, cv='prefit')
permuter.fit(X_train_transformed, y_train)

PermutationImportance(cv='prefit',
           estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=0.005,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False),
           n_iter=1, random_state=None, refit=True, scoring='roc_auc')

In [97]:
eli5.show_weights(permuter, top=None, feature_names=X_train_transformed.columns.tolist())


Weight,Feature
0.0215  ± 0.0000,sub_grade
0.0148  ± 0.0000,int_rate
0.0132  ± 0.0000,term
0.0106  ± 0.0000,dti
0.0048  ± 0.0000,acc_open_past_24mths
0.0039  ± 0.0000,loan_amnt
0.0038  ± 0.0000,funded_amnt
0.0033  ± 0.0000,installment
0.0033  ± 0.0000,tot_hi_cred_lim
0.0032  ± 0.0000,mort_acc


Use Permutation Importance weights for feature selection...

In [147]:
subset = X_train.columns[permuter.feature_importances_ > 0]

pipe = make_pipeline(
    ce.OrdinalEncoder(), 
    RandomForestClassifier(
        n_estimators=100, 
        class_weight='balanced', 
        min_samples_leaf=0.005, 
        n_jobs=-1)
)

cross_val_score(pipe, X_train[subset], y_train, cv=5, scoring='roc_auc')

array([0.71977165, 0.71899949, 0.70487939, 0.73095455, 0.71941743])

# XGBoost

In [150]:
pipe = make_pipeline(ce.OrdinalEncoder(),
                     XGBClassifier(learning_rate=0.1, max_depth=3, min_child_weight=12, 
                     n_estimators=100, n_jobs=-1, subsample=0.9000000000000001))

cross_val_score(pipe, X_train[subset], y_train, cv=5, scoring='roc_auc')

array([0.72612208, 0.72333177, 0.71364379, 0.73662022, 0.72758173])

In [151]:
pipe.fit(X_train, y_train)
roc_auc_score(y_val, pipe.predict_proba(X_val)[:, 1])

0.7201518905672316

# Kaggle Submission

In [152]:
submission = sample_submission.copy()
submission['charged_off'] = pipe.predict_proba(X_test)[:, 1]
submission.to_csv('submission-004.csv', index=False)