Import All Packages

In [28]:
import pandas as pd
import numpy as np
import time

import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")
import seaborn as sns

import matplotlib.pyplot as plt
sns.set(style="white", color_codes=True)
from matplotlib.pyplot import cm
%matplotlib inline

from sklearn.manifold import TSNE

from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
import xgboost as xgb
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import StratifiedKFold
import sklearn.metrics
# from sklearn.metrics import balanced_accuracy_score

from sklearn.metrics import roc_auc_score, accuracy_score
# from sklearn.metrics import balanced_accuracy_score
from imblearn.over_sampling import SMOTE

ImportError: cannot import name 'balanced_accuracy_score'

Load All Data

In [3]:
def load_data(maxLines, duplicates):

    """
        Import datasets, select features and define the default-flag collumn.
    """
    
    col_acq = ['LoanID','Channel','SellerName','OrInterestRate','OrUnpaidPrinc','OrLoanTerm',
            'OrDate','FirstPayment','OrLTV','OrCLTV','NumBorrow','DTIRat','CreditScore',
            'FTHomeBuyer','LoanPurpose','PropertyType','NumUnits','OccStatus','PropertyState',
            'Zip','MortInsPerc','ProductType','CoCreditScore','MortInsType','RelMortInd']
    
    extended_selec_acq = ['LoanID', 'OrLTV', 'LoanPurpose', 'DTIRat', 'PropertyType', 'FTHomeBuyer', 'Channel', 'SellerName','OrInterestRate', 'CreditScore', 'NumBorrow', 'OrDate'] 
    col_acq_subset = extended_selec_acq 
    
    col_per = ['LoanID','MonthRep','Servicer','CurrInterestRate','CAUPB','LoanAge','MonthsToMaturity',
              'AdMonthsToMaturity','MaturityDate','MSA','CLDS','ModFlag','ZeroBalCode','ZeroBalDate',
              'LastInstallDate','ForeclosureDate','DispositionDate','PPRC','AssetRecCost','MHRC',
              'ATFHP','NetSaleProceeds','CreditEnhProceeds','RPMWP','OFP','NIBUPB','PFUPB','RMWPF',
              'FPWA','ServicingIndicator'] 
    
    extended_selec_per = ['LoanID', 'MonthsToMaturity', 'CurrInterestRate', 'ForeclosureDate', 'LoanAge', 'CLDS', 'MaturityDate','ZeroBalCode', 'MonthRep']
    
    col_per_subset =  extended_selec_per 
    
    lines_to_read = maxLines
    aquisition_frame = pd.read_csv('C:/Users/bebxadvberb/Documents/AI/Trusted AI/Acquisition_2007Q4.txt', sep='|', names=col_acq, usecols=col_acq_subset, index_col=False, nrows=lines_to_read )
    performance_frame = pd.read_csv('C:/Users/bebxadvberb/Documents/AI/Trusted AI/Performance_2007Q4.txt', sep='|', names=col_per, usecols=col_per_subset, index_col=False, nrows=lines_to_read) 

    """ Fix the IDs in the observation set by fixing their reporting date AND requiring that the files are healthy. """
    
    observation_frame = performance_frame[(performance_frame.MonthRep == '12/01/2017') & 
                                (   (performance_frame.CLDS == '0') | 
                                    (performance_frame.CLDS == '1') | 
                                    (performance_frame.CLDS == '2')
                                )
                                ]
    obs_ids = observation_frame.LoanID
    
    """ Load only the observation IDs in the performance frame initially. """
    pf = performance_frame[performance_frame.LoanID.isin(obs_ids)]
    
    """ Keep only the reporting dates that are in our performance period (MM/DD/YYYY format). """
    pf_obs = pf[
                    (pf.MonthRep == '01/01/2018') | 
                    (pf.MonthRep == '02/01/2018') |
                    (pf.MonthRep == '03/01/2018') |
                    (pf.MonthRep == '04/01/2018') |
                    (pf.MonthRep == '05/01/2018') |
                    (pf.MonthRep == '06/01/2018') |
                    (pf.MonthRep == '07/01/2018') |
                    (pf.MonthRep == '08/01/2018') |
                    (pf.MonthRep == '09/01/2018') |
                    (pf.MonthRep == '10/01/2018') |
                    (pf.MonthRep == '11/01/2018') |
                    (pf.MonthRep == '12/01/2018') 
                ]
    
    """ 
    Find the LoanIDs of those loans where a default appears in our performance period.
    """
    pf_obs_defaults = pf_obs[
                            (pf_obs.CLDS != '0') &
                            (pf_obs.CLDS != '1') &
                            (pf_obs.CLDS != '2') &
                            (pf_obs.CLDS != 'X')
                        ].LoanID
    
    pf_obs_defaults.drop_duplicates(keep='last', inplace=True)
    
    """ Merge the acquisition and performance frames. """
    df = pd.merge(aquisition_frame, observation_frame, on = 'LoanID', how='inner')
    
    df['Default'] = 0
    df.loc[df['LoanID'].isin(pf_obs_defaults), 'Default'] = 1
    
    
    return df

In [4]:
def make_target_var(df):
    df.rename(index=str, columns={'ForeclosureDate': 'Default'}, inplace= True)
    df['Default'].fillna(0, inplace=True)
    df.loc[df['Default'] != 0, 'Default'] = 1

In [5]:
def get_na_columns(df):
    na_columns = df.columns[df.isnull().any()]
    return na_columns

In [6]:
def get_cat_feat(df):
    cat_feat = df.select_dtypes(include=['object']).columns
    return cat_feat

def get_num_feat(df):
    num_feat = df.select_dtypes(exclude=['object']).columns
    return num_feat

In [7]:
def normalize(df):
    df_norm = df
    df_norm = df_norm.apply(lambda x: (x - x.min()) / (x.max() - x.min()))
    return df_norm

In [8]:
def makeDateNumeric(text):
    numMonths = int(text[:2])
    numYears = int(text[3:7])
    result = (numYears - 2000) * 12 + numMonths
    return result


def makeDayNumeric(text):
    numMonths = int(text[:2])
    numYears = int(text[6:10])
    result = (numYears - 2000) * 12 + numMonths
    return result

def make_dates_numeric(df):
    # TRANSFORM DATES TO NUMBER OF MONTHS (STARTING FROM 01/2000)
    df['MonthRep'] = df['MonthRep'].apply(makeDayNumeric)
    df['OrDate'] = df['OrDate'].apply(makeDateNumeric)
    df['FirstPayment'] = df['FirstPayment'].apply(makeDateNumeric)
    df['MaturityDate'] = df['MaturityDate'].apply(makeDateNumeric)

In [9]:
def to_2D(df):
        # TSNE - Dimentionality reduction
    print ('Training T-SNE ...')

    tsne = TSNE(n_components=2, random_state=1, n_iter=250, verbose=1)
    df_2D = tsne.fit_transform(df)
    df_2D = pd.DataFrame(df_2D)
    return df_2D

In [10]:
def tsne_visual(df):   
        # TSNE - Dimentionality reduction
    print ('Training T-SNE ...')

    tsne = TSNE(n_components=2, random_state=1, n_iter=250, verbose=1)
    df_2D = tsne.fit_transform(df.drop('Default', axis=1))
    df_2D = pd.DataFrame(df_2D)
    
    
    #Making plot
    plt.figure(figsize=(20,10))
    color = cm.rainbow(np.linspace(0,1, max(map(abs, set(df['Default'])))+1))
    for i in range(0, len(df_2D)):
        c1 = plt.scatter(df_2D.iloc[i][0], 
                         df_2D.iloc[i][1],
                     c = color[int(df.iloc[i]['Default'])],
                     marker=r"$ {} $".format(int(df.iloc[i]['Default'])), s=150, edgecolors='none')

    plt.xlabel('T-SNE Dim 1')
    plt.ylabel('T-SNE Dim 2')
    plt.title('Actives')
    plt.legend(loc='best')
    plt.grid(True)
    plt.savefig('Actives' + '.pdf', format='pdf', dpi=900)
    plt.show()

MAIN

In [11]:
df = load_data(100000, False)
# make_dates_numeric(df)

# FORCE FEATURES TO BE INCLUDED
df = df[pd.notnull(df['CreditScore'])]
# df = df[pd.notnull(df['CLDS'])]
df = df[pd.notnull(df['DTIRat'])]

# FORCE FEATURES TO BE EXCLUDED
df = df.drop('LoanID',axis=1)

# FORCE FEATURE TO BE CATEGORICAL
# df['Zip'] = df['Zip'].astype('object',copy=False)
# df['MSA'] = df['MSA'].astype('object',copy=False)


na_columns = get_na_columns(df)
df = df.drop(na_columns,axis=1)

cat_feat = get_cat_feat(df)
cat_data = df[cat_feat]
cat_data = cat_data.apply(LabelEncoder().fit_transform)

num_feat = get_num_feat(df)
num_data = normalize(df[num_feat])

df = pd.concat([num_data, cat_data], axis=1)
print(df.columns)

Index(['OrInterestRate', 'OrLTV', 'NumBorrow', 'DTIRat', 'CreditScore',
       'CurrInterestRate', 'LoanAge', 'MonthsToMaturity', 'Default', 'Channel',
       'SellerName', 'OrDate', 'FTHomeBuyer', 'LoanPurpose', 'PropertyType',
       'MonthRep', 'MaturityDate', 'CLDS'],
      dtype='object')


MAIN 2

In [12]:
"""
    Split the target variable from the input variables.
""" 
y = df['Default']
X = df.drop(['Default'], axis=1)

In [13]:
print(y.value_counts())

0.0    144
1.0     16
Name: Default, dtype: int64


StratifiedKFold(n_splits=10, random_state=None, shuffle=False)


In [14]:
"""
    Balance classes with Synthetic Minority Oversampling Technique (SMOTE) or Random Undersampling.
    Choose here either sampling_method = "Random Under Sampling" or "SMOTE". 
    Save n_1 and n_0 separately to apply a correction to the resampling when using the model to predict.
"""     

n_1 = sum(y)
n_0 = len(y) - sum(y)

sm = SMOTE()

X_cols = X.columns
X, y = sm.fit_sample(X, y) # fit_sample takes a dataframe, but returns an array. 
(X, y) = (pd.DataFrame(X, columns=X_cols), pd.Series(y))
print(y.value_counts())

0.0    144
1.0    144
dtype: int64


In [16]:
# X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
# y = np.array([0, 0, 1, 1])
skf = StratifiedKFold(n_splits=10)
skf.get_n_splits(X, y)

print(skf)

from xgboost import XGBClassifier
model = XGBClassifier()

StratifiedKFold(n_splits=10, random_state=None, shuffle=False)


In [24]:
for train_index, test_index in skf.split(X, y):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(y_train.isnull().values.any())
    
    # fit model on training data
    model.fit(X_train, y_train)
    # make predictions for test data
    y_pred = model.predict(X_test)
    predictions = [round(value) for value in y_pred]
    # evaluate predictions
    auc = roc_auc_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)
    print("Accuracy: %.2f%%" % (accuracy * 100.0), "AUC: %.2f%%" % (auc * 100.0))

False
144.0 288
Accuracy: 83.33% AUC: 83.33%
False
144.0 288
Accuracy: 93.33% AUC: 93.33%
False
144.0 288
Accuracy: 93.33% AUC: 93.33%
False
144.0 288
Accuracy: 96.67% AUC: 96.67%
False
144.0 288
Accuracy: 89.29% AUC: 89.29%
False
144.0 288
Accuracy: 100.00% AUC: 100.00%
False
144.0 288
Accuracy: 100.00% AUC: 100.00%
False
144.0 288
Accuracy: 100.00% AUC: 100.00%
False
144.0 288
Accuracy: 100.00% AUC: 100.00%
False
144.0 288
Accuracy: 96.43% AUC: 96.43%
