Import All Packages

In [11]:
import pandas as pd
import numpy as np

import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
warnings.filterwarnings("ignore")
import seaborn as sns

import matplotlib.pyplot as plt
sns.set(style="white", color_codes=True)
from matplotlib.pyplot import cm
%matplotlib inline

from sklearn.manifold import TSNE

from sklearn.preprocessing import LabelEncoder

Load All Data

In [102]:
def load_data(maxLines=None, data_dir='C:/Users/bebxadvberb/Documents/AI/Trusted AI'):
    """Read the 2007Q4 acquisition and performance files and inner-join them on LoanID.

    Both files are read as pipe-separated text; the column names are supplied
    here through ``names=``.

    Parameters
    ----------
    maxLines : int or None, optional
        Maximum number of rows to read from *each* file (forwarded to
        ``pd.read_csv`` as ``nrows``). ``None`` (the default) reads both
        files in full.
    data_dir : str, optional
        Folder containing ``Acquisition_2007Q4.txt`` and
        ``Performance_2007Q4.txt``. Defaults to the location this notebook
        was originally run from.

    Returns
    -------
    pandas.DataFrame
        One row per (acquisition row, performance row) pair sharing a LoanID.
        Because the row limit is applied to each file independently, loans
        present in only one of the truncated reads are dropped by the join.
    """
    # Column names of the Acquisition file
    col_acq = ['LoanID', 'Channel', 'SellerName', 'OrInterestRate', 'OrUnpaidPrinc', 'OrLoanTerm',
               'OrDate', 'FirstPayment', 'OrLTV', 'OrCLTV', 'NumBorrow', 'DTIRat', 'CreditScore',
               'FTHomeBuyer', 'LoanPurpose', 'PropertyType', 'NumUnits', 'OccStatus', 'PropertyState',
               'Zip', 'MortInsPerc', 'ProductType', 'CoCreditScore', 'MortInsType', 'RelMortInd']

    # Column names of the Performance file
    col_per = ['LoanID', 'MonthRep', 'Servicer', 'CurrInterestRate', 'CAUPB', 'LoanAge', 'MonthsToMaturity',
               'AdMonthsToMaturity', 'MaturityDate', 'MSA', 'CLDS', 'ModFlag', 'ZeroBalCode', 'ZeroBalDate',
               'LastInstallDate', 'ForeclosureDate', 'DispositionDate', 'PPRC', 'AssetRecCost', 'MHRC',
               'ATFHP', 'NetSaleProceeds', 'CreditEnhProceeds', 'RPMWP', 'OFP', 'NIBUPB', 'PFUPB', 'RMWPF',
               'FPWA', 'ServicingIndicator']

    # Forward slashes are accepted by pandas on Windows as well.
    acquisition_path = '{}/{}'.format(data_dir, 'Acquisition_2007Q4.txt')
    performance_path = '{}/{}'.format(data_dir, 'Performance_2007Q4.txt')

    aquisition_frame = pd.read_csv(acquisition_path, sep='|', names=col_acq, nrows=maxLines)
    # index_col=False stops pandas from using the first column as the index.
    performance_frame = pd.read_csv(performance_path, sep='|', names=col_per,
                                    index_col=False, nrows=maxLines)

    # Left disabled from the original: keep only the last performance row per loan.
    # performance_frame.drop_duplicates(subset='LoanID', keep='last', inplace=True)

    # Merge the two frames with an inner join on the loan identifier.
    df = pd.merge(aquisition_frame, performance_frame, on='LoanID', how='inner')

    return df

In [4]:
def make_target_var(df):
    """Turn the ``ForeclosureDate`` column into a binary ``Default`` target, in place.

    The column is renamed to ``Default``; missing values become 0 and every
    other non-zero value becomes 1. ``df`` is modified directly and nothing
    is returned.
    """
    # index=str is kept from the original call: it converts the row labels to strings.
    df.rename(index=str, columns={'ForeclosureDate': 'Default'}, inplace= True)
    # Assign the filled column back rather than calling fillna(inplace=True) on the
    # df['Default'] selection: with pandas Copy-on-Write that chained in-place call
    # only updates a temporary object and leaves df untouched.
    df['Default'] = df['Default'].fillna(0)
    df.loc[df['Default'] != 0, 'Default'] = 1

In [24]:
def get_na_columns(df):
    """Return the labels of every column of ``df`` holding at least one missing value."""
    has_missing = df.isna().any(axis=0)
    return df.columns[has_missing]

In [None]:
def get_cat_feat(df):
    """Return the labels of the object-dtype columns of ``df``."""
    object_columns = df.select_dtypes(include=['object'])
    return object_columns.columns

def get_num_feat(df):
    """Return the labels of every column of ``df`` whose dtype is not object."""
    non_object_columns = df.select_dtypes(exclude=['object'])
    return non_object_columns.columns

In [60]:
def normalize(df):
    """Min-max scale every column of ``df`` to the range [0, 1].

    Each column is transformed as ``(x - min) / (max - min)``. A constant
    column has ``max - min == 0``; it is mapped to 0 rather than to NaN
    (the result of 0/0), so the output stays usable by estimators that
    reject NaN. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame of numeric columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``df``.
    """
    col_min = df.min()
    # Substitute 1 for a zero range so constant columns divide to 0, not NaN.
    col_range = (df.max() - col_min).replace(0, 1)
    df_norm = (df - col_min) / col_range
    return df_norm

In [20]:
def makeDateNumeric(text):
    """Convert an ``MM/YYYY`` string to a month count relative to the year 2000.

    The month is read from characters 0-1 and the year from characters 3-6;
    the result is ``(year - 2000) * 12 + month``.
    """
    month = int(text[:2])
    year = int(text[3:7])
    return 12 * (year - 2000) + month


def makeDayNumeric(text):
    """Convert an ``MM/DD/YYYY`` string to a month count relative to the year 2000.

    The month is read from characters 0-1 and the year from characters 6-9;
    the day part is ignored. The result is ``(year - 2000) * 12 + month``.
    """
    month = int(text[:2])
    year = int(text[6:10])
    return 12 * (year - 2000) + month

def make_dates_numeric(df):
    """Replace the date columns of ``df`` with month counts, in place.

    ``MonthRep`` is converted with ``makeDayNumeric``; ``OrDate``,
    ``FirstPayment`` and ``MaturityDate`` are converted with
    ``makeDateNumeric``. Nothing is returned.
    """
    # (column, converter) pairs, applied in this order.
    conversions = [
        ('MonthRep', makeDayNumeric),
        ('OrDate', makeDateNumeric),
        ('FirstPayment', makeDateNumeric),
        ('MaturityDate', makeDateNumeric),
    ]
    for column, converter in conversions:
        df[column] = df[column].apply(converter)

In [36]:
def to_2D(df):
    """Project ``df`` onto two dimensions with t-SNE.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix passed directly to ``TSNE.fit_transform``.

    Returns
    -------
    pandas.DataFrame
        Two columns (labelled 0 and 1) holding the embedding, one row per
        input row, with a fresh default integer index.
    """
    import inspect

    # t-SNE dimensionality reduction
    print ('Training T-SNE ...')

    # Newer scikit-learn releases call the iteration-count argument `max_iter`
    # instead of `n_iter`; pick whichever name the installed TSNE accepts.
    tsne_params = inspect.signature(TSNE).parameters
    iter_kwarg = 'max_iter' if 'max_iter' in tsne_params else 'n_iter'

    tsne = TSNE(n_components=2, random_state=1, verbose=1, **{iter_kwarg: 250})
    df_2D = tsne.fit_transform(df)
    df_2D = pd.DataFrame(df_2D)
    return df_2D

In [56]:
def tsne_visual(df, df_2D):
    """Scatter-plot a 2-D embedding, marking each point with its ``Default`` class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Default`` column; its rows are matched to the rows of
        ``df_2D`` by position.
    df_2D : pandas.DataFrame
        Embedding whose first two columns are the x and y coordinates.

    The figure is shown and also written to ``Actives.pdf`` in the current
    working directory.
    """
    coords = df_2D.to_numpy()
    labels = df['Default'].astype(int).to_numpy()

    # One colour per integer class value from 0 up to the largest |label|.
    palette = cm.rainbow(np.linspace(0, 1, int(np.abs(labels).max()) + 1))
    labels = labels[:len(coords)]

    plt.figure(figsize=(20,10))
    # A single scatter call per class (instead of one per point) keeps the artist
    # count small and gives the legend a labelled handle for each class.
    for label in np.unique(labels):
        in_class = labels == label
        plt.scatter(coords[in_class, 0],
                    coords[in_class, 1],
                    c=[palette[label]],  # single-row 2-D array: one RGBA colour for all points
                    marker=r"$ {} $".format(label), s=150, edgecolors='none',
                    label=str(label))

    plt.xlabel('T-SNE Dim 1')
    plt.ylabel('T-SNE Dim 2')
    plt.title('Actives')
    plt.legend(loc='best')
    plt.grid(True)
    plt.savefig('Actives' + '.pdf', format='pdf', dpi=900)
    plt.show()

MAIN

In [None]:
# Load at most 100000 rows per file and derive the binary Default target.
# The date columns are left as they are (make_dates_numeric is not called).
df = load_data(100000)
make_target_var(df)

# Keep only the rows that have a credit score.
df = df[df['CreditScore'].notnull()]

# Report, then drop, every column that still contains a missing value.
na_columns = get_na_columns(df)
print("Na Columns", na_columns, sep="\n")
df = df.drop(columns=na_columns)

# Integer-encode the object columns, one LabelEncoder fit per column.
cat_feat = get_cat_feat(df)
cat_data = df[cat_feat].apply(LabelEncoder().fit_transform)

# Min-max scale the remaining columns.
num_feat = get_num_feat(df)
num_data = normalize(df[num_feat])

# Recombine: scaled numeric columns first, encoded categorical columns after.
df = pd.concat([num_data, cat_data], axis=1)

# Embed in two dimensions and plot.
df_2D = to_2D(df)
tsne_visual(df,df_2D)

Na Columns
Index(['DTIRat', 'MortInsPerc', 'CoCreditScore', 'MortInsType', 'Servicer',
       'CAUPB', 'AdMonthsToMaturity', 'CLDS', 'ZeroBalCode', 'ZeroBalDate',
       'LastInstallDate', 'DispositionDate', 'PPRC', 'AssetRecCost', 'MHRC',
       'ATFHP', 'NetSaleProceeds', 'CreditEnhProceeds', 'RPMWP', 'OFP',
       'NIBUPB', 'PFUPB', 'RMWPF', 'FPWA', 'ServicingIndicator'],
      dtype='object')
Training T-SNE ...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 99939 samples in 0.609s...
[t-SNE] Computed neighbors for 99939 samples in 10.671s...
[t-SNE] Computed conditional probabilities for sample 1000 / 99939
[t-SNE] Computed conditional probabilities for sample 2000 / 99939
[t-SNE] Computed conditional probabilities for sample 3000 / 99939
[t-SNE] Computed conditional probabilities for sample 4000 / 99939
[t-SNE] Computed conditional probabilities for sample 5000 / 99939
[t-SNE] Computed conditional probabilities for sample 6000 / 99939
[t-SNE] Computed conditional probab

MAIN 2

In [89]:
# load_data takes a row-limit argument; None is forwarded to read_csv as
# nrows=None, i.e. both files are read without a limit.
df = load_data(None)
make_target_var(df)
# make_dates_numeric(df)

# Keep only the rows where CLDS is present, then list its distinct values.
df = df[pd.notnull(df['CLDS'])]

df.CLDS.unique()

array(['0', 'X', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11',
       '12', '13', '14', '15', '16', '17'], dtype=object)