In [1]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix
import dask.dataframe as dd
from dask_ml.preprocessing import OneHotEncoder, DummyEncoder, Categorizer, LabelEncoder, StandardScaler
pd.set_option('display.max_rows', 200)

In [2]:
# Load the approved-loans CSV into a Dask DataFrame (`dd` is dask.dataframe).
# The path is relative to the notebook's working directory.
approved_data = dd.read_csv('../data/approved1.csv')

In [3]:
# Show the dtype of every column as read from the CSV.
approved_data.dtypes

id                        int64
addr_state               object
annual_inc              float64
application_type         object
disbursement_method      object
dti                     float64
earliest_cr_line         object
emp_length               object
emp_title                object
fico_range_high         float64
fico_range_low          float64
grade                    object
home_ownership           object
initial_list_status      object
installment             float64
int_rate                float64
issue_d                  object
loan_amnt               float64
open_acc                float64
pub_rec                 float64
pub_rec_bankruptcies    float64
purpose                  object
sub_grade                object
term                     object
verification_status      object
zip_code                 object
loan_status              object
dtype: object

In [123]:
# Number of missing values in each column (checks below are commented out)
# print('Number of missing in addr_state: ', approved_data.addr_state.isna().sum().compute())
# print('Number of missing in annual_inc: ', approved_data.annual_inc.isna().sum().compute())
# print('Number of missing in disbursement_method: ', approved_data.disbursement_method.isna().sum().compute())
# print('Number of missing in dti: ', approved_data.dti.isna().sum().compute())
# print('Number of missing in earliest_cr_line: ', approved_data.earliest_cr_line.isna().sum().compute())
# print('Number of missing in emp_length: ', approved_data.emp_length.isna().sum().compute())
# print('Number of missing in fico_range_low: ', approved_data.fico_range_low.isna().sum().compute())
# print('Number of missing in fico_range_high: ', approved_data.fico_range_high.isna().sum().compute())
# print('Number of missing in grade: ', approved_data.grade.isna().sum().compute())
# print('Number of missing in home_ownership: ', approved_data.home_ownership.isna().sum().compute())
# print('Number of missing in initial_list_status: ', approved_data.initial_list_status.isna().sum().compute())
# print('Number of missing in installment: ', approved_data.installment.isna().sum().compute())
# print('Number of missing in int_rate: ', approved_data.int_rate.isna().sum().compute())
# print('Number of missing in issue_d: ', approved_data.issue_d.isna().sum().compute())
# print('Number of missing in loan_amnt: ', approved_data.loan_amnt.isna().sum().compute())
# print('Number of missing in open_acc: ', approved_data.open_acc.isna().sum().compute())
# print('Number of missing in pub_rec: ', approved_data.pub_rec.isna().sum().compute())
# print('Number of missing in pub_rec_bankruptcies: ', approved_data.pub_rec_bankruptcies.isna().sum().compute())
# print('Number of missing in purpose: ', approved_data.purpose.isna().sum().compute())
# print('Number of missing in sub_grade: ', approved_data.sub_grade.isna().sum().compute())
# print('Number of missing in term: ', approved_data.term.isna().sum().compute())
# print('Number of missing in verification_status: ', approved_data.verification_status.isna().sum().compute())
# print('Number of missing in zip_code: ', approved_data.zip_code.isna().sum().compute())
# print('Number of missing in loan_status: ', approved_data.loan_status.isna().sum().compute())


In [124]:
# print('Number of rows in the data: ', len(approved_data))

Number of rows in the data:  1303083


In [4]:
# Normalise emp_length to a bare number string:
#   * the exact value '< 1 year' becomes '0';
#   * every other value has the characters '<', '+', ' ', 'y', 'e', 'a', 'r', 's'
#     stripped from both ends (str.strip takes a character set, not a substring),
#     e.g. '10+ years' -> '10'.
approved_data['emp_length'] = (
    approved_data['emp_length']
    .replace(to_replace='< 1 year', value='0')
    .str.strip('<+ years')
)

In [5]:
# Impute missing emp_length with the floored mean of the observed values.
# The mean is taken over the non-missing entries after casting them to int.
observed_emp_length = approved_data['emp_length'].dropna().astype(int)
mean_emp_length = np.floor(observed_emp_length.mean())
approved_data['emp_length'] = approved_data['emp_length'].fillna(mean_emp_length)

In [6]:
# Cast emp_length to integer.
approved_data['emp_length'] = approved_data['emp_length'].astype(int)

In [7]:
# Split the frame into the feature matrix X and the target y (loan_status).
# Columns present in the data but not selected as features: id, emp_title,
# zip_code.
feature_columns = [
    'addr_state',
    'annual_inc',
    'application_type',
    'disbursement_method',
    'dti',
    'earliest_cr_line',
    'emp_length',
    'fico_range_high',
    'fico_range_low',
    'grade',
    'home_ownership',
    'initial_list_status',
    'installment',
    'int_rate',
    'issue_d',
    'loan_amnt',
    'open_acc',
    'pub_rec',
    'pub_rec_bankruptcies',
    'purpose',
    'sub_grade',
    'term',
    'verification_status',
]
X = approved_data[feature_columns]
y = approved_data['loan_status']

In [8]:
# Parse the two date columns partition by partition, then derive the number of
# days between the earliest credit line and the loan issue date.
for date_column in ('earliest_cr_line', 'issue_d'):
    X[date_column] = X[date_column].map_partitions(
        pd.to_datetime, meta=(date_column, 'datetime64[ns]')
    )
X['days_since_first_credit'] = (X['issue_d'] - X['earliest_cr_line']).dt.days

In [9]:
# Drop the raw date columns (earliest_cr_line, issue_d) now that
# days_since_first_credit has been derived from them, and also drop grade.
# NOTE(review): grade is presumably dropped because sub_grade is kept and is
# finer-grained — confirm.
X = X.drop(['earliest_cr_line', 'issue_d', 'grade'], axis=1)

In [10]:
# Display how many rows fall into each emp_length value after cleaning,
# imputation and the integer cast.
approved_data.emp_length.value_counts().compute()

10    428296
5     157584
2     117971
0     104722
3     104288
1      85753
4      78078
6      60675
8      58718
7      57675
9      49323
Name: emp_length, dtype: int64

In [11]:
# Select the annual_inc values below 2e7 and assign them back to the
# annual_inc column of approved_data.
# NOTE(review): X was already built from approved_data in an earlier cell, so
# this assignment presumably never reaches the features that are modelled —
# confirm whether the filter should be applied before X is created.
# NOTE(review): assigning a filtered Series back to a column likely leaves the
# excluded rows in place (as missing values) rather than dropping them — verify.
approved_data.annual_inc = approved_data.annual_inc[approved_data.annual_inc < 2e7]

In [12]:
# Convert the listed columns of X to categorical dtype with dask_ml's
# Categorizer. emp_length is included, so it is treated as a category rather
# than as a number.
categorical_columns = [
    'addr_state',
    'application_type',
    'disbursement_method',
    'emp_length',
    'home_ownership',
    'initial_list_status',
    'purpose',
    'term',
    'verification_status',
    'sub_grade',
]
ce = Categorizer(columns=categorical_columns)
X = ce.fit_transform(X)

In [13]:
# Dummy-encode the listed columns of X with dask_ml's DummyEncoder.
dummy_columns = [
    'addr_state',
    'application_type',
    'disbursement_method',
    'emp_length',
    'home_ownership',
    'initial_list_status',
    'purpose',
    'term',
    'verification_status',
    'sub_grade',
]
de = DummyEncoder(columns=dummy_columns)
X = de.fit_transform(X)

In [14]:
# Encode the loan_status labels in y as integers with dask_ml's LabelEncoder.
# NOTE: y is rebound to the encoded result, so re-running this cell would fit
# the encoder on already-encoded values.
le = LabelEncoder()
y = le.fit_transform(y)
# Materialise the encoded labels and display them.
y.compute()

array([1, 1, 1, ..., 1, 1, 1])

In [15]:
# Display the original labels in encoded order: position i in this array is the
# label that was encoded as integer i.
le.classes_.compute()

array(['Charged Off', 'Fully Paid'], dtype=object)

In [16]:
# Standardise the columns of X with dask_ml's StandardScaler.
# NOTE(review): the scaler is fitted on all rows, before the later train/test
# split, so held-out rows influence the scaling statistics — confirm this is
# acceptable for the evaluation below.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [17]:
# Baseline: 5-fold cross-validated score of LDA with default settings.
# X is materialised with .compute() before being handed to scikit-learn.
lda = LinearDiscriminantAnalysis()
cv_scores = cross_val_score(
    lda,
    X.compute(),
    y,
    cv=5,
)
cv_scores

array([0.80756436, 0.80954811, 0.81030785, 0.80480861, 0.80368818])

In [24]:
# Class priors for the second LDA model. count_1 is the share of rows whose
# encoded label is 1 and count_0 is the remainder. They are computed here,
# before their first use, so this cell no longer depends on a cell that appears
# further down the notebook and survives Restart & Run All.
count_1 = y.sum() / len(y)
count_0 = 1 - count_1
lda1 = LinearDiscriminantAnalysis(priors=[count_0, count_1])

# Hold out 20% of the rows for evaluation. random_state is fixed so that the
# split, and therefore every downstream metric, is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X.compute(), y, test_size=0.2, shuffle=True, random_state=42
)

  return array[key] if axis == 0 else array[:, key]
  return array[key] if axis == 0 else array[:, key]


In [23]:
# Empirical class shares of the encoded target: count_1 is the fraction of rows
# labelled 1, count_0 the fraction labelled 0.
# NOTE: lda1 is constructed with priors=[count_0, count_1], so these names must
# be defined before that constructor call runs.
count_1 = y.sum() / len(y)
count_0 = 1 - count_1

In [27]:
# Fit the prior-weighted LDA on the training split, then predict class 1 only
# when its predicted probability exceeds 0.9.
lda1.fit(X_train, y_train)
class_probabilities = lda1.predict_proba(X_test)
probs_positive_class = class_probabilities[:, 1]  # column 1 = class 1
prediction = probs_positive_class > 0.9

In [28]:
# Confusion matrix of the thresholded predictions on the held-out split, with
# classes ordered [0, 1]. Per scikit-learn's convention, rows are true labels
# and columns are predicted labels.
confusion_matrix(y_test, prediction, labels=[0, 1])

array([[ 45611,   4385],
       [149796,  60825]])