# Application
Use the logistic regressor to classify new loans.

In [238]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn import ensemble
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split as dataSplitter
from sklearn.model_selection import GridSearchCV
import scipy.stats as stats
from scipy.optimize import curve_fit as fitter
import math

In [239]:
df_simple = pd.read_csv("simpleData.csv")
df_simple['credit_score']=(df_simple['fico_range_high']+df_simple['fico_range_low'])/2.
df_simple.drop(columns = ['last_fico_range_high','last_fico_range_low'])
df_simple['Status']=df_simple['Status'].replace('Default','Charged Off')
df_simple['Status']=df_simple['Status'].replace('In Grace Period','Charged Off')

In [240]:
def grade_recoding(x):
    if type(x) == str:
        x=x.replace('A1','34')
        x=x.replace('A2','33')        
        x=x.replace('A3','32')        
        x=x.replace('A4','31') 
        x=x.replace('A5','30')        
        x=x.replace('B1','29')        
        x=x.replace('B2','28')        
        x=x.replace('B3','27')        
        x=x.replace('B5','26') 
        x=x.replace('B4','25')        
        x=x.replace('C1','24')        
        x=x.replace('C2','23')        
        x=x.replace('C3','22')        
        x=x.replace('C4','21')        
        x=x.replace('C5','20') 
        x=x.replace('D1','19')        
        x=x.replace('D2','18')        
        x=x.replace('D3','17')        
        x=x.replace('D4','16')        
        x=x.replace('D5','15') 
        x=x.replace('E1','14')        
        x=x.replace('E2','13')        
        x=x.replace('E3','12')        
        x=x.replace('E4','11')        
        x=x.replace('E5','10')
        x=x.replace('F1','9')        
        x=x.replace('F2','8')        
        x=x.replace('F3','7')        
        x=x.replace('F4','6')        
        x=x.replace('F5','5')
        x=x.replace('G1','4')        
        x=x.replace('G2','3')        
        x=x.replace('G3','2')        
        x=x.replace('G4','1')        
        x=x.replace('G5','1')
    return x

In [241]:
df_simple['Grade']=df_simple['Grade'].apply(grade_recoding)

In [242]:
def revol_util_map(x):
    if type(x) == str:
        x=x.replace('%','')
        x=float(x)
    return x

In [243]:
df_simple['revol_util']=df_simple['revol_util'].apply(revol_util_map)

In [244]:
status_groups = df_simple.groupby(['Status'])

In [245]:
predictorTitles=['credit_score','annual_inc','total_bal_ex_mort','installment','revol_util','Grade']
df_paid_predictors=status_groups.get_group('Fully Paid')[predictorTitles].astype(float)
df_charged_predictors= status_groups.get_group('Charged Off')[predictorTitles].astype(float)
df_paid_predictors['paid'] = [float(1) for i in range(0,len(df_paid_predictors['credit_score']))]
df_charged_predictors['paid'] = [float(0) for i in range(0,len(df_charged_predictors['credit_score']))]
df_predictors = pd.concat([df_paid_predictors,df_charged_predictors])
df_predictors.head()

Unnamed: 0,credit_score,annual_inc,total_bal_ex_mort,installment,revol_util,Grade,paid
2,697.0,50000.0,110431.0,252.3,32.5,32.0,1.0
3,687.0,63273.6,28983.0,883.18,46.0,19.0,1.0
5,712.0,50000.0,49899.0,185.24,42.0,33.0,1.0
6,662.0,30000.0,9985.0,399.54,48.8,11.0,1.0
8,712.0,75000.0,43319.0,585.08,46.2,21.0,1.0


In [246]:
normalizer = preprocessing.StandardScaler()
normed = pd.DataFrame(normalizer.fit_transform(df_predictors[['Grade','installment','revol_util']].values),columns=['Grade','installment','revol_util'])
maxminscaler = preprocessing.MinMaxScaler()
scaled = pd.DataFrame(maxminscaler.fit_transform(df_predictors[['credit_score','annual_inc','total_bal_ex_mort']].values),columns=['credit_score','annual_inc','total_bal_ex_mort'])

temp = df_predictors['paid'].values
df_predictors = normed.join(scaled)
df_predictors['paid'] = temp
df_predictors.head()



Unnamed: 0,Grade,installment,revol_util,credit_score,annual_inc,total_bal_ex_mort,paid
0,1.242761,-1.13016,-0.629118,0.233333,0.045999,0.228911,1.0
1,-0.640817,0.876961,-0.046978,0.166667,0.064089,0.059858,1.0
2,1.387651,-1.343509,-0.219464,0.333333,0.045999,0.103271,1.0
3,-1.799942,-0.661722,0.073762,0.0,0.018742,0.020426,1.0
4,-0.351036,-0.071433,-0.038354,0.333333,0.080071,0.089614,1.0


In [247]:
df_predictors['paid']=df_predictors['paid'].apply(lambda x: x*2-1) #make it 1 and -1 instead of 1 and 0
train, test = dataSplitter(df_predictors.values,test_size=.1,shuffle=True)
df_train = pd.DataFrame(train,columns=df_predictors.columns)
df_test = pd.DataFrame(test,columns=df_predictors.columns)
reg = GridSearchCV(LogisticRegression(penalty='l1',solver='liblinear'),{'tol':[1e-5,1e-4,1e-3],'C':[.01,.1,1,10,100]},cv=3).fit(df_train.drop('paid',axis=1),df_train['paid'])
print("test score = "+str(reg.score(df_test.drop('paid',axis=1),df_test['paid'])))
print("train score = " + str(reg.score(df_train.drop('paid',axis=1),df_train['paid'])))


test score = 0.88
train score = 0.8144796380090498


In [248]:
df_new = pd.read_csv("new.csv")
df_new['credit_score']=(df_new['fico_range_high']+df_new['fico_range_low'])/2.
df_new['sub_grade']=df_new['sub_grade'].apply(grade_recoding)
df_new['sub_grade']=df_new['sub_grade'].astype(float)
df_new['revol_util']=df_new['revol_util'].apply(revol_util_map)

In [249]:
normed = pd.DataFrame(normalizer.transform(df_new[['sub_grade','installment','revol_util']].values),columns=['sub_grade','installment','revol_util'])
scaled = pd.DataFrame(maxminscaler.transform(df_new[['credit_score','annual_inc','total_bal_ex_mort']].values),columns=['credit_score','annual_inc','total_bal_ex_mort'])

df_data = normed.join(scaled)
df_data.head()


Unnamed: 0,sub_grade,installment,revol_util,credit_score,annual_inc,total_bal_ex_mort
0,-0.061255,0.391056,0.034953,0.1,0.092336,0.220418
1,-0.785708,-0.741545,-0.723985,0.2,0.032371,0.078738
2,0.518308,0.171758,1.470898,0.2,0.044636,0.030055
3,-1.075489,1.129346,0.06945,0.166667,0.066442,0.096401
4,1.532542,1.965212,0.211751,0.666667,0.093699,0.499686


In [250]:

df_new['prob'] = [j for i,j in reg.predict_proba(df_data.values)]
df_new.to_csv("new_classified.csv",index=True)
