In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
df = pd.read_csv('CreditScoring.csv', encoding='latin1')

In [3]:
df.columns

Index(['id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'term', 'int_rate',
       'installment', 'grade', 'emp_title', 'emp_length', 'home_ownership',
       'annual_inc', 'verification_status', 'issue_d', 'purpose', 'addr_state',
       'dti', 'delinq_2yrs', 'earliest_cr_line', 'inq_last_6mths', 'open_acc',
       'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp',
       'total_pymnt', 'loan_status', 'risk'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 310704 entries, 0 to 310703
Data columns (total 29 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   310704 non-null  int64  
 1   loan_amnt            310704 non-null  int64  
 2   funded_amnt          310704 non-null  int64  
 3   funded_amnt_inv      310704 non-null  float64
 4   term                 310704 non-null  object 
 5   int_rate             310704 non-null  float64
 6   installment          310704 non-null  float64
 7   grade                310704 non-null  object 
 8   emp_title            281139 non-null  object 
 9   emp_length           288089 non-null  object 
 10  home_ownership       310704 non-null  object 
 11  annual_inc           310704 non-null  float64
 12  verification_status  310704 non-null  object 
 13  issue_d              310704 non-null  object 
 14  purpose              310704 non-null  object 
 15  addr_state       

In [5]:
print(df.shape)

(310704, 29)


In [6]:
df.head()

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,emp_title,emp_length,...,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,total_pymnt,loan_status,risk
0,1,2500,2500,2500.0,36 months,13.56,84.92,C,Chef,10+ years,...,0.0,11,1,15603,37.0,19,0.0,30026.4425,Fully Paid,0
1,2,30000,30000,30000.0,60 months,18.94,777.23,D,Postmaster,10+ years,...,0.0,18,0,34971,64.5,37,0.0,40856.67896,Fully Paid,0
2,3,5000,5000,5000.0,36 months,17.97,180.69,D,Administrative,6 years,...,0.0,9,0,25416,29.9,19,0.0,20215.79243,Fully Paid,0
3,4,4000,4000,4000.0,36 months,18.94,146.51,D,IT Supervisor,10+ years,...,0.0,12,0,4472,15.3,25,0.0,4549.217149,Fully Paid,0
4,5,30000,30000,30000.0,60 months,16.14,731.78,C,Mechanic,10+ years,...,0.0,21,0,36812,65.7,37,0.0,8735.148975,Fully Paid,0


In [7]:
df.tail()

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,emp_title,emp_length,...,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,total_pymnt,loan_status,risk
310699,1048571,25000,25000,25000.0,36 months,16.99,891.2,D,Analyst,2 years,...,0.0,3,0,1946,59.0,12,34.0,6201.43,Late (31-120 days),1
310700,1048572,24825,24825,24825.0,36 months,16.55,879.53,D,DENTIST,1 year,...,3.0,15,0,15745,48.4,34,10888.4,17769.66,Late (31-120 days),1
310701,1048573,19650,19650,19650.0,60 months,13.33,450.43,C,Director,10+ years,...,0.0,11,0,34256,85.0,26,7840.58,15088.78,Late (31-120 days),1
310702,1048574,31200,31200,31200.0,60 months,19.99,826.44,E,Certified Occupational Therapy Assistant,< 1 year,...,1.0,6,0,7261,71.2,8,10251.72,18602.73,Late (31-120 days),1
310703,1048575,25000,25000,25000.0,60 months,13.33,573.06,C,Principal Engineer,1 year,...,0.0,9,0,6400,34.8,12,1.87,7309.82,Late (31-120 days),1


In [8]:
df.describe()

Unnamed: 0,id,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,total_pymnt,risk
count,310704.0,310704.0,310704.0,310704.0,310704.0,310704.0,310704.0,310556.0,310704.0,310703.0,310704.0,310704.0,310704.0,310491.0,310704.0,310704.0,310704.0,310704.0
mean,620466.1,15518.606133,15518.606133,15511.817884,12.565801,452.838937,80539.98,19.020776,0.338029,0.607374,11.882889,0.248008,16052.73,48.495109,24.916757,723.861865,13430.48343,0.333655
std,392886.2,9196.532109,9196.532109,9195.219672,4.702061,264.514016,92862.94,12.339552,0.921206,0.888372,5.793843,0.671583,23228.24,24.812508,12.312579,3602.732652,10010.944733,0.471519
min,1.0,1000.0,1000.0,725.0,5.32,14.77,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
25%,77676.75,8275.0,8275.0,8250.0,8.81,259.42,48000.0,12.07,0.0,0.0,8.0,0.0,5515.75,29.6,16.0,0.0,5783.21854,0.0
50%,758625.5,14000.0,14000.0,14000.0,11.99,387.55,67200.0,18.19,0.0,0.0,11.0,0.0,10590.0,48.0,23.0,0.0,10741.86,0.0
75%,970899.2,20800.0,20800.0,20800.0,15.31,602.95,95000.0,25.08,0.0,1.0,15.0,0.0,19085.0,67.3,32.0,0.0,18592.57382,1.0
max,1048575.0,40000.0,40000.0,40000.0,30.99,1618.24,9757200.0,999.0,21.0,5.0,81.0,86.0,1044210.0,182.8,176.0,40000.0,59808.26209,1.0


In [9]:
df=df.drop('id',axis=1)
df.shape

(310704, 28)

In [10]:
df.isnull().sum()

loan_amnt                  0
funded_amnt                0
funded_amnt_inv            0
term                       0
int_rate                   0
installment                0
grade                      0
emp_title              29565
emp_length             22615
home_ownership             0
annual_inc                 0
verification_status        0
issue_d                    0
purpose                    0
addr_state                 0
dti                      148
delinq_2yrs                0
earliest_cr_line           0
inq_last_6mths             1
open_acc                   0
pub_rec                    0
revol_bal                  0
revol_util               213
total_acc                  0
out_prncp                  0
total_pymnt                0
loan_status                0
risk                       0
dtype: int64

In [11]:
df=df.drop(['emp_title', 'emp_length'],axis=1)
df.shape

(310704, 26)

In [12]:
df=df.fillna(df.mean())

  df=df.fillna(df.mean())


In [13]:
df.isnull().sum()

loan_amnt              0
funded_amnt            0
funded_amnt_inv        0
term                   0
int_rate               0
installment            0
grade                  0
home_ownership         0
annual_inc             0
verification_status    0
issue_d                0
purpose                0
addr_state             0
dti                    0
delinq_2yrs            0
earliest_cr_line       0
inq_last_6mths         0
open_acc               0
pub_rec                0
revol_bal              0
revol_util             0
total_acc              0
out_prncp              0
total_pymnt            0
loan_status            0
risk                   0
dtype: int64

In [14]:
df['term'].value_counts()

 36 months    216036
 60 months     94668
Name: term, dtype: int64

In [15]:
def term_to_int(term):
    return term[:3]

In [16]:
df['term'] = df['term'].map(term_to_int)
df['term']

0          36
1          60
2          36
3          36
4          60
         ... 
310699     36
310700     36
310701     60
310702     60
310703     60
Name: term, Length: 310704, dtype: object

In [1]:
le_grade = LabelEncoder()
df['grade'] = le_grade.fit_transform(df['grade'])

le_home_ownership = LabelEncoder()
df['home_ownership'] = le_home_ownership.fit_transform(df['home_ownership'])

le_verification_status = LabelEncoder()
df['verification_status'] = le_verification_status.fit_transform(df['verification_status'])

le_earliest_cr_line = LabelEncoder()
df['earliest_cr_line'] = le_earliest_cr_line.fit_transform(df['earliest_cr_line'])

le_issue_d = LabelEncoder()
df['issue_d'] = le_issue_d.fit_transform(df['issue_d'])

le_purpose = LabelEncoder()
df['purpose'] = le_purpose.fit_transform(df['purpose'])

le_addr_state = LabelEncoder()
df['addr_state'] = le_addr_state.fit_transform(df['addr_state'])

df

In [18]:
X = df.drop('loan_status', axis = 1)
y = df['loan_status']

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X,y, train_size=0.2, random_state=42)

In [20]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(62140, 25)
(248564, 25)
(62140,)
(248564,)


In [21]:
X_train

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,home_ownership,annual_inc,verification_status,...,earliest_cr_line,inq_last_6mths,open_acc,pub_rec,revol_bal,revol_util,total_acc,out_prncp,total_pymnt,risk
58402,10000,10000,10000.0,36,8.81,317.12,0,1,100000.0,0,...,404,0.0,17,0,22420,49.0,31,0.00,25456.227260,0
302622,2000,2000,2000.0,36,12.59,67.00,2,1,45000.0,1,...,461,0.0,8,1,9809,59.8,20,0.00,11806.530000,1
125062,20000,20000,20000.0,36,5.32,602.30,0,3,70000.0,1,...,166,1.0,11,0,15355,65.1,16,0.00,35742.399550,0
141995,35000,35000,35000.0,60,20.75,941.96,4,3,98000.0,0,...,63,1.0,18,0,23614,48.0,24,0.00,1214.810000,0
243473,7000,7000,7000.0,36,9.17,223.16,1,1,60000.0,1,...,575,1.0,11,0,4963,44.7,23,16993.91,653.700000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119879,3025,3025,3025.0,36,9.75,97.26,1,1,65000.0,0,...,218,1.0,7,0,377,2.3,18,0.00,3119.190000,0
259178,12000,12000,12000.0,36,9.17,382.55,1,2,65000.0,1,...,143,1.0,10,2,5629,37.5,39,911.87,4656.350000,1
131932,10000,10000,10000.0,36,13.67,340.18,2,1,38000.0,1,...,523,1.0,16,0,958,2.2,26,0.00,2408.530459,0
146867,27600,27600,27600.0,60,13.67,637.50,2,3,80000.0,0,...,350,0.0,26,0,12590,12.2,40,0.00,32309.200860,0


In [22]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [23]:
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
y_test

178139    Fully Paid
99194     Fully Paid
195870    Fully Paid
88626     Fully Paid
111234    Fully Paid
             ...    
153387    Fully Paid
100720    Fully Paid
48878     Fully Paid
19832     Fully Paid
56331     Fully Paid
Name: loan_status, Length: 248564, dtype: object

In [25]:
print('LogisticRegression Model Accuracy Score: {0:0.2f}'. format(accuracy_score(y_test, lr_pred)*100)+ "%")

LogisticRegression Model Accuracy Score: 97.53%
