In [51]:
!python -V

Python 3.10.4


In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score
# % matplotlib inline

In [53]:
data_scoring = pd.read_csv("data/credit.csv")
data_scoring.head()

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6,1,228190,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35,0,229976,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18,1,297996,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,,9,0,256329,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220,Short Term,,,5 years,Rent,Debt Consolidation,20639.7,6.1,,15,0,253460,427174.0,0.0,0.0


### Описание полей
* Loan ID: уникальный идентификатор кредита;
* Customer ID: уникальный идентификатор клиента;
* Loan Status: категориальный признак - кредит погашен ("Fully Paid") или не погашен ("Charged Off");
* Current Loan Amount: размер кредита;
* Term: срок кредита;
* Credit Score: кредитный рейтинг - число от 0 до 800;
* Years in current job: стаж на текущем месте работы;
* Home Ownership: статус недвижимости - собственность ("Own"), ипотека ("Home Mortgage") или аренда ("Rent");
* Annual Income: годовой доход;
* Purpose: цель кредита;
* Monthly Debt: размер ежемесячного платежа по текущим кредитам;
* Years of Credit History: количество лет кредитной истории;
* Months since last delinquent: количество месяцев с последнего нарушения условий кредита;
* Number of Open Accounts: количество открытых кредитных карт;
* Number of Credit Problems: количество кредитных проблем;
* Current Credit Balance: суммарный текущий долг;
* Maximum Open Credit: максимальный кредитный лимит из всех источников;
* Bankruptcies: количество банкротств;
* Tax Liens: количество нарушений налогового законодательства.

In [54]:
data_scoring.shape

(100000, 19)

In [55]:
data_scoring.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 19 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Loan ID                       100000 non-null  object 
 1   Customer ID                   100000 non-null  object 
 2   Loan Status                   100000 non-null  object 
 3   Current Loan Amount           100000 non-null  int64  
 4   Term                          100000 non-null  object 
 5   Credit Score                  80846 non-null   float64
 6   Annual Income                 80846 non-null   float64
 7   Years in current job          95778 non-null   object 
 8   Home Ownership                100000 non-null  object 
 9   Purpose                       100000 non-null  object 
 10  Monthly Debt                  100000 non-null  float64
 11  Years of Credit History       100000 non-null  float64
 12  Months since last delinquent  46859 non-null 

In [56]:
data_scoring = data_scoring.drop(columns=["Loan ID", "Customer ID"])
data_scoring.head()

Unnamed: 0,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,Fully Paid,445412,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6,1,228190,416746.0,1.0,0.0
1,Fully Paid,262328,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35,0,229976,850784.0,0.0,0.0
2,Fully Paid,99999999,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18,1,297996,750090.0,0.0,0.0
3,Fully Paid,347666,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,,9,0,256329,386958.0,0.0,0.0
4,Fully Paid,176220,Short Term,,,5 years,Rent,Debt Consolidation,20639.7,6.1,,15,0,253460,427174.0,0.0,0.0


In [57]:
data_scoring.shape

(100000, 17)

In [58]:
data_scoring.describe()

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
count,100000.0,80846.0,80846.0,100000.0,100000.0,46859.0,100000.0,100000.0,100000.0,99998.0,99796.0,99990.0
mean,11760450.0,1076.456089,1378277.0,18472.412336,18.199141,34.901321,11.12853,0.16831,294637.4,760798.4,0.11774,0.029313
std,31783940.0,1475.403791,1081360.0,12174.992609,7.015324,21.997829,5.00987,0.482705,376170.9,8384503.0,0.351424,0.258182
min,10802.0,585.0,76627.0,0.0,3.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,179652.0,705.0,848844.0,10214.1625,13.5,16.0,8.0,0.0,112670.0,273438.0,0.0,0.0
50%,312246.0,724.0,1174162.0,16220.3,16.9,32.0,10.0,0.0,209817.0,467874.0,0.0,0.0
75%,524942.0,741.0,1650663.0,24012.0575,21.7,51.0,14.0,0.0,367958.8,782958.0,0.0,0.0
max,100000000.0,7510.0,165557400.0,435843.28,70.5,176.0,76.0,15.0,32878970.0,1539738000.0,7.0,15.0


In [59]:
plt.figure(figsize=(15, 7))

ax = sns.bar

<Figure size 1080x504 with 0 Axes>

<Figure size 1080x504 with 0 Axes>

In [60]:
data_scoring["Loan Status"]

0        Fully Paid
1        Fully Paid
2        Fully Paid
3        Fully Paid
4        Fully Paid
            ...    
99995    Fully Paid
99996    Fully Paid
99997    Fully Paid
99998    Fully Paid
99999    Fully Paid
Name: Loan Status, Length: 100000, dtype: object

In [28]:
data_scoring['Years in current job'].fillna('10+ years', inplace=True)
data_scoring['Credit Score'].fillna(data_scoring['Credit Score'].median(), inplace=True)
data_scoring['Annual Income'].fillna(data_scoring['Annual Income'].median(), inplace=True)
data_scoring['Months since last delinquent'].fillna(data_scoring['Months since last delinquent'].median(), inplace=True)
data_scoring['Maximum Open Credit'].fillna(data_scoring['Maximum Open Credit'].median(), inplace=True)
data_scoring['Bankruptcies'].fillna(data_scoring['Bankruptcies'].median(), inplace=True)
data_scoring['Tax Liens'].fillna(data_scoring['Tax Liens'].median(), inplace=True)

In [29]:
pd.get_dummies(data_scoring['Years in current job'], prefix="Years in current job")

Unnamed: 0,Years in current job_1 year,Years in current job_10+ years,Years in current job_2 years,Years in current job_3 years,Years in current job_4 years,Years in current job_5 years,Years in current job_6 years,Years in current job_7 years,Years in current job_8 years,Years in current job_9 years,Years in current job_< 1 year
0,0,0,0,0,0,0,0,0,1,0,0
1,0,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
99995,0,0,0,0,0,0,0,1,0,0,0
99996,1,0,0,0,0,0,0,0,0,0,0
99997,0,0,0,0,0,0,1,0,0,0,0
99998,0,0,0,0,0,0,0,0,0,1,0


In [30]:
pd.get_dummies(data_scoring['Term'], prefix="Term")

Unnamed: 0,Term_Long Term,Term_Short Term
0,0,1
1,0,1
2,0,1
3,1,0
4,0,1
...,...,...
99995,0,1
99996,0,1
99997,0,1
99998,0,1


In [31]:
pd.get_dummies(data_scoring['Home Ownership'], prefix="Home Ownership")

Unnamed: 0,Home Ownership_HaveMortgage,Home Ownership_Home Mortgage,Home Ownership_Own Home,Home Ownership_Rent
0,0,1,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,1,0
4,0,0,0,1
...,...,...,...,...
99995,0,0,1,0
99996,0,0,0,1
99997,0,0,0,1
99998,0,0,0,1


In [32]:
data_scoring = pd.concat([data_scoring,
                          pd.get_dummies(data_scoring['Years in current job'], prefix="Years in current job"),
                          pd.get_dummies(data_scoring['Term'], prefix="Term"),
                          pd.get_dummies(data_scoring['Home Ownership'], prefix="Home Ownership"),
                          pd.get_dummies(data_scoring['Purpose'], prefix="Purpose")], axis=1)
data_scoring.drop(columns=['Years in current job', 'Term', 'Home Ownership', 'Purpose'], inplace=True)
data_scoring.head()

Unnamed: 0,Loan Status,Current Loan Amount,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,...,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation,Purpose_wedding
0,Fully Paid,445412,709.0,1167493.0,5214.74,17.2,32.0,6,1,228190,...,0,0,0,0,0,0,0,0,0,0
1,Fully Paid,262328,724.0,1174162.0,33295.98,21.1,8.0,35,0,229976,...,0,0,0,0,0,0,0,0,0,0
2,Fully Paid,99999999,741.0,2231892.0,29200.53,14.9,29.0,18,1,297996,...,0,0,0,0,0,0,0,0,0,0
3,Fully Paid,347666,721.0,806949.0,8741.9,12.0,32.0,9,0,256329,...,0,0,0,0,0,0,0,0,0,0
4,Fully Paid,176220,724.0,1174162.0,20639.7,6.1,32.0,15,0,253460,...,0,0,0,0,0,0,0,0,0,0


In [33]:
data_scoring.shape

(100000, 46)

In [41]:
data_scoring['Loan Status'] = data_scoring['Loan Status'].factorize()[0]
data_scoring.head()

Unnamed: 0,Loan Status,Current Loan Amount,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,...,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation,Purpose_wedding
0,0,445412,709.0,1167493.0,5214.74,17.2,32.0,6,1,228190,...,0,0,0,0,0,0,0,0,0,0
1,0,262328,724.0,1174162.0,33295.98,21.1,8.0,35,0,229976,...,0,0,0,0,0,0,0,0,0,0
2,0,99999999,741.0,2231892.0,29200.53,14.9,29.0,18,1,297996,...,0,0,0,0,0,0,0,0,0,0
3,0,347666,721.0,806949.0,8741.9,12.0,32.0,9,0,256329,...,0,0,0,0,0,0,0,0,0,0
4,0,176220,724.0,1174162.0,20639.7,6.1,32.0,15,0,253460,...,0,0,0,0,0,0,0,0,0,0


In [42]:
X = data_scoring.drop(['Loan Status'], axis=1)
y = data_scoring['Loan Status']

In [43]:
X.head()

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,...,Purpose_Medical Bills,Purpose_Other,Purpose_Take a Trip,Purpose_major_purchase,Purpose_moving,Purpose_other,Purpose_renewable_energy,Purpose_small_business,Purpose_vacation,Purpose_wedding
0,445412,709.0,1167493.0,5214.74,17.2,32.0,6,1,228190,416746.0,...,0,0,0,0,0,0,0,0,0,0
1,262328,724.0,1174162.0,33295.98,21.1,8.0,35,0,229976,850784.0,...,0,0,0,0,0,0,0,0,0,0
2,99999999,741.0,2231892.0,29200.53,14.9,29.0,18,1,297996,750090.0,...,0,0,0,0,0,0,0,0,0,0
3,347666,721.0,806949.0,8741.9,12.0,32.0,9,0,256329,386958.0,...,0,0,0,0,0,0,0,0,0,0
4,176220,724.0,1174162.0,20639.7,6.1,32.0,15,0,253460,427174.0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Loan Status, dtype: int64

In [46]:
result = pd.DataFrame({'model' : ['MLPClassifier', 'Linear Regression', 'Random Forest Classifier', ],
                       'train_score': 0, 'test_score': 0})
result

Unnamed: 0,model,train_score,test_score
0,MLPClassifier,0,0
1,Linear Regression,0,0
2,Random Forest Classifier,0,0


In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=43)

In [49]:
from sklearn.datasets import make_classification

make_classification(n_samples=100, random_state=1)

(array([[ 0.6065484 ,  0.81695766,  1.05132077, ..., -0.36453805,
          0.16466507, -0.76780375],
        [-3.05376438,  0.92116205, -1.45832446, ...,  0.20437739,
         -1.55269878, -0.4466992 ],
        [ 0.60640394,  0.68064537,  1.02124813, ...,  1.03703898,
         -0.83001099, -0.03599018],
        ...,
        [-2.30803851, -1.42368943,  1.14256392, ..., -0.24701649,
         -0.37911961,  0.27610275],
        [-1.53702887,  2.14957042,  0.32455352, ...,  2.15323347,
          1.31972591, -0.8797298 ],
        [ 0.37167029, -0.95543218, -0.1484898 , ..., -0.6294416 ,
          0.14225137,  0.78002714]]),
 array([1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,
        0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
        0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0,
        0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0]))