In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
RANDOM_STATE = 42

In [3]:
DATASET_PATH = 'https://raw.githubusercontent.com/Burakofff/credit_scoring/main/credit_scoring.csv'


In [4]:
# загрузка данных
df = pd.read_csv(DATASET_PATH)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 12 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   135155 non-null  float64
 3   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         120269 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 7   NumberOfTimes90DaysLate               150000 non-null  int64  
 8   NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 9   NumberOfDependents                    146076 non-null  float64
 10  RealEstateLoansOrLines                150000 non-null  object 
 11  

**Заполняем пустоты**

In [6]:
mean = df['MonthlyIncome'].mean()

df['MonthlyIncome'].fillna(mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['MonthlyIncome'].fillna(mean, inplace=True)


In [7]:
mean = df['age'].mean()

df['age'].fillna(mean, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(mean, inplace=True)


In [8]:
df['NumberOfDependents'].fillna(0, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['NumberOfDependents'].fillna(0, inplace=True)


In [9]:
X = df.drop(['SeriousDlqin2yrs'], axis=1)

y = df['SeriousDlqin2yrs'] # целевая переменная (SeriousDlqin2yrs)

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

X_train.shape, X_test.shape

((120000, 11), (30000, 11))

**Предобработаем признаки:**

* закодируем категориальные признаки
* масштабируем числовые признаки

In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
# Удалите импорт OneHotEncoder

categorical = ['RealEstateLoansOrLines','GroupAge']
numeric_features = [col for col in X_train.columns if col not in categorical]

column_transformer = ColumnTransformer([
    ('ohe', OneHotEncoder(drop='first', handle_unknown="ignore"), categorical),
    ('scaling', MinMaxScaler(), numeric_features)
])

X_train_transformed = column_transformer.fit_transform(X_train)
X_test_transformed = column_transformer.transform(X_test)

**Добавим названия колонок**

In [12]:
lst = list(column_transformer.transformers_[0][1].get_feature_names_out())
lst.extend(numeric_features)

X_train_transformed = pd.DataFrame(X_train_transformed, columns=lst)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=lst)

X_train_transformed.head()

Unnamed: 0,RealEstateLoansOrLines_B,RealEstateLoansOrLines_C,RealEstateLoansOrLines_D,RealEstateLoansOrLines_E,GroupAge_b,GroupAge_c,GroupAge_d,GroupAge_e,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.437848e-06,0.486239,0.0,9.416892e-07,0.006381,0.155172,0.0,0.0,0.05
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.947668e-06,0.40367,0.0,2.500234e-07,0.00216,0.068966,0.0,0.0,0.15
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.776014e-06,0.614679,0.0,0.003504875,0.002217,0.155172,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,3.645582e-07,0.479758,0.0,5.97503e-07,0.003796,0.241379,0.0,0.0,0.15
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.246467e-07,0.46789,0.0,1.319643e-08,0.001969,0.086207,0.0,0.0,0.0


In [13]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=12345, solver='liblinear', class_weight='balanced')

model.fit(X_train_transformed, y_train)

pred = model.predict_proba(X_test_transformed)[:,1]

**Оценка качества модели**

In [14]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [15]:
from sklearn.metrics import confusion_matrix
classes = (pred > 0.5).astype(int)
confusion_matrix(y_test, classes), recall_score(y_test, classes)

(array([[20469,  7493],
        [  655,  1383]], dtype=int64),
 0.6786064769381747)

In [16]:
importances = pd.DataFrame({'weights': model.coef_[0], 'features': X_train_transformed.columns}).sort_values(by='weights')
importances

Unnamed: 0,weights,features
12,-6.218726,MonthlyIncome
9,-3.889204,age
11,-2.528348,DebtRatio
8,-0.878336,RevolvingUtilizationOfUnsecuredLines
13,-0.613267,NumberOfOpenCreditLinesAndLoans
4,0.229218,GroupAge_b
0,0.276097,RealEstateLoansOrLines_B
5,0.360765,GroupAge_c
7,0.442588,GroupAge_e
6,0.467362,GroupAge_d


In [17]:
import pickle

with open('model.pickle', 'wb') as f:
    pickle.dump(model, f)
