In [25]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [26]:
# Fixed seed kept in one place so that randomized steps can be made reproducible.
RANDOM_STATE = 42

In [27]:
# Location of the credit-scoring CSV (a raw GitHub URL, so loading it needs network access).
DATASET_PATH = 'https://raw.githubusercontent.com/Burakofff/credit_scoring/main/credit_scoring.csv'

In [30]:

# Read the CSV at DATASET_PATH into the working DataFrame used by the cells below.
df = pd.read_csv(DATASET_PATH)

**Filling in missing values**

In [None]:
# Impute missing MonthlyIncome values with the column mean.
#
# The result is assigned back to the column instead of calling
# `df['MonthlyIncome'].fillna(..., inplace=True)`: that form is a chained
# in-place update on a selected column. pandas 2.2+ emits a FutureWarning for
# it, and under Copy-on-Write (the default from pandas 3.0) the update never
# reaches `df`, leaving the NaNs in place.
#
# NOTE(review): the mean is computed over the full dataset, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# imputed value. Consider computing it on the training split only.
mean = df['MonthlyIncome'].mean()

df['MonthlyIncome'] = df['MonthlyIncome'].fillna(mean)

In [None]:
# Impute missing age values with the column mean.
#
# The result is assigned back to the column instead of calling
# `df['age'].fillna(..., inplace=True)`: that form is a chained in-place update
# on a selected column. pandas 2.2+ emits a FutureWarning for it, and under
# Copy-on-Write (the default from pandas 3.0) the update never reaches `df`.
#
# NOTE(review): the mean is computed over the full dataset, before the
# train/test split performed later in the notebook.
mean = df['age'].mean()

df['age'] = df['age'].fillna(mean)

In [None]:
# Treat a missing NumberOfDependents as zero dependents.
#
# The result is assigned back to the column instead of calling
# `.fillna(0, inplace=True)` on the selected column: that chained in-place form
# emits a FutureWarning on pandas 2.2+ and does not modify `df` at all under
# Copy-on-Write (the default from pandas 3.0).
df['NumberOfDependents'] = df['NumberOfDependents'].fillna(0)

In [35]:
# Separate the target column from the feature matrix.
TARGET_COLUMN = 'SeriousDlqin2yrs'

y = df[TARGET_COLUMN]

X = df.drop(columns=[TARGET_COLUMN])

In [36]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation.
# The seed comes from the notebook-level RANDOM_STATE constant instead of a
# repeated literal, so the split stays in sync if the seed is ever changed.
# (RANDOM_STATE is 42, so the resulting split is the same as before.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE
)

# Show the sizes of both partitions as the cell output.
X_train.shape, X_test.shape

((120000, 11), (30000, 11))

**Pre-processing features:**

* encode the categorical features
* scale the numerical features

In [37]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder

# Columns handled as categories; every other feature column is treated as numeric.
categorical = ['RealEstateLoansOrLines', 'GroupAge']
numeric_features = [name for name in X_train.columns if name not in categorical]

# Ordinal-encode the categorical columns and min-max scale the numeric ones.
# (The step was renamed from 'ohe' to 'ordinal' when OneHotEncoder was
# replaced with OrdinalEncoder.)
column_transformer = ColumnTransformer(
    [
        ('ordinal', OrdinalEncoder(), categorical),
        ('scaling', MinMaxScaler(), numeric_features),
    ]
)

# Fit on the training split only, then apply the fitted transformer to the test split.
X_train_transformed = column_transformer.fit_transform(X_train)
X_test_transformed = column_transformer.transform(X_test)

**Restoring the column names**

In [39]:
# The transformer returns plain arrays, so rebuild labelled DataFrames.
# Column order follows the transformer definition: the ordinal-encoded
# columns first, then the scaled numeric ones.
ordinal_encoder = column_transformer.named_transformers_['ordinal']
lst = [*ordinal_encoder.get_feature_names_out(), *numeric_features]

X_train_transformed = pd.DataFrame(X_train_transformed, columns=lst)
X_test_transformed = pd.DataFrame(X_test_transformed, columns=lst)

# Preview the first rows of the transformed training features.
X_train_transformed.head()

Unnamed: 0,RealEstateLoansOrLines,GroupAge,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,0.0,3.0,1.437848e-06,0.486239,0.0,9.416892e-07,0.006381,0.155172,0.0,0.0,0.05
1,0.0,2.0,1.947668e-06,0.40367,0.0,2.500234e-07,0.00216,0.068966,0.0,0.0,0.15
2,0.0,4.0,1.776014e-06,0.614679,0.0,0.003504875,0.002217,0.155172,0.0,0.0,0.0
3,0.0,4.0,3.645582e-07,0.479758,0.0,5.97503e-07,0.003796,0.241379,0.0,0.0,0.15
4,0.0,3.0,8.246467e-07,0.46789,0.0,1.319643e-08,0.001969,0.086207,0.0,0.0,0.0


In [40]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with balanced class weights, fitted on the transformed
# training features. `fit` returns the estimator itself, so it is chained here.
model = LogisticRegression(
    solver='liblinear',
    class_weight='balanced',
    random_state=12345,
).fit(X_train_transformed, y_train)

# Keep only the second column of predict_proba, i.e. the probability of the
# second class in `model.classes_`, for every test row.
pred = model.predict_proba(X_test_transformed)[:, 1]

**Evaluating the quality of the model**

In [41]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

In [52]:
from sklearn.metrics import confusion_matrix

# Probability cut-off above which a test row is labelled as the positive class.
DECISION_THRESHOLD = 0.48
classes = (pred > DECISION_THRESHOLD).astype(int)

# Cell output: the confusion matrix together with the recall at this cut-off.
confusion_matrix(y_test, classes), recall_score(y_test, classes)

(array([[19078,  8884],
        [  580,  1458]], dtype=int64),
 0.7154072620215898)

In [44]:
import pickle

# Serialize the fitted classifier to disk.
# NOTE(review): only `model` is written here. It was fitted on the output of
# `column_transformer`, so scoring raw data later also needs that fitted
# transformer. Confirm it is persisted somewhere.
with open('model.pickle', mode='wb') as model_file:
    model_file.write(pickle.dumps(model))
