# **Построение модели**

В качестве baseline модели для решения задачи классификации мы используем `Logistic regression`. После чего попробуем использовать `KNN`, `Random forest`, `GBM` из библиотеки `xgboost`. Подберем гипперпараметры для данных моделей. Также сделаем отбор признаков в датасете, посмотрим как улучшится качество. Также можно выделить признаки с помощью `PCA`. Будем пытаться улучшить модели по метрике `AUC-ROC`.

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

%matplotlib inline

In [4]:
credit_score_df = pd.read_csv("~/Documents/datasets/transformed_credit_score.csv")

credit_score_df.head()

Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,7.0,...,_,809.98,26.82262,265.0,No,49.574949,80.415295,High_spent_Small_value_payments,312.494089,True
1,23.0,Scientist,19114.12,4194.150202,3.0,4.0,3.0,4.0,-1.0,30.9254,...,Good,809.98,31.94496,221.1933,No,49.574949,118.280222,Low_spent_Large_value_payments,284.629162,True
2,33.0,Scientist,19114.12,4194.150202,3.0,4.0,3.0,4.0,3.0,7.0,...,Good,809.98,28.609352,267.0,No,49.574949,81.699521,Low_spent_Medium_value_payments,331.209863,True
3,23.0,Scientist,19114.12,4194.150202,3.0,4.0,3.0,4.0,5.0,4.0,...,Good,809.98,31.377862,268.0,No,49.574949,199.458074,Low_spent_Small_value_payments,223.45131,True
4,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,6.0,30.9254,...,Good,809.98,24.797347,269.0,No,49.574949,41.420153,High_spent_Medium_value_payments,341.489231,True


In [5]:
#Выделим целевую переменную 
X = credit_score_df.drop(columns = ['Credit_Score'])
y = credit_score_df['Credit_Score']

In [6]:
#Разделим выборку на обучающую и тестовую
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [7]:
#Применим one-hot кодирование для категориальных данных
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

In [8]:
X_train.shape, X_test.shape

((69993, 43), (29998, 43))

In [9]:
#Нормализуем данные с помощью StandardScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)

X_std_train = scaler.transform(X_train)
X_std_test = scaler.transform(X_test)

In [32]:
X_std_train.shape, X_std_test.shape

((69993, 43), (29998, 43))

Построим `baseline` в виде `LogisticRegression`, подберём гипперпараметры, затем будем пытаться улучшить рекорд  с помощью других моделей.

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, f1_score

logistic_model = LogisticRegression()

logistic_model.fit(X_std_train, y_train)

print(f"AUC ROC score for logistic regression on train: {roc_auc_score(y_train, logistic_model.predict(X_std_train)):.2f}")
print(f"F1 score for logistic regression on train: {f1_score(y_train, logistic_model.predict(X_std_train)):.2f}")
print()
print(f"AUC ROC score for logistic regression on test: {roc_auc_score(y_test, logistic_model.predict(X_std_test)):.2f}")
print(f"F1 score for logistic regression on test: {f1_score(y_test, logistic_model.predict(X_std_test)):.2f}")

AUC ROC score for logistic regression on train: 0.66
F1 score for logistic regression on train: 0.84

AUC ROC score for logistic regression on test: 0.67
F1 score for logistic regression on test: 0.84


In [48]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


model = LogisticRegression()

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']  # 'liblinear' поддерживает 'l1', 'lbfgs' - только 'l2'
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='roc_auc')

grid_search.fit(X_train, y_train)

# Выводим лучшие параметры и лучший результат
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

Best parameters found:  {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}
Best cross-validation score: 0.79
