# **Построение модели**

В качестве baseline модели для решения задачи классификации мы используем `Logistic regression`. После чего попробуем использовать `KNN`, `Random forest`, `GBM` из библиотеки `xgboost`. Подберем гипперпараметры для данных моделей. Также сделаем отбор признаков в датасете, посмотрим как улучшится качество. Также можно выделить признаки с помощью `PCA`. Будем пытаться улучшить модели по метрике `AUC-ROC`.

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

%matplotlib inline

In [4]:
credit_score_df = pd.read_csv("~/Documents/datasets/transformed_credit_score.csv")

credit_score_df.head()

Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,...,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,3.0,7.0,...,_,809.98,26.82262,265.0,No,49.574949,80.415295,High_spent_Small_value_payments,312.494089,True
1,23.0,Scientist,19114.12,4194.150202,3.0,4.0,3.0,4.0,-1.0,30.9254,...,Good,809.98,31.94496,221.1933,No,49.574949,118.280222,Low_spent_Large_value_payments,284.629162,True
2,33.0,Scientist,19114.12,4194.150202,3.0,4.0,3.0,4.0,3.0,7.0,...,Good,809.98,28.609352,267.0,No,49.574949,81.699521,Low_spent_Medium_value_payments,331.209863,True
3,23.0,Scientist,19114.12,4194.150202,3.0,4.0,3.0,4.0,5.0,4.0,...,Good,809.98,31.377862,268.0,No,49.574949,199.458074,Low_spent_Small_value_payments,223.45131,True
4,23.0,Scientist,19114.12,1824.843333,3.0,4.0,3.0,4.0,6.0,30.9254,...,Good,809.98,24.797347,269.0,No,49.574949,41.420153,High_spent_Medium_value_payments,341.489231,True


In [5]:
#Выделим целевую переменную 
X = credit_score_df.drop(columns = ['Credit_Score'])
y = credit_score_df['Credit_Score']

X = pd.get_dummies(X, drop_first=True)

In [6]:
#Разделим выборку на обучающую и тестовую
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify=y,shuffle=True)

In [7]:
X_train.shape, X_test.shape

((79992, 43), (19999, 43))

In [8]:
#Нормализуем данные с помощью StandardScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)

X_train_std = scaler.transform(X_train)
X_test_std = scaler.transform(X_test)

Построим `baseline` в виде `LogisticRegression`, подберём гипперпараметры, затем будем пытаться улучшить рекорд  с помощью других моделей.

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, f1_score

logistic_model = LogisticRegression()

logistic_model.fit(X_train_std, y_train)

print(f"AUC ROC score for logistic regression on train: {roc_auc_score(y_train, logistic_model.predict(X_train_std)):.2f}")
print(f"F1 score for logistic regression on train: {f1_score(y_train, logistic_model.predict(X_train_std)):.2f}\n")
print(f"AUC ROC score for logistic regression on test: {roc_auc_score(y_test, logistic_model.predict(X_test_std)):.2f}")
print(f"F1 score for logistic regression on test: {f1_score(y_test, logistic_model.predict(X_test_std)):.2f}")

AUC ROC score for logistic regression on train: 0.66
F1 score for logistic regression on train: 0.84

AUC ROC score for logistic regression on test: 0.66
F1 score for logistic regression on test: 0.84


Подберем гипперпараметры с помощью `GridSearchCV`.

In [12]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}

grid_search1 = GridSearchCV(estimator=logistic_model, param_grid=param_grid, cv=5, scoring='roc_auc')

grid_search1.fit(X_train_std, y_train)

print("Best Parameters:", grid_search1.best_params_)
print("Best Score:", grid_search1.best_score_)
print("Test Score:", grid_search1.score(X_test_std, y_test))

Best Parameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score: 0.7845980203143406
Test Score: 0.7848790104208406


Подберем гипперпараметры более масштабно с помощью `RandomizedSearch`.

Метрики практически не изменились, зафиксируем лучшие параметры для `LogisticRegression`.

In [15]:
from sklearn.metrics import roc_auc_score, classification_report, f1_score

best_logistic_model = LogisticRegression(C = 0.1, penalty = 'l1', solver = 'liblinear')

best_logistic_model.fit(X_train_std, y_train)

print(f"AUC ROC score for logistic regression on test: {roc_auc_score(y_test, best_logistic_model.predict(X_test_std)):.2f}")
print(classification_report(y_test, best_logistic_model.predict(X_test_std)))

AUC ROC score for logistic regression on test: 0.66
              precision    recall  f1-score   support

       False       0.64      0.41      0.50      5800
        True       0.79      0.91      0.84     14199

    accuracy                           0.76     19999
   macro avg       0.71      0.66      0.67     19999
weighted avg       0.75      0.76      0.74     19999



In [49]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, cross_val_score

pipeline = make_pipeline(StandardScaler(), LogisticRegression(C=0.1, penalty='l1', solver='liblinear'))

kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=kf, scoring='roc_auc')

print("Mean AUC-ROC with cross validation on logistic regression:", np.mean(scores))

Средний AUC ROC с использованием кросс-валидации: 0.7843576776831229


Теперь обучим модель `RandomForestClassifier`, подберем гипперпараметры и посмотрим как улучшится `AUC_ROC`.

In [45]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()

rf_model.fit(X_train_std, y_train)

print(f"AUC ROC score for random forest on test: {roc_auc_score(y_test, rf_model.predict(X_test_std)):.2f}")
print(classification_report(y_test, rf_model.predict(X_test_std)))

AUC ROC score for random forest on test: 0.84
              precision    recall  f1-score   support

       False       0.79      0.76      0.78      5800
        True       0.90      0.92      0.91     14199

    accuracy                           0.87     19999
   macro avg       0.85      0.84      0.84     19999
weighted avg       0.87      0.87      0.87     19999



In [43]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': [50, 100, 200], 
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [5, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rand_search1 = RandomizedSearchCV(estimator=rf_model, 
                                  param_distributions=param_dist, 
                                  cv=3,
                                  n_iter=30,
                                  scoring='roc_auc', 
                                  verbose = 3)

rand_search1.fit(X_train_std, y_train)

print("Best Parameters:", rand_search1.best_params_)
print("Best Score:", rand_search1.best_score_)
print("Test Score:", rand_search1.score(X_test_std, y_test))

Fitting 3 folds for each of 30 candidates, totalling 90 fits
[CV 1/3] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=50;, score=nan total time=   0.0s
[CV 2/3] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=50;, score=nan total time=   0.0s
[CV 3/3] END max_depth=10, max_features=auto, min_samples_leaf=4, min_samples_split=5, n_estimators=50;, score=nan total time=   0.0s
[CV 1/3] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=0.869 total time=  10.9s
[CV 2/3] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=0.870 total time=  11.0s
[CV 3/3] END max_depth=10, max_features=log2, min_samples_leaf=2, min_samples_split=10, n_estimators=200;, score=0.872 total time=  10.1s
[CV 1/3] END max_depth=30, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=200;, score=0.912 total

In [47]:
best_rf_model = RandomForestClassifier(n_estimators = 200, 
                                       min_samples_split = 2, 
                                       min_samples_leaf = 1, 
                                       max_features = 'log2', 
                                       max_depth = 30)

best_rf_model.fit(X_train_std, y_train)

print(f"AUC ROC score for best random forest on test: {roc_auc_score(y_test, best_rf_model.predict(X_test_std)):.2f}")
print("Classification report:\n", classification_report(y_test, best_rf_model.predict(X_test_std)))

AUC ROC score for best random forest on test: 0.84
Classification report:
               precision    recall  f1-score   support

       False       0.80      0.75      0.77      5800
        True       0.90      0.92      0.91     14199

    accuracy                           0.87     19999
   macro avg       0.85      0.84      0.84     19999
weighted avg       0.87      0.87      0.87     19999



In [51]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, cross_val_score

pipeline = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators = 200, 
                                       min_samples_split = 2, 
                                       min_samples_leaf = 1, 
                                       max_features = 'log2', 
                                       max_depth = 30))

kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y, cv=kf, scoring='roc_auc')

print("Mean AUC-ROC with cross validation on fandom forest:", np.mean(scores))

Mean AUC-ROC with cross validation on fandom forest: 0.9261324394787753


Теперь обучим модель `KNeighborsClassifier`, подберем гипперпараметры и посмотрим как улучшится `AUC_ROC`.

In [60]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(n_neighbors = 5)

knn_model.fit(X_train_std, y_train)

print(f"AUC ROC score for k-nearest neighbors on train: {roc_auc_score(y_train, knn_model.predict(X_train_std)):.2f}")
print(f"F1 score for k-nearest neighbors on train: {f1_score(y_train, knn_model.predict(X_train_std)):.2f}\n")
print(f"AUC ROC score for k-nearest neighbors on test: {roc_auc_score(y_test, knn_model.predict(X_test_std)):.2f}")
print(f"F1 score for k-nearest neighbors on test: {f1_score(y_test, knn_model.predict(X_test_std)):.2f}")

AUC ROC score for k-nearest neighbors on train: 0.77
F1 score for k-nearest neighbors on train: 0.89

AUC ROC score for k-nearest neighbors on test: 0.69
F1 score for k-nearest neighbors on test: 0.85


In [None]:
param_dist = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

grid_search2 = GridSearchCV(estimator=knn_model, 
                            param_grid=param_dist, 
                            cv=5, 
                            scoring='roc_auc',
                            verbose = 3)

grid_search2.fit(X_train_std, y_train)

print("Best Parameters:", grid_search2.best_params_)
print("Best Score:", grid_search2.best_score_)
print("Test Score:", grid_search2.score(X_test_std, y_test))

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.740 total time=   1.5s
[CV 2/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.735 total time=   1.4s
[CV 3/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.737 total time=   1.4s
[CV 4/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.742 total time=   1.4s
[CV 5/5] END metric=euclidean, n_neighbors=3, weights=uniform;, score=0.738 total time=   1.4s
[CV 1/5] END metric=euclidean, n_neighbors=3, weights=distance;, score=0.746 total time=   1.6s
[CV 2/5] END metric=euclidean, n_neighbors=3, weights=distance;, score=0.743 total time=   1.5s
[CV 3/5] END metric=euclidean, n_neighbors=3, weights=distance;, score=0.742 total time=   1.4s
[CV 4/5] END metric=euclidean, n_neighbors=3, weights=distance;, score=0.749 total time=   1.4s
[CV 5/5] END metric=euclidean, n_neighbors=3, weights=distance;, score=0.745 to