### Модель K-ближайших соседей для прогнозирования возвратов кредитов

Подключим необходимые модули

In [1]:
from google.colab import drive
import pandas as pd
import sklearn

drive.mount('/content/drive')

Mounted at /content/drive


Загрузим заранее предобработанный датасет

In [2]:
df = pd.read_csv("/content/drive/MyDrive/bank_churners_preprocessed.csv")
df

Unnamed: 0.1,Unnamed: 0,Attrition_Flag,Customer_Age,Dependent_count,Education_Level,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,...,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,c_F,c_M,c_Divorced,c_Married,c_Single,c_Unknown
0,0,0,45,3,2,3,0,39,5,1,...,1144,42,1.625,0.061,0,1,0,1,0,0
1,1,0,49,5,5,1,0,44,6,1,...,1291,33,3.714,0.105,1,0,0,0,1,0
2,2,0,51,3,5,4,0,36,4,1,...,1887,20,2.333,0.000,0,1,0,1,0,0
3,3,0,40,4,2,1,0,34,3,4,...,1171,20,2.333,0.760,1,0,0,0,0,1
4,4,0,40,3,1,3,0,21,5,1,...,816,28,2.500,0.000,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,10122,0,50,2,5,2,0,40,3,2,...,15476,117,0.857,0.462,0,1,0,0,1,0
10123,10123,1,41,2,0,2,0,25,4,2,...,8764,69,0.683,0.511,0,1,1,0,0,0
10124,10124,1,44,1,2,1,0,36,5,3,...,10291,60,0.818,0.000,1,0,0,1,0,0
10125,10125,1,30,2,5,2,0,36,4,3,...,8395,62,0.722,0.000,0,1,0,0,0,1


Удалим ненужную колонку Unnamed

In [3]:
df.drop(columns = ["Unnamed: 0"], inplace=True)

Подключим модули для обучения модели

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

Разделяем данные на выборки, пропишем параметр stratify для предохранения от дисбаланса классов

In [5]:
y = df['Attrition_Flag']
X = df.drop(columns = ['Attrition_Flag'])
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2)

Проверим модель с параметром ближайших соседей = 5

In [10]:
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
print(classification_report(y_test, knn.predict(X_test)))
print(confusion_matrix(y_test, knn.predict(X_test)))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1701
           1       0.71      0.54      0.61       325

    accuracy                           0.89      2026
   macro avg       0.82      0.75      0.78      2026
weighted avg       0.88      0.89      0.89      2026

[[1631   70]
 [ 150  175]]


В целом результат неплохой, но попробуем его улучшить, подобрав оптимальное количество соседей

In [12]:
parameters = {'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9]}

knn_optimal = GridSearchCV(KNeighborsClassifier(), parameters).fit(X_train, y_train)
knn_optimal.best_params_

{'n_neighbors': 7}

In [13]:
print(classification_report(y_test, knn_optimal.predict(X_test)))
print(confusion_matrix(y_test, knn_optimal.predict(X_test)))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94      1701
           1       0.74      0.55      0.63       325

    accuracy                           0.90      2026
   macro avg       0.83      0.76      0.79      2026
weighted avg       0.89      0.90      0.89      2026

[[1638   63]
 [ 146  179]]


Стало немного лучше, но стоит попробовать выполнить скалирование данных и снова подберем параметры

In [15]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
parameters = {'n_neighbors': [3, 5, 7, 9]}

knn_optimal = GridSearchCV(KNeighborsClassifier(), parameters).fit(X_train, y_train)
knn_optimal.best_params_

{'n_neighbors': 5}

In [16]:
print(classification_report(y_test, knn_optimal.predict(X_test)))
print(confusion_matrix(y_test, knn_optimal.predict(X_test)))

              precision    recall  f1-score   support

           0       0.91      0.98      0.94      1701
           1       0.83      0.49      0.62       325

    accuracy                           0.90      2026
   macro avg       0.87      0.74      0.78      2026
weighted avg       0.90      0.90      0.89      2026

[[1669   32]
 [ 166  159]]


Можно заметить, что до скалирования результат был лучше, но в целом картина неплохая