## Классические алгоритмы без ансамблирования
В этом ноутбуке вам нужно обучить модели на датасете классификации из предыдущего ноутбука и сравнить результаты. Вам предоставлен baseline, на основе которого вы будете дорабатывать предсказательные модели. Оценка за лабораторную работу зависит от ROC-AUC на тестовых данных по следующим критериям:
\
AUC - на тестовых данных
- $AUC \leq 0.75$ - 0 баллов
- $0.75 < AUC \leq 0.76$ - 2 балла
- $0.76 < AUC \leq 0.77$ - 4 балла
- $0.77 < AUC \leq 0.78$ - 6 баллов
- $0.78 < AUC \leq 0.79$ - 8 баллов
- $AUC > 0.79$ - 10 баллов

\
В этой работе запрещено использовать ансамбли моделей (лес, бустинги и т.д.)!

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve, accuracy_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the semicolon-separated credit dataset and preview its first rows.
data = pd.read_csv('german.csv', sep=';')
print(data.head())

# Column 0 is used as the target; every remaining column is a feature.
target_column = data.iloc[:, 0]
feature_columns = data.iloc[:, 1:]
y = target_column.to_numpy()
X = feature_columns.to_numpy()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Recorded per-column summary (every value is 0).
# NOTE(review): this looks like the output of a missing-value count such as
# data.isnull().sum(), not of the print(data.head()) call above — confirm.
# Creditability                        0
# Account_Balance                      0
# Duration_of_Credit_monthly           0
# Payment_Status_of_Previous_Credit    0
# Purpose                              0
# Credit_Amount                        0
# Value_Savings_Stocks                 0
# Length_of_current_employment         0
# Instalment_per_cent                  0
# Sex_Marital_Status                   0
# Guarantors                           0
# Duration_in_Current_address          0
# Most_valuable_available_asset        0
# Age_years                            0
# Concurrent_Credits                   0
# Type_of_apartment                    0
# No_of_Credits_at_this_Bank           0
# Occupation                           0
# No_of_dependents                     0
# Telephone                            0
# Foreign_Worker                       0
# dtype: int64

In [None]:
# Plot how many training samples fall into each class.
# Explicit bin edges at -0.5 / 0.5 / 1.5 centre the two bars on the tick
# labels 0 and 1; bins=2 would split the [0, 1] range at 0.5 and leave both
# ticks on the outer bar edges.
# Assumes y_train contains only the labels 0 and 1, as the x-axis label states.
plt.hist(y_train, bins=[-0.5, 0.5, 1.5], edgecolor='k')
plt.xticks([0, 1])
plt.xlabel('Class (0: Non-Creditworthy, 1: Creditworthy)')
plt.ylabel('Count')
plt.title('Distribution of Classes in Training Data')
plt.show()

# Saved in the project as Figure1.

In [None]:
import multiprocessing

# Use half of the available cores, but never fewer than one worker:
# cpu_count() // 2 is 0 on a single-core machine and n_jobs=0 is rejected.
n_search_jobs = max(1, multiprocessing.cpu_count() // 2)


def _fit_best_estimator(estimator, param_grid, X, y, n_jobs):
    """Grid-search ``estimator`` over ``param_grid`` and return the best refit model.

    Candidates are ranked by ROC AUC using 5-fold stratified, shuffled
    cross-validation with a fixed seed, so repeated runs use the same folds.

    Args:
        estimator: Unfitted scikit-learn classifier to tune.
        param_grid: Mapping of parameter name to the list of values to try.
        X: Training feature matrix.
        y: Training labels.
        n_jobs: Number of parallel workers passed to ``GridSearchCV``.

    Returns:
        The ``best_estimator_`` of the fitted search.
    """
    search = GridSearchCV(
        estimator,
        param_grid,
        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
        scoring='roc_auc',
        verbose=0,
        n_jobs=n_jobs,
    )
    search.fit(X, y)
    return search.best_estimator_


# --- Logistic Regression ---
# NOTE(review): features are passed unscaled; 'saga' and KNN are sensitive to
# feature scale — consider standardising inside a Pipeline.
logistic_params = {
    'C': np.logspace(-1, 1, 10),
    'solver': ['liblinear', 'saga'],  # both solvers support l1 and l2
    'penalty': ['l1', 'l2'],
    'max_iter': [2000, 5000],
}
logistic_regression_model = _fit_best_estimator(
    LogisticRegression(random_state=42),
    logistic_params,
    X_train,
    y_train,
    n_search_jobs,
)


# --- Decision Tree ---
tree_params = {
    'max_depth': [8, 12, 18, None],
    'min_samples_split': [3, 8, 15],
    'min_samples_leaf': [2, 4, 8],
    'criterion': ['gini', 'entropy'],
    'min_impurity_decrease': [0.0, 0.003, 0.008],
}
decision_tree_model = _fit_best_estimator(
    DecisionTreeClassifier(random_state=42),
    tree_params,
    X_train,
    y_train,
    n_search_jobs,
)


# --- K-Nearest Neighbors ---
# Only 'p' is searched with the default Minkowski metric: p=1 is the Manhattan
# distance and p=2 the Euclidean one, so also listing metric='manhattan'
# (which ignores p) only re-fitted duplicate candidates.
knn_params = {
    'n_neighbors': [5, 10, 15, 20],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}
knn_model = _fit_best_estimator(
    KNeighborsClassifier(),
    knn_params,
    X_train,
    y_train,
    n_search_jobs,
)

# NOTE: the log lines below were recorded from an earlier configuration
# (10 folds; 20 / 480 / 96 candidates). They do not match the 5-fold grids
# above, and with verbose=0 the searches print nothing.
# Fitting 10 folds for each of 20 candidates, totalling 200 fits
# Fitting 10 folds for each of 480 candidates, totalling 4800 fits
# Fitting 10 folds for each of 96 candidates, totalling 960 fits

In [None]:
# Evaluate the three tuned models on the held-out test set.
fitted_models = (logistic_regression_model, decision_tree_model, knn_model)

# Scores from column 1 of predict_proba (the second class; label 1 for 0/1 targets).
y_prob_logistic, y_prob_decision_tree, y_prob_knn = (
    model.predict_proba(X_test)[:, 1] for model in fitted_models
)
positive_probabilities = (y_prob_logistic, y_prob_decision_tree, y_prob_knn)

# Hard class predictions, used by the threshold-based metrics below.
y_pred_logistic, y_pred_decision_tree, y_pred_knn = (
    model.predict(X_test) for model in fitted_models
)
class_predictions = (y_pred_logistic, y_pred_decision_tree, y_pred_knn)

accuracy_logistic, accuracy_decision_tree, accuracy_knn = (
    accuracy_score(y_test, y_pred) for y_pred in class_predictions
)

# ROC AUC is computed from the probability scores, not the hard predictions.
roc_auc_logistic, roc_auc_decision_tree, roc_auc_knn = (
    roc_auc_score(y_test, y_prob) for y_prob in positive_probabilities
)

precision_logistic, precision_decision_tree, precision_knn = (
    precision_score(y_test, y_pred) for y_pred in class_predictions
)

recall_logistic, recall_decision_tree, recall_knn = (
    recall_score(y_test, y_pred) for y_pred in class_predictions
)

# Report each metric for all three models, one metric group at a time.
print(f'Accuracy of Logistic Regression: {accuracy_logistic}')
print(f'Accuracy of Decision Tree: {accuracy_decision_tree}')
print(f'Accuracy of K-Nearest Neighbors: {accuracy_knn}')

print(f'ROC AUC of Logistic Regression: {roc_auc_logistic}')
print(f'ROC AUC of Decision Tree: {roc_auc_decision_tree}')
print(f'ROC AUC of K-Nearest Neighbors: {roc_auc_knn}')

print(f'Precision of Logistic Regression: {precision_logistic}')
print(f'Precision of Decision Tree: {precision_decision_tree}')
print(f'Precision of K-Nearest Neighbors: {precision_knn}')

print(f'Recall of Logistic Regression: {recall_logistic}')
print(f'Recall of Decision Tree: {recall_decision_tree}')
print(f'Recall of K-Nearest Neighbors: {recall_knn}')

# Recorded output from an earlier run.
# NOTE(review): the three "Fitting ..." lines are grid-search log output; this
# cell performs no fitting, so they were presumably captured together with a
# previous training run — confirm before relying on the numbers below.
# Fitting 10 folds for each of 20 candidates, totalling 200 fits
# Fitting 10 folds for each of 480 candidates, totalling 4800 fits
# Fitting 10 folds for each of 96 candidates, totalling 960 fits
# Accuracy of Logistic Regression: 0.755
# Accuracy of Decision Tree: 0.705
# Accuracy of K-Nearest Neighbors: 0.755
# ROC AUC of Logistic Regression: 0.7916082281439926
# ROC AUC of Decision Tree: 0.7398901355773726
# ROC AUC of K-Nearest Neighbors: 0.7578307620383358
# Precision of Logistic Regression: 0.779874213836478
# Precision of Decision Tree: 0.8110236220472441
# Precision of K-Nearest Neighbors: 0.7696969696969697
# Recall of Logistic Regression: 0.8985507246376812
# Recall of Decision Tree: 0.7463768115942029
# Recall of K-Nearest Neighbors: 0.9202898550724637

## Экспериментируйте
Для получения лучшего качества придется поэкспериментировать. Подсказка: попробуйте оптимизировать гиперпараметры модели