In [54]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/data-defoltik/cleaned_data(defoult).csv


In [55]:
# Импорт основных библиотек для работы:
# - warnings: отключаем ненужные предупреждения
# - pandas/numpy: работа с данными
# - seaborn/matplotlib: визуализация
# Также фиксируем стиль графиков и random_state для воспроизводимости
# Импорт всех необходимых библиотек для лабораторной работы №3

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings("ignore")


# seaborn was referenced here without ever being imported, so on a fresh kernel
# this cell stopped with NameError before RANDOM_STATE was defined.
# The plot theme is purely cosmetic, so a missing seaborn install is tolerated.
try:
    import seaborn as sns
except ImportError:
    sns = None
else:
    sns.set(style="whitegrid")

# Seed shared by the experiments for reproducibility.
RANDOM_STATE = 42

In [56]:
# Load the pre-cleaned UCI credit-scoring dataset.
# Show the frame's shape, its first rows and the class balance of the target
# to get a feel for the structure of the data.

df = pd.read_csv('/kaggle/input/data-defoltik/cleaned_data(defoult).csv')
print(df.shape)
# Only the last expression of a cell is rendered automatically; a bare
# `df.head()` in the middle was evaluated and silently discarded.
display(df.head())
df['default.payment.next.month'].value_counts(normalize=True)

(30000, 24)


default.payment.next.month
0    0.7788
1    0.2212
Name: proportion, dtype: float64

In [57]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   LIMIT_BAL                   30000 non-null  float64
 1   SEX                         30000 non-null  int64  
 2   EDUCATION                   30000 non-null  int64  
 3   MARRIAGE                    30000 non-null  int64  
 4   AGE                         30000 non-null  int64  
 5   PAY_0                       30000 non-null  int64  
 6   PAY_2                       30000 non-null  int64  
 7   PAY_3                       30000 non-null  int64  
 8   PAY_4                       30000 non-null  int64  
 9   PAY_5                       30000 non-null  int64  
 10  PAY_6                       30000 non-null  int64  
 11  BILL_AMT1                   30000 non-null  float64
 12  BILL_AMT2                   30000 non-null  float64
 13  BILL_AMT3                   300

In [58]:
# Separate the target column from the feature matrix.

target_col = "default.payment.next.month"
y = df[target_col]
X = df.drop(columns=[target_col])

X.head(), y.head()


(   LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
 0    20000.0    2          2         1   24      2      2     -1     -1   
 1   120000.0    2          2         2   26     -1      2      0      0   
 2    90000.0    2          2         2   34      0      0      0      0   
 3    50000.0    2          2         1   37      0      0      0      0   
 4    50000.0    1          2         1   57     -1      0     -1      0   
 
    PAY_5  ...  BILL_AMT3  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  \
 0     -2  ...      689.0        0.0        0.0        0.0       0.0     689.0   
 1      0  ...     2682.0     3272.0     3455.0     3261.0       0.0    1000.0   
 2      0  ...    13559.0    14331.0    14948.0    15549.0    1518.0    1500.0   
 3      0  ...    49291.0    28314.0    28959.0    29547.0    2000.0    2019.0   
 4      0  ...    35835.0    20940.0    19146.0    19131.0    2000.0   36681.0   
 
    PAY_AMT3  PAY_AMT4  PAY_AMT5  PAY_AMT6  
 0 

In [59]:
# Hold out 20% of the rows as a test set; stratify on the target so both
# parts keep the same class proportions.

split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [60]:
# Baseline: a decision tree with no constraints at all.
# Such a tree overfits, which is the expected result for a baseline.

dt_base = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred = dt_base.predict(X_test)
y_prob = dt_base.predict_proba(X_test)[:, 1]  # probability of class 1

# Label-based metrics share one call signature, so compute them in one pass;
# ROC-AUC is added last because it is scored on probabilities instead.
label_metrics = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1": f1_score,
}
baseline_results = {
    metric: scorer(y_test, y_pred) for metric, scorer in label_metrics.items()
}
baseline_results["ROC-AUC"] = roc_auc_score(y_test, y_prob)

baseline_results


{'Accuracy': 0.7186666666666667,
 'Precision': 0.3720765414599575,
 'Recall': 0.39562923888470236,
 'F1': 0.38349159970781593,
 'ROC-AUC': 0.6031840306295477}

In [None]:
# Hyper-parameter search for the decision tree.
# The grid is kept deliberately small: it runs quickly yet still gives a
# good-quality model.

params_fast = dict(
    max_depth=[3, 5, 7, 9],
    min_samples_leaf=[1, 5, 10],
    criterion=["gini", "entropy"],
    class_weight=["balanced"],  # held fixed because the class imbalance is strong
)

grid_fast = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params_fast,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=1,
).fit(X_train, y_train)

grid_fast.best_params_


Fitting 5 folds for each of 24 candidates, totalling 120 fits


{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 5,
 'min_samples_leaf': 10}

In [None]:
# Train a tree with the tuned hyper-parameters and report its test metrics.

# `best_params` was never assigned anywhere in the notebook (the search cell
# only displays `grid_fast.best_params_`), so a fresh-kernel run stopped here
# with NameError. Bind it explicitly from the fitted search object.
best_params = grid_fast.best_params_

dt_best = DecisionTreeClassifier(
    **best_params,
    random_state=42
)

dt_best.fit(X_train, y_train)

y_pred_best = dt_best.predict(X_test)
y_prob_best = dt_best.predict_proba(X_test)[:, 1]  # probability of class 1

improved_results = {
    "Accuracy": accuracy_score(y_test, y_pred_best),
    "Precision": precision_score(y_test, y_pred_best),
    "Recall": recall_score(y_test, y_pred_best),
    "F1": f1_score(y_test, y_pred_best),
    "ROC-AUC": roc_auc_score(y_test, y_prob_best)
}

improved_results


{'Accuracy': 0.7738333333333334,
 'Precision': 0.4899732620320856,
 'Recall': 0.5523737754333082,
 'F1': 0.5193057031526744,
 'ROC-AUC': 0.7580648568610165}

In [None]:
# Compare the metrics: one row per model, one column per metric.
metric_rows = [baseline_results, improved_results]
comparison = pd.DataFrame(metric_rows, index=["Baseline", "Improved"])

comparison


Unnamed: 0,Accuracy,Precision,Recall,F1,ROC-AUC
Baseline,0.718667,0.372077,0.395629,0.383492,0.603184
Improved,0.773833,0.489973,0.552374,0.519306,0.758065


In [None]:
# Создаем самописную модель решающего дерева
import numpy as np

class MyDecisionTreeClassifier:
    """Hand-written binary-split decision tree classifier using Gini impurity.

    For every feature only five candidate thresholds are tried: the 5th, 25th,
    50th, 75th and 95th percentiles of that feature inside the current node.
    The fitted tree is stored in ``self.tree`` as nested dicts: a leaf is
    ``{"leaf": True, "class": label}`` and an inner node is
    ``{"leaf": False, "feature": idx, "threshold": t, "left": ..., "right": ...}``.

    Parameters
    ----------
    max_depth : int or None
        Maximum depth of the tree; ``None`` means no depth limit.
    min_samples_leaf : int
        Minimum number of samples each side of a split must contain.
    """

    def __init__(self,
                 max_depth=None,
                 min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.tree = None  # set by fit()

    def _gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels ``y``."""
        _, counts = np.unique(y, return_counts=True)
        p = counts / len(y)
        return 1 - np.sum(p ** 2)

    def _majority_class(self, y):
        """Return the most frequent label in ``y`` (smallest label wins ties).

        Kept as a separate hook so subclasses can change how leaves are labelled.
        """
        values, counts = np.unique(y, return_counts=True)
        return values[np.argmax(counts)]

    def _best_split(self, X, y):
        """Find the (feature, threshold) pair with the lowest weighted Gini.

        Returns ``(None, None)`` when no candidate split leaves at least
        ``min_samples_leaf`` samples on both sides.
        """
        best_feat, best_thresh = None, None
        best_gini = 1e9

        n_samples, n_features = X.shape

        for feat in range(n_features):
            column = X[:, feat]  # hoisted: reused for every threshold
            thresholds = np.unique(np.percentile(column, [5, 25, 50, 75, 95]))

            for t in thresholds:
                left_mask = column <= t
                n_left = left_mask.sum()
                n_right = n_samples - n_left

                if n_left < self.min_samples_leaf or n_right < self.min_samples_leaf:
                    continue

                g_left = self._gini(y[left_mask])
                g_right = self._gini(y[~left_mask])
                # Impurity of the split: children weighted by their sizes.
                g = (g_left * n_left + g_right * n_right) / n_samples

                if g < best_gini:
                    best_gini = g
                    best_feat = feat
                    best_thresh = t

        return best_feat, best_thresh

    def _build_tree(self, X, y, depth):
        """Recursively build the sub-tree for the samples ``(X, y)`` at ``depth``."""
        # Pure node: nothing left to split.
        if len(np.unique(y)) == 1:
            return {"leaf": True, "class": y[0]}

        # Depth limit reached: label the leaf by majority vote.
        if self.max_depth is not None and depth >= self.max_depth:
            return {"leaf": True, "class": self._majority_class(y)}

        feat, thresh = self._best_split(X, y)
        if feat is None:
            # No admissible split under min_samples_leaf.
            return {"leaf": True, "class": self._majority_class(y)}

        left_mask = X[:, feat] <= thresh
        right_mask = ~left_mask

        return {
            "leaf": False,
            "feature": feat,
            "threshold": thresh,
            "left": self._build_tree(X[left_mask], y[left_mask], depth + 1),
            "right": self._build_tree(X[right_mask], y[right_mask], depth + 1)
        }

    def fit(self, X, y):
        """Build the tree from a feature matrix ``X`` and labels ``y``.

        Inputs are converted with ``np.asarray`` so pandas objects and nested
        lists work as well as NumPy arrays (positional indexing such as
        ``X[:, feat]`` and ``y[0]`` does not work on pandas objects).

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, the lengths differ, or there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if len(y) != X.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {len(y)}"
            )
        if len(y) == 0:
            raise ValueError("cannot fit a tree on an empty dataset")

        self.tree = self._build_tree(X, y, 0)
        return self

    def _predict_one(self, node, x):
        """Route the single sample ``x`` down from ``node`` and return the leaf label."""
        if node["leaf"]:
            return node["class"]

        if x[node["feature"]] <= node["threshold"]:
            return self._predict_one(node["left"], x)
        else:
            return self._predict_one(node["right"], x)

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if self.tree is None:
            raise RuntimeError("this tree is not fitted yet; call fit() first")
        X = np.asarray(X)  # iterate rows, not pandas column names
        return np.array([self._predict_one(self.tree, x) for x in X])


In [None]:
# Приводим данные к numpy
def to_dense(X):
    """Return ``X`` as a dense NumPy array.

    Objects exposing a ``toarray()`` method (e.g. SciPy sparse matrices) are
    converted through it; everything else goes through ``np.asarray``.
    Errors raised by ``toarray()`` itself propagate instead of being hidden
    by a bare ``except`` and silently replaced with ``np.asarray(X)``.
    """
    toarray = getattr(X, "toarray", None)
    if callable(toarray):
        return toarray()
    return np.asarray(X)

X_train_np, X_test_np = to_dense(X_train), to_dense(X_test)
y_train_np, y_test_np = np.asarray(y_train), np.asarray(y_test)


# Baseline hand-written tree
my_tree = MyDecisionTreeClassifier(max_depth=5, min_samples_leaf=10)
y_pred_my = my_tree.fit(X_train_np, y_train_np).predict(X_test_np)

# NOTE: this rebinds `baseline_results`, replacing the sklearn baseline metrics
# computed earlier in the notebook.
# ROC-AUC here is computed from hard class labels (the tree has no
# predict_proba), so it is not directly comparable with the probability-based
# ROC-AUC of the sklearn models.
baseline_results = {
    label: scorer(y_test_np, y_pred_my)
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
        ("ROC-AUC", roc_auc_score),
    )
}

print("=== MyDecisionTree Baseline ===")
print("\n".join(f"{label}: {score:.4f}" for label, score in baseline_results.items()))

=== MyDecisionTree Baseline ===
Accuracy: 0.8163
Precision: 0.6522
Recall: 0.3632
F1: 0.4666
ROC-AUC: 0.6541


In [None]:
# Improved version: emulate class_weight='balanced' by hand.
# Balanced class weights: w_c = n / (2 * n_c).
n = len(y_train_np)
n0, n1 = ((y_train_np == label).sum() for label in (0, 1))

w0, w1 = n / (2 * n0), n / (2 * n1)
weights = {0: w0, 1: w1}

print("\nBalanced class weights:", weights)


# Change how leaves are labelled while the tree is built: use a class-weighted
# majority vote instead of a plain majority vote.
class MyDecisionTreeImproved(MyDecisionTreeClassifier):
    """Decision tree whose leaves are labelled by a class-weighted vote.

    Balanced weights ``n_samples / (n_classes * count_c)`` are derived in
    :meth:`fit` from the labels actually passed in and stored in
    ``self.class_weight_``, so the class no longer depends on module-level
    ``w0`` / ``w1`` variables and is not restricted to labels 0 and 1.
    Only leaf labelling is weighted; the split search is inherited unchanged.
    """

    def fit(self, X, y):
        """Compute balanced class weights from ``y``, then build the tree.

        Returns
        -------
        self
        """
        y = np.asarray(y)
        classes, counts = np.unique(y, return_counts=True)
        n_samples, n_classes = len(y), len(classes)
        # For two classes this equals n / (2 * n_c).
        self.class_weight_ = {
            cls: n_samples / (n_classes * cnt) for cls, cnt in zip(classes, counts)
        }
        return super().fit(X, y)

    def _majority_class(self, y):
        """Return the label with the largest weighted count in ``y``.

        Ties go to the smallest label (``np.argmax`` returns the first maximum
        and ``np.unique`` sorts the labels), matching the previous
        "1 only if strictly greater, otherwise 0" rule for binary labels.
        """
        values, counts = np.unique(y, return_counts=True)
        class_weights = np.array([self.class_weight_.get(v, 0.0) for v in values])
        return values[np.argmax(counts * class_weights)]

    def _build_tree(self, X, y, depth):
        """Recursively build the sub-tree, labelling leaves by weighted vote."""
        # Pure node: nothing left to split.
        if len(np.unique(y)) == 1:
            return {"leaf": True, "class": y[0]}

        # Depth limit reached.
        if self.max_depth is not None and depth >= self.max_depth:
            return {"leaf": True, "class": self._majority_class(y)}

        feat, thresh = self._best_split(X, y)
        if feat is None:
            # No admissible split under min_samples_leaf.
            return {"leaf": True, "class": self._majority_class(y)}

        left_mask = X[:, feat] <= thresh
        right_mask = ~left_mask

        return {
            "leaf": False,
            "feature": feat,
            "threshold": thresh,
            "left": self._build_tree(X[left_mask], y[left_mask], depth + 1),
            "right": self._build_tree(X[right_mask], y[right_mask], depth + 1)
        }


# Improved hand-written tree
my_tree_imp = MyDecisionTreeImproved(max_depth=5, min_samples_leaf=10)
y_pred_my_imp = my_tree_imp.fit(X_train_np, y_train_np).predict(X_test_np)

# NOTE: this rebinds `improved_results`, replacing the tuned sklearn tree's
# metrics computed earlier in the notebook.
# ROC-AUC here is computed from hard class labels rather than probabilities.
improved_results = {
    label: scorer(y_test_np, y_pred_my_imp)
    for label, scorer in (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
        ("ROC-AUC", roc_auc_score),
    )
}

print("\n=== MyDecisionTree Improved ===")
print("\n".join(f"{label}: {score:.4f}" for label, score in improved_results.items()))


Balanced class weights: {0: 0.6420202236370446, 1: 2.2603126765869277}

=== MyDecisionTree Improved ===
Accuracy: 0.7710
Precision: 0.4842
Recall: 0.5426
F1: 0.5117
ROC-AUC: 0.6892
