In [1]:
import numpy as np
import pandas as pd
import pickle   # сохранение модели

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn import svm
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error as mse, r2_score as r2
from sklearn.metrics import roc_auc_score, roc_curve, auc, confusion_matrix,  accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('train_lr_2_processed.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 42 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Annual Income                   7500 non-null   float64
 1   Tax Liens                       7500 non-null   float64
 2   Number of Open Accounts         7500 non-null   float64
 3   Years of Credit History         7500 non-null   float64
 4   Maximum Open Credit             7500 non-null   float64
 5   Number of Credit Problems       7500 non-null   float64
 6   Months since last delinquent    7500 non-null   float64
 7   Bankruptcies                    7500 non-null   float64
 8   Current Loan Amount             7500 non-null   float64
 9   Current Credit Balance          7500 non-null   float64
 10  Monthly Debt                    7500 non-null   float64
 11  Credit Score                    7500 non-null   float64
 12  Credit Default                  75

In [3]:
feature_names = df.drop(columns=['Credit Default', 'Purpose_renewable energy']).columns.tolist()
target_name = 'Credit Default'
df = df[feature_names + [target_name]]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7500 entries, 0 to 7499
Data columns (total 41 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Annual Income                   7500 non-null   float64
 1   Tax Liens                       7500 non-null   float64
 2   Number of Open Accounts         7500 non-null   float64
 3   Years of Credit History         7500 non-null   float64
 4   Maximum Open Credit             7500 non-null   float64
 5   Number of Credit Problems       7500 non-null   float64
 6   Months since last delinquent    7500 non-null   float64
 7   Bankruptcies                    7500 non-null   float64
 8   Current Loan Amount             7500 non-null   float64
 9   Current Credit Balance          7500 non-null   float64
 10  Monthly Debt                    7500 non-null   float64
 11  Credit Score                    7500 non-null   float64
 12  Home Ownership_Home Mortgage    75

In [4]:
feature_names_for_stand = df[feature_names].select_dtypes(include=['float64', 'int64']).columns.tolist()
scaler = StandardScaler()
stand_features = scaler.fit_transform(df[feature_names_for_stand])

In [5]:
df[feature_names_for_stand] = pd.DataFrame(stand_features, columns=feature_names_for_stand)

In [6]:
df['Credit Default'].value_counts()

Credit Default
0    5387
1    2113
Name: count, dtype: int64

In [7]:
X = df[feature_names]
y = df[target_name]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True, random_state=42,  stratify=y)

In [8]:
display(y_train.value_counts(normalize=True),
y_test.value_counts(normalize=True))

Credit Default
0    0.718209
1    0.281791
Name: proportion, dtype: float64

Credit Default
0    0.718384
1    0.281616
Name: proportion, dtype: float64

In [9]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    print('TRAIN\n\n' + classification_report(y_train_true, y_train_pred))
    print('TEST\n\n' + classification_report(y_test_true, y_test_pred))
    print('CONFUSION MATRIX\n')
    print(pd.crosstab(y_test_true, y_test_pred))


In [10]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [11]:
pred_train = logreg.predict(X_train)
pred_test = logreg.predict(X_test)


In [12]:
get_classification_report(y_train, pred_train, y_test, pred_test)

TRAIN

              precision    recall  f1-score   support

           0       0.73      0.97      0.84      3609
           1       0.57      0.10      0.17      1416

    accuracy                           0.73      5025
   macro avg       0.65      0.54      0.50      5025
weighted avg       0.69      0.73      0.65      5025

TEST

              precision    recall  f1-score   support

           0       0.73      0.97      0.84      1778
           1       0.59      0.10      0.17       697

    accuracy                           0.73      2475
   macro avg       0.66      0.54      0.50      2475
weighted avg       0.69      0.73      0.65      2475

CONFUSION MATRIX

col_0              0   1
Credit Default          
0               1729  49
1                627  70


In [13]:
vectmach = svm.SVC()
vectmach.fit(X_train, y_train)


In [14]:
pred_train = vectmach.predict(X_train)
pred_test = vectmach.predict(X_test)


In [15]:
get_classification_report(y_train, pred_train, y_test, pred_test)

TRAIN

              precision    recall  f1-score   support

           0       0.73      1.00      0.84      3609
           1       1.00      0.04      0.08      1416

    accuracy                           0.73      5025
   macro avg       0.86      0.52      0.46      5025
weighted avg       0.80      0.73      0.63      5025

TEST

              precision    recall  f1-score   support

           0       0.72      1.00      0.84      1778
           1       0.60      0.01      0.02       697

    accuracy                           0.72      2475
   macro avg       0.66      0.50      0.43      2475
weighted avg       0.69      0.72      0.61      2475

CONFUSION MATRIX

col_0              0  1
Credit Default         
0               1774  4
1                691  6


In [16]:
GB = GradientBoostingClassifier(n_estimators=100, learning_rate=1,
                                 max_depth=5, random_state=2)
GB.fit(X_train, y_train)

In [17]:
pred_train = GB.predict(X_train)
pred_test = GB.predict(X_test)

In [18]:
get_classification_report(y_train, pred_train, y_test, pred_test)

TRAIN

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3609
           1       1.00      1.00      1.00      1416

    accuracy                           1.00      5025
   macro avg       1.00      1.00      1.00      5025
weighted avg       1.00      1.00      1.00      5025

TEST

              precision    recall  f1-score   support

           0       0.79      0.84      0.81      1778
           1       0.51      0.43      0.47       697

    accuracy                           0.72      2475
   macro avg       0.65      0.63      0.64      2475
weighted avg       0.71      0.72      0.71      2475

CONFUSION MATRIX

col_0              0    1
Credit Default           
0               1485  293
1                397  300


In [19]:
df_1 = pd.read_csv('test_lr_2_processed.csv')
GB = GradientBoostingClassifier(n_estimators=100, learning_rate=1,
                                 max_depth=5, random_state=2)
GB.fit(X_test, y_test)
y_train_preds = GB.predict(df_1)


In [20]:
nbrs = KNeighborsClassifier(n_neighbors=13)
nbrs.fit(X_train, y_train)


In [21]:
pred_train = nbrs.predict(X_train)
pred_test = nbrs.predict(X_test)


In [22]:
get_classification_report(y_train, pred_train, y_test, pred_test)

TRAIN

              precision    recall  f1-score   support

           0       0.75      0.97      0.84      3609
           1       0.66      0.18      0.28      1416

    accuracy                           0.74      5025
   macro avg       0.71      0.57      0.56      5025
weighted avg       0.72      0.74      0.68      5025

TEST

              precision    recall  f1-score   support

           0       0.73      0.95      0.82      1778
           1       0.44      0.10      0.17       697

    accuracy                           0.71      2475
   macro avg       0.59      0.53      0.50      2475
weighted avg       0.65      0.71      0.64      2475

CONFUSION MATRIX

col_0              0   1
Credit Default          
0               1687  91
1                625  72


In [23]:
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'learning_rate': [0.1, 0.3, 0.5, 1],
#     'max_depth': [1, 5, 7, 9],
#     'random_state': [5, 6, 7, 8]
# }

# # Инициализация модели
# gs = GridSearchCV(GB, param_grid,
#                   scoring= 'f1', # метрика
#                   cv=5,
#                   n_jobs=-1
#                   )
# gs.fit(X_test, y_test)

In [24]:
# gs.best_params_

In [25]:
len(y_train_preds)

2500

In [26]:
def save_object_to_csv(obj, filename):
    """
    Сохраняет объект (например, numpy.ndarray, DataFrame или Series) в CSV файл.
    
    :param obj: Объект для сохранения (может быть numpy.ndarray, DataFrame или Series).
    :param filename: Имя файла, в который будет сохранен объект (без расширения).
    """
    # Преобразуем объект в Series, если это numpy.ndarray
    if isinstance(obj, np.ndarray):
        obj = pd.Series(obj)
    
    # Сохраняем в CSV
    obj.to_csv(f"{filename}.csv", index=False)
    print(f"Объект успешно сохранен в {filename}.csv")

# Пример использования:
save_object_to_csv(y_train_preds, 'Предикты для 3й лабы')

Объект успешно сохранен в Предикты для 3й лабы.csv
