In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder, PolynomialFeatures
from sklearn.metrics import mean_squared_error


In [3]:
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")
example = pd.read_csv("/content/ex.csv")

In [4]:
print("RiskScore in train:", 'RiskScore' in train.columns)
print("RiskScore in test:", 'RiskScore' in test.columns)
print("\nПропуски в train:")
print(train.isna().sum().sort_values(ascending=False).head(10))

RiskScore in train: True
RiskScore in test: False

Пропуски в train:
MonthlyDebtPayments       1031
LoanAmount                1031
CreditScore               1031
BaseInterestRate          1031
NetWorth                  1031
TotalAssets               1031
BankruptcyHistory         1031
LoanPurpose               1031
CheckingAccountBalance    1031
ApplicationDate            530
dtype: int64


In [5]:
train_clean = train.drop_duplicates()

train_clean = train_clean[
    (train_clean["RiskScore"] >= 0) &
    (train_clean["RiskScore"] <= 100)
]


y = train_clean["RiskScore"].values


train_features = train_clean.drop(columns=["RiskScore"])
test_features = test.copy()


full = pd.concat([train_features, test_features], axis=0, ignore_index=True)

In [6]:
if "ApplicationDate" in full.columns:
    full["ApplicationDate"] = pd.to_datetime(full["ApplicationDate"], errors="coerce")

    median_ts = full["ApplicationDate"].dropna().astype("int64").median()
    median_date = pd.to_datetime(median_ts)

    full["ApplicationDate"] = full["ApplicationDate"].fillna(median_date)

    full["ApplicationMonth"] = full["ApplicationDate"].dt.month
    full["ApplicationWeekday"] = full["ApplicationDate"].dt.weekday
    first_date = full["ApplicationDate"].min()
    full["DaysSinceStart"] = (full["ApplicationDate"] - first_date).dt.days

    full = full.drop(columns=["ApplicationDate"])

In [7]:
from sklearn.impute import KNNImputer
import numpy as np
import pandas as pd

num_cols = full.select_dtypes(include=["int64", "float64"]).columns.tolist()
cat_cols = full.select_dtypes(include=["object", "category"]).columns.tolist()


cat_maps = {}
full_cat_codes = pd.DataFrame(index=full.index)

for col in cat_cols:
    codes, uniques = pd.factorize(full[col], sort=True)
    full_cat_codes[col] = codes.astype(float)

knn_input = pd.concat([full[num_cols].copy(), full_cat_codes], axis=1)

knn = KNNImputer(n_neighbors=5, weights="distance")
knn_imputed = knn.fit_transform(knn_input)
knn_imputed = pd.DataFrame(knn_imputed, columns=knn_input.columns, index=full.index)


full[num_cols] = knn_imputed[num_cols].values


for col, uniques in cat_maps.items():
    if col not in knn_imputed.columns:
        continue

    codes_float = knn_imputed[col].values

    if len(uniques) == 0:
        full[col] = pd.Series([np.nan] * len(full), index=full.index, dtype="category")
        continue


    codes_round = np.rint(codes_float).astype(int)
    codes_round = np.clip(codes_round, -1, len(uniques) - 1)

    restored = pd.Series(index=full.index, dtype="object")
    mask_nan = codes_round == -1
    valid_mask = ~mask_nan
    if valid_mask.any():
        restored.loc[valid_mask] = uniques.take(codes_round[valid_mask]).astype(object).values
    restored.loc[mask_nan] = np.nan

    full[col] = restored.astype("category")



if "EducationLevel" in full.columns:
    edu_order = ["High School", "Associate", "Bachelor", "Master", "Doctorate"]
    full["EducationLevel"] = pd.Categorical(full["EducationLevel"], categories=edu_order, ordered=True)


cat_cols_after = full.select_dtypes(include=["object", "category"]).columns.tolist()
full = pd.get_dummies(full, columns=cat_cols_after, drop_first=True)

print("Форма full после KNN-импутации числовых и категориальных + OHE:", full.shape)


Форма full после KNN-импутации числовых и категориальных + OHE: (14788, 48)


In [8]:
n_train = train_clean.shape[0]
X_full_train = full.iloc[:n_train].reset_index(drop=True)
X_full_test = full.iloc[n_train:].reset_index(drop=True)

In [9]:
num_cols_all = X_full_train.select_dtypes(include=["int64", "float64"]).columns.tolist()

In [10]:
skew_vals = X_full_train[num_cols_all].skew().sort_values(ascending=False)
print("Топ-10 самых скошенных признаков:")
print(skew_vals.head(10))

Топ-10 самых скошенных признаков:
TotalLiabilities          18.213429
NetWorth                  10.927231
TotalAssets               10.582202
CheckingAccountBalance     9.249012
SavingsAccountBalance      6.657517
TotalDebtToIncomeRatio     6.013178
MonthlyLoanPayment         5.228303
BankruptcyHistory          4.074504
MonthlyDebtPayments        3.567820
LoanAmount                 3.366255
dtype: float64


In [11]:
skewed_cols = [
    col for col in num_cols_all
    if (skew_vals[col] > 1.0) and (X_full_train[col].min() >= 0)
]

print("\nБудем логировать признаки:")
print(skewed_cols)

for col in skewed_cols:
    X_full_train[col] = np.log1p(X_full_train[col])
    X_full_test[col] = np.log1p(X_full_test[col])


Будем логировать признаки:
['LoanAmount', 'MonthlyDebtPayments', 'BankruptcyHistory', 'PreviousLoanDefaults', 'SavingsAccountBalance', 'CheckingAccountBalance', 'TotalAssets', 'TotalLiabilities', 'NetWorth', 'MonthlyLoanPayment', 'TotalDebtToIncomeRatio']


In [12]:
id_like_cols = [c for c in X_full_train.columns if "id" in c.lower()]
print("\nID-like признаки:", id_like_cols)

X_full_train = X_full_train.drop(columns=id_like_cols)
X_full_test = X_full_test.drop(columns=id_like_cols)

print("Форма X_full_train:", X_full_train.shape)
print("Форма X_full_test:", X_full_test.shape)


ID-like признаки: ['ID', 'MaritalStatus_Widowed', 'LoanPurpose_Debt Consolidation']
Форма X_full_train: (9788, 45)
Форма X_full_test: (5000, 45)


In [13]:
num_cols_corr = X_full_train.select_dtypes(include=["int64", "float64"]).columns.tolist()

df_corr = X_full_train[num_cols_corr].copy()
df_corr["RiskScore"] = y

corr_series = df_corr.corr()["RiskScore"].abs().sort_values(ascending=False)
print("\nТоп-15 по |corr| с RiskScore:")
print(corr_series.head(15))

strong_num_cols = [c for c in corr_series.index if c != "RiskScore"][:10]
print("\nСильные числовые признаки для poly:")
print(strong_num_cols)


Топ-15 по |corr| с RiskScore:
RiskScore                 1.000000
CreditScore               0.782926
MonthlyIncome             0.769977
AnnualIncome              0.757883
BaseInterestRate          0.757219
InterestRate              0.743754
TotalDebtToIncomeRatio    0.712637
MonthlyLoanPayment        0.198644
BankruptcyHistory         0.174037
NetWorth                  0.167577
TotalAssets               0.165236
DebtToIncomeRatio         0.110157
LoanAmount                0.075790
LengthOfCreditHistory     0.060102
PreviousLoanDefaults      0.054791
Name: RiskScore, dtype: float64

Сильные числовые признаки для poly:
['CreditScore', 'MonthlyIncome', 'AnnualIncome', 'BaseInterestRate', 'InterestRate', 'TotalDebtToIncomeRatio', 'MonthlyLoanPayment', 'BankruptcyHistory', 'NetWorth', 'TotalAssets']


In [14]:
scaler_strong = StandardScaler()
X_train_strong_scaled = scaler_strong.fit_transform(X_full_train[strong_num_cols])
X_test_strong_scaled = scaler_strong.transform(X_full_test[strong_num_cols])


poly = PolynomialFeatures(
    degree=4,
    include_bias=False,
    interaction_only=False
)

X_train_poly_strong = poly.fit_transform(X_train_strong_scaled)
X_test_poly_strong = poly.transform(X_test_strong_scaled)

print(f"Создано {X_train_poly_strong.shape[1]} полиномиальных признаков степени 4")


X_train_rest = X_full_train.drop(columns=strong_num_cols)
X_test_rest = X_full_test.drop(columns=strong_num_cols)


X_train_final = np.hstack([X_train_poly_strong, X_train_rest.values])
X_test_final = np.hstack([X_test_poly_strong, X_test_rest.values])

print("\nФорма X_train_final:", X_train_final.shape)
print("Форма X_test_final:", X_test_final.shape)


scaler_all = StandardScaler()
X_train_scaled = scaler_all.fit_transform(X_train_final)
X_test_scaled = scaler_all.transform(X_test_final)

print("Форма X_train_scaled:", X_train_scaled.shape)
print("Форма X_test_scaled:", X_test_scaled.shape)

Создано 1000 полиномиальных признаков степени 4

Форма X_train_final: (9788, 1035)
Форма X_test_final: (5000, 1035)
Форма X_train_scaled: (9788, 1035)
Форма X_test_scaled: (5000, 1035)


In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np

scaler_all = StandardScaler()
X_train_scaled = scaler_all.fit_transform(X_train_final)
X_test_scaled = scaler_all.transform(X_test_final)

print("Форма X_train_scaled:", X_train_scaled.shape)
print("Форма X_test_scaled:", X_test_scaled.shape)


print("=== ПОДБОР ПАРАМЕТРОВ ДЛЯ RIDGE С POLY-4 ===")

ridge_alphas = [0.001, 0.01, 0.1, 0.5, 1, 2, 5, 10, 20, 50, 100, 125, 150, 200, 500, 1000]

kf = KFold(n_splits=5, shuffle=True, random_state=42)

best_mse = float("inf")
best_alpha = None

print("--- Ridge поиск ---")
for alpha in ridge_alphas:
    cv_mses = []
    for train_idx, val_idx in kf.split(X_train_scaled):
        model = Ridge(
            alpha=alpha,
            random_state=42,
            max_iter=10000
        )
        model.fit(X_train_scaled[train_idx], y[train_idx])
        pred = model.predict(X_train_scaled[val_idx])
        cv_mses.append(mean_squared_error(y[val_idx], pred))

    mean_mse = np.mean(cv_mses)
    print(f"Ridge: alpha={alpha:<6} CV MSE={mean_mse:.4f}")

    if mean_mse < best_mse:
        best_mse = mean_mse
        best_alpha = alpha

print(f"\nЛучший Ridge: alpha={best_alpha}, MSE={best_mse:.4f}")

final_model = Ridge(
    alpha=best_alpha,
    random_state=42,
    max_iter=10000
)

final_model.fit(X_train_scaled, y)

# Предсказание на тренировочных данных для проверки
train_preds = final_model.predict(X_train_scaled)
train_mse = mean_squared_error(y, train_preds)
print(f"MSE на тренировочных данных: {train_mse:.4f}")

# Анализ качества модели
residuals = y - train_preds
print(f"Средняя ошибка: {np.mean(residuals):.4f}")
print(f"Стандартное отклонение ошибок: {np.std(residuals):.4f}")
print(f"Min/Max ошибка: {np.min(residuals):.4f} / {np.max(residuals):.4f}")

test_preds = final_model.predict(X_test_scaled)


test_preds = np.clip(test_preds, 0, 100)

submission = example.copy()
submission["RiskScore"] = test_preds
submission.to_csv("submission_poly4_ridge.csv", index=False)
print("\nФайл submission_poly4_ridge.csv сохранён.")

Форма X_train_scaled: (9788, 1035)
Форма X_test_scaled: (5000, 1035)
=== ПОДБОР ПАРАМЕТРОВ ДЛЯ RIDGE С POLY-4 ===
--- Ridge поиск ---
Ridge: alpha=0.001  CV MSE=262.6145
Ridge: alpha=0.01   CV MSE=168.0495
Ridge: alpha=0.1    CV MSE=59.4260
Ridge: alpha=0.5    CV MSE=36.2536
Ridge: alpha=1      CV MSE=32.8937
Ridge: alpha=2      CV MSE=30.8650
Ridge: alpha=5      CV MSE=29.2671
Ridge: alpha=10     CV MSE=28.5019
Ridge: alpha=20     CV MSE=27.9811
Ridge: alpha=50     CV MSE=27.6145
Ridge: alpha=100    CV MSE=27.6001
Ridge: alpha=125    CV MSE=27.6567
Ridge: alpha=150    CV MSE=27.7301
Ridge: alpha=200    CV MSE=27.9032
Ridge: alpha=500    CV MSE=29.0764
Ridge: alpha=1000   CV MSE=30.9108

Лучший Ridge: alpha=100, MSE=27.6001
MSE на тренировочных данных: 24.4708
Средняя ошибка: 0.0000
Стандартное отклонение ошибок: 4.9468
Min/Max ошибка: -26.4886 / 41.8445

Файл submission_poly4_ridge.csv сохранён.
