In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import RobustScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import KFold
from scipy.stats import mstats
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

In [None]:
# Global plotting defaults: seaborn "whitegrid" theme first, then matplotlib
# size overrides (applied after the theme so they are not reset by it).
sns.set(style="whitegrid")
plt.rcParams.update({
    'figure.figsize': (10, 5),
    'axes.titlesize': 14,
    'axes.labelsize': 12,
})

In [None]:
# Load the raw train / test tables from the competition input folder.
train_original, test_original = (
    pd.read_csv(csv_path)
    for csv_path in (
        './kaggle/input/full-lab-for-labs/train.csv',
        './kaggle/input/full-lab-for-labs/test.csv',
    )
)

In [None]:
# Keep the test identifiers for the submission file; when there is no 'ID'
# column the row index is used instead.
if 'ID' in test_original.columns:
    test_ids = test_original['ID']
else:
    test_ids = test_original.index

print(f"train shape->{train_original.shape}")
print(f"test shape->{test_original.shape}")

In [None]:
# First five rows of the raw training table.
display(train_original.head(n=5))

In [None]:
# First five rows of the raw test table.
display(test_original.head(n=5))

In [None]:
# How many training columns there are of each dtype.
column_dtypes = train_original.dtypes
column_dtypes.value_counts()

In [None]:
# Summary statistics, one row per column.
display(train_original.describe().transpose())

In [None]:
# Summary statistics of the target, when the target column is present.
if 'RiskScore' in train_original.columns:
    target_summary = train_original['RiskScore'].describe()
    display(target_summary)

In [None]:
# Names of all numeric columns in the raw training table.
num_cols_raw = list(train_original.select_dtypes(include=[np.number]).columns)

In [None]:
# Target distribution: histogram with KDE on the left, box plot on the right.
if 'RiskScore' in num_cols_raw:
    fig, ax = plt.subplots(1, 2, figsize=(14, 5))
    hist_ax, box_ax = ax

    sns.histplot(train_original['RiskScore'], bins=40, kde=True, ax=hist_ax)
    hist_ax.set(title="–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ RiskScore", xlabel="RiskScore")

    sns.boxplot(x=train_original['RiskScore'], ax=box_ax)
    box_ax.set(title="Boxplot RiskScore", xlabel="RiskScore")

    plt.tight_layout()
    plt.show()

In [None]:
# Histograms (with KDE) for a hand-picked set of features; names that are not
# numeric columns of the training table are skipped.
interesting_feats = ['CreditScore', 'MonthlyIncome', 'LoanAmount', 'DebtToIncomeRatio']
for col in interesting_feats:
    if col not in num_cols_raw:
        continue
    plt.figure()
    sns.histplot(train_original[col], bins=40, kde=True)
    plt.title(f"–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ {col}")
    plt.xlabel(col)
    plt.ylabel("count")
    plt.show()

In [None]:

# Share of missing values per column, largest first, for train and test.
missing_train, missing_test = (
    frame.isnull().mean().sort_values(ascending=False)
    for frame in (train_original, test_original)
)

In [None]:
# The 20 training columns with the largest share of missing values.
display(missing_train.iloc[:20])

In [None]:
# The 20 test columns with the largest share of missing values.
display(missing_test.iloc[:20])

In [None]:
# Bar chart of the training columns that actually have missing values
# (at most 20, taken in the order of missing_train).
top_miss = missing_train.loc[missing_train.gt(0)].head(20)
if not top_miss.empty:
    plt.figure(figsize=(10, 6))
    sns.barplot(x=top_miss.values, y=top_miss.index)
    plt.title("To–ø –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –ø–æ –¥–æ–ª–µ –ø—Ä–æ–ø—É—Å–∫–æ–≤ (train)")
    plt.xlabel("–î–æ–ª—è –ø—Ä–æ–ø—É—Å–∫–æ–≤")
    plt.ylabel("–ü—Ä–∏–∑–Ω–∞–∫")
    plt.tight_layout()
    plt.show()

In [None]:

# Correlation of every numeric column with the target, followed by a heatmap
# of the features most strongly correlated with it.
if 'RiskScore' in train_original.columns:
    num_cols_for_corr = train_original.select_dtypes(include=[np.number]).columns.tolist()
    # Columns that are entirely missing cannot be correlated; leave them out.
    num_cols_for_corr = [c for c in num_cols_for_corr if train_original[c].notnull().any()]

    corr_matrix = train_original[num_cols_for_corr].corr()
    corr_target = corr_matrix['RiskScore'].sort_values(ascending=False)
    display(corr_target.head(15))
    display(corr_target.tail(15))

    # Rank by absolute correlation, excluding the target itself: its
    # self-correlation of 1.0 would otherwise take a top-15 slot and, because
    # the target is appended below, show up twice in the heatmap.
    feature_corr = corr_target.drop('RiskScore').dropna()
    top_corr_feats = (
        feature_corr.abs().sort_values(ascending=False).head(15).index.tolist()
    )
    heatmap_cols = top_corr_feats + ['RiskScore']

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        # DataFrame.corr is pairwise, so a sub-block of the full matrix equals
        # the correlation matrix of the selected columns.
        corr_matrix.loc[heatmap_cols, heatmap_cols],
        annot=False,
        cmap="coolwarm",
        vmin=-1, vmax=1
    )
    plt.title("–ö–æ—Ä—Ä–µ–ª—è—Ü–∏–∏ —Ç–æ–ø-15 –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ —Å —Ä–∏—Å–∫—Å–∫–æ—Ä–µ")
    plt.tight_layout()
    plt.show()

In [None]:
# Names of all object-dtype (categorical / text) columns in the raw training table.
cat_cols_raw = list(train_original.select_dtypes(include=['object']).columns)

In [None]:
# Show up to the first 15 categorical column names.
cat_cols_raw[:15]

In [None]:
# For the first three categorical columns: print the category shares and,
# when the target is available, draw the target distribution per category.
for col in cat_cols_raw[:3]:
    print(f"\n–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –∫–∞—Ç–µ–≥–æ—Ä–∏–π –≤ {col}:")
    category_shares = train_original[col].value_counts(normalize=True)
    display(category_shares.head(10))

    if 'RiskScore' not in train_original.columns:
        continue

    plt.figure(figsize=(10, 5))
    sns.boxplot(data=train_original, x=col, y='RiskScore')
    plt.title(f"RiskScore –ø–æ –∫–∞—Ç–µ–≥–æ—Ä–∏—è–º {col}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Work on copies so the raw frames stay available unchanged.
train = train_original.copy()
test = test_original.copy()

# Count the missing values in each training row and discard rows with more than 3.
na_per_row = train.isnull().sum(axis=1)
train = train.loc[na_per_row <= 3].copy()

In [None]:
# Fill numeric gaps with the median computed over train + test combined.
num_cols = train.select_dtypes(include=[np.number]).columns.tolist()
for col in num_cols:
    if col not in ['RiskScore', 'ID'] and col in test.columns:
        combined = pd.concat([train[col], test[col]], ignore_index=True)
        global_median = combined.median()
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on `frame[col]`: that form is chained assignment, which warns in
        # pandas 2.x and silently leaves the frame unchanged under Copy-on-Write.
        train[col] = train[col].fillna(global_median)
        test[col] = test[col].fillna(global_median)

# Keep only rows whose target is present and lies within [0, 150].
train = train[~train['RiskScore'].isnull()]
valid = (train['RiskScore'] >= 0) & (train['RiskScore'] <= 150)
# .copy() detaches the result from the filtered frame so later column
# assignments operate on an independent DataFrame.
train = train[valid].copy()

In [None]:
# IQR-style filter to remove very unusual target values. The "quartiles" used
# here are the 3% and 97% quantiles, and the lower bound is floored at 0.
Q1, Q3 = train['RiskScore'].quantile([0.03, 0.97])
IQR = Q3 - Q1
lower_bound = max(Q1 - 1.5 * IQR, 0)
upper_bound = Q3 + 1.5 * IQR
train = train[train['RiskScore'].between(lower_bound, upper_bound)]
print(len(train))

In [None]:
# Trim outliers in the key financial columns.
outlier_cols = ['LoanAmount', 'TotalAssets', 'TotalLiabilities', 'NetWorth','SavingsAccountBalance', 'CheckingAccountBalance', 'MonthlyDebtPayments']

# `train` is the result of boolean filtering; take an explicit copy before
# writing columns into it.
train = train.copy()

for col in outlier_cols:
    if col in train.columns:
        train_median = train[col].median()
        # Winsorize 2% in each tail of the training column. winsorize returns
        # a masked array, so convert it to a plain ndarray before assigning.
        train[col] = np.asarray(
            mstats.winsorize(train[col].fillna(train_median).to_numpy(), limits=[0.02, 0.02])
        )
        if col in test.columns:
            # Cap test with the bounds learned on train, so both frames get
            # the same transformation; winsorizing test on its own quantiles
            # would apply different caps to train and test.
            lower, upper = train[col].min(), train[col].max()
            test[col] = test[col].fillna(train_median).clip(lower=lower, upper=upper)
    elif col in test.columns:
        # No training reference for this column: fall back to winsorizing the
        # test column on its own.
        test[col] = np.asarray(
            mstats.winsorize(test[col].fillna(test[col].median()).to_numpy(), limits=[0.02, 0.02])
        )

y_train = train['RiskScore'].copy()

# Columns to be removed from the feature set (treated as leaks).
LEAKS = ['BaseInterestRate', 'InterestRate']

In [None]:

def create_domain_features(df):
    """Return a copy of ``df`` extended with ratio / interaction features.

    Each feature is added only when all of its source columns are present;
    the input frame is never modified. Denominators are shifted by 1 to
    avoid division by zero.
    """
    out = df.copy()

    # (new column, required source columns, formula) in insertion order.
    recipes = [
        ('DebtServiceRatio', ('MonthlyDebtPayments', 'MonthlyIncome'),
         lambda d: d['MonthlyDebtPayments'] / (d['MonthlyIncome'] + 1)),
        ('PaymentToIncome', ('MonthlyLoanPayment', 'MonthlyIncome'),
         lambda d: d['MonthlyLoanPayment'] / (d['MonthlyIncome'] + 1)),
        ('AssetCoverageRatio', ('TotalAssets', 'LoanAmount'),
         lambda d: d['TotalAssets'] / (d['LoanAmount'] + 1)),
        ('SavingsRate', ('SavingsAccountBalance', 'MonthlyIncome'),
         lambda d: d['SavingsAccountBalance'] / (d['MonthlyIncome'] * 12 + 1)),
        ('NetWorthToIncome', ('NetWorth', 'MonthlyIncome'),
         lambda d: d['NetWorth'] / (d['MonthlyIncome'] * 12 + 1)),
        ('CreditUtilScore', ('CreditCardUtilizationRate', 'NumberOfOpenCreditLines'),
         lambda d: d['CreditCardUtilizationRate'] * d['NumberOfOpenCreditLines']),
        ('DebtToAssetRatio', ('TotalLiabilities', 'TotalAssets'),
         lambda d: d['TotalLiabilities'] / (d['TotalAssets'] + 1)),
        ('LoanDurationToAge', ('LoanDuration', 'Age'),
         lambda d: d['LoanDuration'] / (d['Age'] + 1)),
        ('ExperienceToAge', ('Experience', 'Age'),
         lambda d: d['Experience'] / (d['Age'] + 1)),
    ]

    for feature_name, required, formula in recipes:
        if all(source in out.columns for source in required):
            out[feature_name] = formula(out)

    return out

# Add the domain features to both frames.
train, test = (create_domain_features(frame) for frame in (train, test))

print("–ß–∞—Å—Ç—å —Å –¥–æ–º–µ–Ω–∞–º–∏ –∑–∞–≤–µ—Ä—à–µ–Ω–∞")

In [None]:
# Preprocessing
def prep(df, leaks=None):
    """Return a model-ready copy of ``df``.

    Steps, in order:
      1. drop 'ID', the leak columns, and 'AnnualIncome' when 'MonthlyIncome'
         is also present;
      2. expand 'ApplicationDate' into month / quarter / days-since-epoch and
         sin/cos month features, then drop the raw date;
      3. fill numeric gaps (except 'RiskScore') with the column median and
         categorical gaps with the most frequent value ('Unk' when no mode);
      4. one-hot encode object columns with ``drop_first=True``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    leaks : iterable of str, optional
        Column names to drop as leaks. Defaults to the module-level ``LEAKS``.

    Returns
    -------
    pandas.DataFrame
    """
    df = df.copy()
    leak_cols = LEAKS if leaks is None else list(leaks)

    # The ID is not a feature; leak columns must not reach the model.
    to_drop = ['ID', *leak_cols]
    if 'AnnualIncome' in df.columns and 'MonthlyIncome' in df.columns:
        to_drop.append('AnnualIncome')
    df = df.drop(columns=[c for c in to_drop if c in df.columns])

    # Break the application date into features.
    if 'ApplicationDate' in df.columns:
        df['ApplicationDate'] = pd.to_datetime(df['ApplicationDate'], errors='coerce')
        df['AppMonth'] = df['ApplicationDate'].dt.month
        df['AppQuarter'] = df['ApplicationDate'].dt.quarter
        df['AppDaysSinceEpoch'] = (df['ApplicationDate'] - pd.Timestamp('1970-01-01')).dt.days
        df['AppMonth_sin'] = np.sin(2 * np.pi * df['AppMonth'] / 12)
        df['AppMonth_cos'] = np.cos(2 * np.pi * df['AppMonth'] / 12)
        df = df.drop('ApplicationDate', axis=1)

    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = df.select_dtypes(include=['object']).columns.tolist()

    if 'RiskScore' in num_cols:
        num_cols.remove('RiskScore')

    # Assign the filled column back rather than using fillna(inplace=True) on
    # df[col]: the inplace form is chained assignment, which warns in
    # pandas 2.x and does nothing to `df` under Copy-on-Write.
    for col in num_cols:
        if df[col].isnull().any():
            df[col] = df[col].fillna(df[col].median())

    # Fill categorical gaps with the most frequent value.
    for col in cat_cols:
        if df[col].isnull().any():
            modes = df[col].mode()
            df[col] = df[col].fillna(modes[0] if len(modes) > 0 else 'Unk')

    # NOTE(review): when this is called separately on frames with different
    # category sets, drop_first may drop a different baseline level in each
    # frame - confirm the categories match between the frames being encoded.
    df = pd.get_dummies(df, columns=cat_cols, drop_first=True)
    return df

In [None]:
# Preprocess both frames, separate the target, and give train and test the
# same set of feature columns in the same (sorted) order.
train_p, test_p = prep(train), prep(test)

if 'RiskScore' in train_p.columns:
    X_train = train_p.drop('RiskScore', axis=1)
else:
    X_train = train_p
X_test = test_p

# A column present in only one frame is added to the other as zeros.
all_cols = sorted(set(X_train.columns).union(X_test.columns))
for frame in (X_train, X_test):
    for col in all_cols:
        if col not in frame.columns:
            frame[col] = 0

X_train = X_train[all_cols]
X_test = X_test[all_cols]

print(X_train.shape[1])

In [None]:
# Non-linear transforms of TotalDebtToIncomeRatio and interactions between the
# main features. Every new column is added to train and test alike.
feature_frames = (X_train, X_test)

if 'TotalDebtToIncomeRatio' in X_train.columns:
    ratio_transforms = {
        'log': lambda x: np.log1p(x),
        'sqrt': lambda x: np.sqrt(x),
        'sq': lambda x: x ** 2,
        'cube': lambda x: x ** 3,
        'pow_0.5': lambda x: x ** 0.5,
        'pow_1.5': lambda x: x ** 1.5,
        # +0.01 keeps the reciprocal finite at zero
        'reciprocal': lambda x: 1 / (x + 0.01),
    }
    for transform_name, transform_func in ratio_transforms.items():
        for frame in feature_frames:
            frame[f'TotalDebt_{transform_name}'] = transform_func(frame['TotalDebtToIncomeRatio'])

# Interactions between the important features; availability is decided on
# X_train's columns.
has_credit = 'CreditScore' in X_train.columns
has_income = 'MonthlyIncome' in X_train.columns
has_total_debt = 'TotalDebtToIncomeRatio' in X_train.columns

for frame in feature_frames:
    if has_credit and has_income:
        frame['Credit_X_Income'] = frame['CreditScore'] * frame['MonthlyIncome']
        frame['Credit_X_Income_log'] = np.log1p(frame['Credit_X_Income'])

    if has_credit and has_total_debt:
        frame['Credit_X_TotalDebt'] = frame['CreditScore'] * frame['TotalDebtToIncomeRatio']

    if has_income and has_credit and has_total_debt:
        # divided by 1e9 to keep the triple product at a manageable scale
        frame['Triple_Interaction'] = (
            frame['MonthlyIncome'] *
            frame['CreditScore'] *
            frame['TotalDebtToIncomeRatio'] / 1e9
        )



In [None]:
def fe(X):
    """Return a copy of ``X`` with log / sqrt / square (and some cube) features.

    For each listed base column that is present, adds ``<col>_log`` and
    ``<col>_sqrt`` (both computed on the column clipped at 0 from below) and
    ``<col>_sq``. ``<col>_cube`` is added for CreditScore and MonthlyIncome.
    The input frame is not modified.
    """
    out = X.copy()

    base_cols = ('CreditScore', 'MonthlyIncome', 'LoanAmount',
                 'DebtToIncomeRatio', 'Age', 'TotalAssets')
    for name in base_cols:
        if name not in out.columns:
            continue
        non_negative = out[name].clip(lower=0)
        out[f'{name}_log'] = np.log1p(non_negative)
        out[f'{name}_sqrt'] = np.sqrt(non_negative)
        out[f'{name}_sq'] = out[name] ** 2

    for name in ('CreditScore', 'MonthlyIncome'):
        if name in out.columns:
            out[f'{name}_cube'] = out[name] ** 3

    return out

In [None]:
# Add the power / log features, then make sure no inf or NaN reaches the model.
X_train = fe(X_train)
X_test = fe(X_test)
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Impute with the TRAIN median. A column is processed when either frame has
# gaps, so NaNs that occur only in test are filled too. Results are assigned
# back because fillna(inplace=True) on `frame[col]` is chained assignment: it
# warns in pandas 2.x and has no effect under Copy-on-Write.
for col in X_train.columns:
    if X_train[col].isnull().any() or X_test[col].isnull().any():
        med = X_train[col].median()
        if pd.isna(med):
            # Column is entirely missing in train: fall back to 0 so the
            # downstream estimators never see NaN.
            med = 0.0
        X_train[col] = X_train[col].fillna(med)
        X_test[col] = X_test[col].fillna(med)

In [None]:
# Number of feature columns after feature engineering.
len(X_train.columns)

In [None]:
# Degree-3 polynomial expansion (no bias column) of the three main features;
# names missing from a frame are skipped.
TOP3 = ['CreditScore', 'MonthlyIncome', 'TotalDebtToIncomeRatio']
top3_in_train = [name for name in TOP3 if name in X_train.columns]
top3_in_test = [name for name in TOP3 if name in X_test.columns]
X_top3_tr = X_train[top3_in_train]
X_top3_te = X_test[top3_in_test]

poly3 = PolynomialFeatures(degree=3, include_bias=False)
poly3.fit(X_top3_tr)
X_poly3_tr = poly3.transform(X_top3_tr)
X_poly3_te = poly3.transform(X_top3_te)

In [None]:
# Degree-2 polynomial expansion (no bias column) of six secondary features;
# names missing from a frame are skipped.
MID6 = ['DebtToIncomeRatio', 'LoanAmount', 'Age', 'MonthlyLoanPayment', 'TotalAssets', 'LengthOfCreditHistory']
mid6_in_train = [name for name in MID6 if name in X_train.columns]
mid6_in_test = [name for name in MID6 if name in X_test.columns]
X_mid6_tr = X_train[mid6_in_train]
X_mid6_te = X_test[mid6_in_test]

poly2 = PolynomialFeatures(degree=2, include_bias=False)
poly2.fit(X_mid6_tr)
X_poly2_tr = poly2.transform(X_mid6_tr)
X_poly2_te = poly2.transform(X_mid6_te)

In [None]:
# Number of MID6 columns actually found in the training features.
len(X_mid6_tr.columns)

In [None]:
# Number of columns produced by the degree-2 expansion.
X_poly2_tr.shape[-1]

In [None]:
# Final design matrix: poly3 block, poly2 block, then every remaining column
# that was not fed into either polynomial expansion.
all_processed = TOP3 + MID6
X_rest_tr = X_train.loc[:, ~X_train.columns.isin(all_processed)].to_numpy()
X_rest_te = X_test.loc[:, ~X_test.columns.isin(all_processed)].to_numpy()
X_full_tr = np.concatenate([X_poly3_tr, X_poly2_tr, X_rest_tr], axis=1)
X_full_te = np.concatenate([X_poly3_te, X_poly2_te, X_rest_te], axis=1)

In [None]:
# Total number of columns in the assembled design matrix.
X_full_tr.shape[-1]

In [None]:
# Scale with statistics fitted on the training matrix, then apply to both.
scaler = RobustScaler().fit(X_full_tr)
X_scaled_tr = scaler.transform(X_full_tr)
X_scaled_te = scaler.transform(X_full_te)

# Ridge regularisation strengths to compare, and the CV splitter.
alphas_to_test = [0.0001, 0.00025, 0.0005, 0.00075, 0.001]
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Best alpha and its mean CV MSE found so far.
best_alpha, best_cv_score = None, float('inf')

In [None]:
# 5-fold CV over the alpha grid; keep the alpha with the lowest mean MSE.
for alpha in alphas_to_test:
    cv_scores = []

    for train_idx, val_idx in kf.split(X_scaled_tr):
        model = Ridge(alpha=alpha)
        model.fit(X_scaled_tr[train_idx], y_train.iloc[train_idx])
        fold_pred = model.predict(X_scaled_tr[val_idx])
        cv_scores.append(mean_squared_error(y_train.iloc[val_idx], fold_pred))

    avg_cv_score = np.mean(cv_scores)
    std_cv_score = np.std(cv_scores)

    # The marker flags an alpha that beats the best score seen so far.
    is_new_best = avg_cv_score < best_cv_score
    marker = "üî•" if is_new_best else ""
    print(f"alpha={alpha:8.5f}:CV_MSE = {avg_cv_score:.4f} ¬± {std_cv_score:.4f} {marker}")

    if is_new_best:
        best_cv_score = avg_cv_score
        best_alpha = alpha

In [None]:
# Refit on the full training set with the selected alpha and predict.
model_final = Ridge(alpha=best_alpha)
model_final.fit(X_scaled_tr, y_train)

pred_train, pred_test = (
    model_final.predict(features) for features in (X_scaled_tr, X_scaled_te)
)

# In-sample metrics (computed on the data the model was fitted on).
train_mse = mean_squared_error(y_train, pred_train)
train_r2 = r2_score(y_train, pred_train)
train_mae = mean_absolute_error(y_train, pred_train)

In [None]:
# Training MSE, R^2 and MAE.
train_metrics = (train_mse, train_r2, train_mae)
print(*train_metrics)

In [None]:
# Training residuals; rows whose absolute error exceeds the 95th percentile
# are flagged as high-error.
residuals = y_train - pred_train
abs_residuals = residuals.abs()

high_error_threshold = abs_residuals.quantile(0.95)
high_error_mask = abs_residuals.gt(high_error_threshold)

In [None]:
# Number of training rows flagged as high-error.
high_error_mask.sum()

In [None]:
# Absolute-residual cut-off (95th percentile) used for the high-error mask.
high_error_threshold

In [None]:
# Target values of the high-error rows. Defined unconditionally (an empty
# Series when nothing is flagged) so the cells that read `problem_risks`
# cannot hit a NameError.
problem_risks = y_train[high_error_mask]

In [None]:
# Mean target value among the high-error rows.
problem_risks.mean()

In [None]:
# Range of target values among the high-error rows.
risk_lo, risk_hi = problem_risks.min(), problem_risks.max()
print("lol", risk_lo, risk_hi)

In [None]:
# Clip the test predictions to [0, 120] and write the submission file.
y_final = pred_test.clip(0, 120)
submission = pd.DataFrame({'ID': test_ids, 'RiskScore': y_final})

submission.to_csv('submission.csv', index=False)

In [None]:
# Summary: feature count, chosen alpha, its CV MSE, training MSE and R^2,
# and max(train_mse - 0.70, 22).
summary_values = (
    X_full_tr.shape[1],
    best_alpha,
    best_cv_score,
    train_mse,
    train_r2,
    max(train_mse - 0.70, 22),
)
print(*summary_values)

In [None]:
# CHILL on KAGGLE <25 :)))