In [1]:

import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, StandardScaler, KBinsDiscretizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_fscore_support
from sklearn.base import clone

RANDOM_STATE = 42

train_path = 'mai-ml-lab-2-fiit-2025/train_c.csv'
test_path  = 'mai-ml-lab-2-fiit-2025/test_c.csv'
submission_path = 'submission_lab2.csv'

def safe_log1p_ser(s):
    s = pd.to_numeric(s, errors='coerce').copy()
    if s.isnull().all():
        return s
    mn = s.min(skipna=True)
    if pd.notna(mn) and mn < 0:
        shift = abs(mn) + 1.0
        return np.log1p(s + shift)
    else:
        return np.log1p(s.fillna(0))

if not os.path.exists(train_path):
    raise FileNotFoundError(f"Train file not found at {train_path}")
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path) if os.path.exists(test_path) else None
print("Loaded. Train:", train_df.shape, "Test present:", test_df is not None)

def preprocess_base(df, is_train=True):
    df = df.copy()
    if is_train and 'LoanApproved' in df.columns:
        df = df[df['LoanApproved'].isin([0,1]) | df['LoanApproved'].isna()] if 'LoanApproved' in df.columns else df
    df.replace(-9999999.0, np.nan, inplace=True)
    if 'ApplicationDate' in df.columns:
        df['ApplicationDate'] = pd.to_datetime(df['ApplicationDate'], errors='coerce')
        df['App_Year'] = df['ApplicationDate'].dt.year.fillna(0).astype(int)
        df['App_Month'] = df['ApplicationDate'].dt.month.fillna(0).astype(int)
        df['App_DayOfWeek'] = df['ApplicationDate'].dt.dayofweek.fillna(0).astype(int)
    eps = 1e-6
    if 'MonthlyLoanPayment' in df.columns and 'MonthlyIncome' in df.columns:
        df['PaymentToIncomeRatio'] = df['MonthlyLoanPayment'] / (df['MonthlyIncome'] + eps)
    if 'LoanAmount' in df.columns and 'AnnualIncome' in df.columns:
        df['LoanToIncomeRatio'] = df['LoanAmount'] / (df['AnnualIncome'] + eps)
    if 'TotalLiabilities' in df.columns and 'TotalAssets' in df.columns:
        df['DebtToAssetsRatio'] = df['TotalLiabilities'] / (df['TotalAssets'] + eps)
    if 'SavingsAccountBalance' in df.columns and 'LoanAmount' in df.columns:
        df['SavingsToLoanRatio'] = df['SavingsAccountBalance'] / (df['LoanAmount'] + eps)
    if 'LengthOfCreditHistory' in df.columns and 'PaymentHistory' in df.columns:
        df['CreditHistoryInteraction'] = df['LengthOfCreditHistory'] * df['PaymentHistory']
    if 'MonthlyIncome' in df.columns and 'CreditScore' in df.columns:
        df['Income_x_CreditScore'] = df['MonthlyIncome'].fillna(0) * df['CreditScore'].fillna(0)
    clip_cols = ['MonthlyIncome','LoanAmount','AnnualIncome','SavingsAccountBalance','TotalAssets','TotalLiabilities','MonthlyDebtPayments']
    for c in clip_cols:
        if c in df.columns:
            lo = df[c].quantile(0.01)
            hi = df[c].quantile(0.99)
            df[c] = df[c].clip(lo, hi)
    log_candidates = ['MonthlyIncome','LoanAmount','SavingsAccountBalance','CheckingAccountBalance','TotalAssets','TotalLiabilities','NetWorth','MonthlyDebtPayments']
    for c in log_candidates:
        if c in df.columns:
            df[c] = safe_log1p_ser(df[c])
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    return df

train_proc = preprocess_base(train_df, is_train=True)
test_proc = preprocess_base(test_df, is_train=False) if test_df is not None else None

target_col = 'LoanApproved'
numerical_candidates = [
    'CreditScore','MonthlyIncome','BaseInterestRate','LoanAmount','LoanDuration',
    'DebtToIncomeRatio','NumberOfDependents','NumberOfOpenCreditLines','NumberOfCreditInquiries',
    'PaymentHistory','LengthOfCreditHistory','UtilityBillsPaymentHistory','MonthlyDebtPayments',
    'CreditCardUtilizationRate','InterestRate','TotalDebtToIncomeRatio','SavingsAccountBalance',
    'CheckingAccountBalance','TotalAssets','TotalLiabilities','NetWorth','JobTenure','Experience','Age',
    'BankruptcyHistory','PreviousLoanDefaults','PaymentToIncomeRatio','LoanToIncomeRatio','DebtToAssetsRatio',
    'SavingsToLoanRatio','CreditHistoryInteraction','Income_x_CreditScore','App_Year','App_Month','App_DayOfWeek'
]
numerical = [c for c in numerical_candidates if c in train_proc.columns]
categorical_candidates = ['MaritalStatus','HomeOwnershipStatus','EmploymentStatus','EducationLevel','LoanPurpose']
categorical = [c for c in categorical_candidates if c in train_proc.columns]
binned = [c for c in ['Age','CreditScore'] if c in train_proc.columns]

final_numerical = [c for c in numerical if c not in binned]
all_features = final_numerical + categorical + binned

def drop_high_corr(df, features, thresh=0.99):
    num_df = df[features].select_dtypes(include=[np.number]).copy()
    if num_df.shape[1] < 2:
        return []
    corr = num_df.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = [col for col in upper.columns if any(upper[col] > thresh)]
    return to_drop

to_drop = drop_high_corr(train_proc, final_numerical, thresh=0.995)
if to_drop:
    final_numerical = [f for f in final_numerical if f not in to_drop]
    all_features = final_numerical + categorical + binned

print("Numeric features:", len(final_numerical), "Categorical:", len(categorical), "Binned:", binned)

numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('power', PowerTransformer(method='yeo-johnson', standardize=True)),
    ('scaler', StandardScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

binned_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('bin', KBinsDiscretizer(n_bins=10, encode='onehot-dense', strategy='quantile'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, final_numerical),
    ('cat', categorical_pipeline, categorical),
    ('bin', binned_pipeline, binned)
], remainder='drop')

train_proc = train_proc.dropna(subset=[target_col], how='any')
X = train_proc[all_features].copy()
y = train_proc[target_col].astype(int).copy()

if test_proc is not None:
    for c in all_features:
        if c not in test_proc.columns:
            test_proc[c] = np.nan


clf_lr = LogisticRegression(C=1.0, solver='saga', max_iter=5000, random_state=RANDOM_STATE, n_jobs=-1)
clf_rf = RandomForestClassifier(n_estimators=300, max_depth=10, n_jobs=-1, random_state=RANDOM_STATE)
clf_hgb = HistGradientBoostingClassifier(max_iter=300, random_state=RANDOM_STATE)

estimators = [
    ('lr', clf_lr),
    ('rf', clf_rf),
    ('hgb', clf_hgb)
]
final_meta = LogisticRegression(solver='liblinear', C=1.0, random_state=RANDOM_STATE)

pipe_stack = Pipeline([
    ('preproc', preprocessor),
    ('pre_var', SelectKBest(score_func=f_classif, k=min(800, max(10, len(final_numerical) + 100)))),
    ('stack', StackingClassifier(estimators=estimators, final_estimator=final_meta, n_jobs=-1, passthrough=False))
])

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

print("Starting cross-validated evaluation (5 folds)...")
oof_preds = np.zeros(len(X))
for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), 1):
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]
    pipe = clone(pipe_stack)
    pipe.fit(X_tr, y_tr)
    proba = pipe.predict_proba(X_val)[:,1]
    oof_preds[val_idx] = proba
    auc = roc_auc_score(y_val, proba)
    ap = average_precision_score(y_val, proba)
    print(f"Fold {fold} | ROC-AUC: {auc:.4f} | PR-AUC: {ap:.4f}")

cv_auc = roc_auc_score(y, oof_preds)
cv_pr = average_precision_score(y, oof_preds)
print(f"\nOOF CV ROC-AUC: {cv_auc:.4f}")
print(f"OOF CV PR-AUC: {cv_pr:.4f}")

y_pred_bin = (oof_preds >= 0.5).astype(int)
precision, recall, f1, _ = precision_recall_fscore_support(y, y_pred_bin, average='binary')
print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")

print("\nTraining final model on full train...")
pipe_stack.fit(X, y)

if test_proc is not None:
    X_test = test_proc[all_features].copy()
    preds_test_proba = pipe_stack.predict_proba(X_test)[:,1]
    preds_test_proba = np.clip(preds_test_proba, 0.0, 1.0)
    if 'ID' in test_proc.columns:
        sub = pd.DataFrame({'ID': test_proc['ID'], 'LoanApproved': preds_test_proba})
    else:
        sub = pd.DataFrame({'LoanApproved': preds_test_proba})
    sub.to_csv(submission_path, index=False)
    print("Saved submission to:", submission_path)

print("\nDone. Если ROC-AUC (OOF) < 0.75, рекомендую:")
print("  • увеличить SelectKBest (k), попробовать passthrough=True в StackingClassifier,")
print("  • тонкая настройка RandomForest (max_depth, n_estimators) или замена HGB на LightGBM,")
print("  • добавить дополнительные interaction-признаки или использовать target-encoding для категорий.")



Loaded. Train: (11017, 35) Test present: True
Numeric features: 33 Categorical: 5 Binned: ['Age', 'CreditScore']
Starting cross-validated evaluation (5 folds)...
Fold 1 | ROC-AUC: 0.9828 | PR-AUC: 0.9840
Fold 2 | ROC-AUC: 0.9845 | PR-AUC: 0.9857
Fold 3 | ROC-AUC: 0.9844 | PR-AUC: 0.9858
Fold 4 | ROC-AUC: 0.9846 | PR-AUC: 0.9856
Fold 5 | ROC-AUC: 0.9851 | PR-AUC: 0.9861

OOF CV ROC-AUC: 0.9842
OOF CV PR-AUC: 0.9854
Precision: 0.9349, Recall: 0.9370, F1: 0.9360

Training final model on full train...
Saved submission to: submission_lab2.csv

Done. Если ROC-AUC (OOF) < 0.75, рекомендую:
  • увеличить SelectKBest (k), попробовать passthrough=True в StackingClassifier,
  • тонкая настройка RandomForest (max_depth, n_estimators) или замена HGB на LightGBM,
  • добавить дополнительные interaction-признаки или использовать target-encoding для категорий.
