##SVM

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
import warnings


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

train = pd.read_csv('train_Mortgage.csv')
val = pd.read_csv('val_Mortgage.csv')

X_train = train.drop('Mortgage', axis=1)
y_train = train['Mortgage']
X_val = val.drop('Mortgage', axis=1)
y_val = val['Mortgage']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

best_params = {
    'C': 1.736433500314599,
    'gamma': 0.002869615526354882,
    'class_weight': 'balanced',
    'kernel': 'rbf',
    'probability': True,
    'random_state': 42
}

model = SVC(**best_params)
cv_scores = cross_val_score(model, X_train_scaled, y_train,
                          cv=5, scoring='roc_auc', n_jobs=-1)

print(f"Cross-Validation AUC: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

final_model = SVC(**best_params)
final_model.fit(X_train_scaled, y_train)

y_proba = final_model.predict_proba(X_val_scaled)[:, 1]
val_auc = roc_auc_score(y_val, y_proba)

print(f"\nValidation AUC: {val_auc:.4f}")
print("\nModel Parameters:")
for param, value in best_params.items():
    print(f"{param:>15}: {value}")

Cross-Validation AUC: 0.9447 (±0.0057)

Validation AUC: 0.9382

Model Parameters:
              C: 1.736433500314599
          gamma: 0.002869615526354882
   class_weight: balanced
         kernel: rbf
    probability: True
   random_state: 42


In [None]:
train = pd.read_csv('train_Pension.csv')
val = pd.read_csv('val_Pension.csv')

X_train = train.drop('Pension', axis=1)
y_train = train['Pension']
X_val = val.drop('Pension', axis=1)
y_val = val['Pension']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

def objective(trial):
    params = {
        'kernel': 'rbf',
        'C': trial.suggest_float('C', 1e-3, 1e3, log=True),
        'gamma': trial.suggest_float('gamma', 1e-5, 1e2, log=True),
        'class_weight': trial.suggest_categorical('class_weight', [None, 'balanced']),
        'probability': True,
        'cache_size': 1000
    }

    model = SVC(**params)
    scores = cross_val_score(model, X_train_scaled, y_train,
                           cv=3, scoring='roc_auc', n_jobs=-1)
    return np.mean(scores)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

best_params = study.best_params
best_params['kernel'] = 'rbf'
best_params['probability'] = True

final_model_Pension_rbf = SVC(**best_params)
final_model_Pension_rbf.fit(X_train_scaled, y_train)

y_proba = final_model_Pension_rbf.predict_proba(X_val_scaled)[:, 1]
fpr_rbf, tpr_rbf, _ = roc_curve(y_val, y_proba)
roc_auc_rbf = roc_auc_score(y_val, y_proba)
print("\nOptimization Results:")
print(f"Best Validation AUC: {roc_auc_rbf:.4f}")
print("\nBest Parameters:")
for param, value in best_params.items():
    print(f"{param:>15}: {value}")

[I 2025-03-30 13:16:03,749] A new study created in memory with name: no-name-16b119f7-74c0-4982-a266-4962ec60f77d
[I 2025-03-30 13:18:18,517] Trial 0 finished with value: 0.69277372743284 and parameters: {'C': 268.53759419975336, 'gamma': 0.0007020725739147746, 'class_weight': None}. Best is trial 0 with value: 0.69277372743284.
[I 2025-03-30 13:18:35,544] Trial 1 finished with value: 0.7559176973445059 and parameters: {'C': 0.0013987708532579334, 'gamma': 0.0019675451123915013, 'class_weight': None}. Best is trial 1 with value: 0.7559176973445059.
[I 2025-03-30 13:18:51,986] Trial 2 finished with value: 0.6594124218979236 and parameters: {'C': 0.02431591428009784, 'gamma': 0.002424498666926584, 'class_weight': None}. Best is trial 1 with value: 0.7559176973445059.
[I 2025-03-30 13:19:09,888] Trial 3 finished with value: 0.6850363493523702 and parameters: {'C': 0.010672432999709647, 'gamma': 0.023879066362446872, 'class_weight': None}. Best is trial 1 with value: 0.7559176973445059.
[I


Optimization Results:
Best Validation AUC: 0.7784

Best Parameters:
              C: 0.30674757973192535
          gamma: 0.00044508369810259416
   class_weight: balanced
         kernel: rbf
    probability: True


In [None]:
train = pd.read_csv('train_Pension.csv')
val = pd.read_csv('val_Pension.csv')

X_train = train.drop('Pension', axis=1)
y_train = train['Pension']
X_val = val.drop('Pension', axis=1)
y_val = val['Pension']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

best_params = {
    'C': 0.30674757973192535,
    'gamma': 0.00044508369810259416,
    'class_weight': 'balanced',
    'kernel': 'rbf',
    'probability': True,
    'random_state': 42
}

model = SVC(**best_params)
cv_scores = cross_val_score(model, X_train_scaled, y_train,
                          cv=5, scoring='roc_auc', n_jobs=-1)

print(f"Cross-Validation AUC: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

final_model = SVC(**best_params)
final_model.fit(X_train_scaled, y_train)

y_proba = final_model.predict_proba(X_val_scaled)[:, 1]
val_auc = roc_auc_score(y_val, y_proba)

print(f"\nValidation AUC: {val_auc:.4f}")
print("\nBest Parameters:")
for param, value in best_params.items():
    print(f"{param:>15}: {value}")

Cross-Validation AUC: 0.7739 (±0.0109)

Validation AUC: 0.7784

Best Parameters:
              C: 0.30674757973192535
          gamma: 0.00044508369810259416
   class_weight: balanced
         kernel: rbf
    probability: True
   random_state: 42


In [None]:
train = pd.read_csv('train_Savings.csv')
val = pd.read_csv('val_Savings.csv')

X_train = train.drop('Savings', axis=1)
y_train = train['Savings']
X_val = val.drop('Savings', axis=1)
y_val = val['Savings']

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

best_params = {
    'C': 11.784268056358108,
    'gamma': 0.0005921563846307659,
    'class_weight': 'balanced',
    'kernel': 'rbf',
    'probability': True,
    'random_state': 42,
    'cache_size': 1000
}

model = SVC(**best_params)
cv_scores = cross_val_score(model, X_train_scaled, y_train,
                          cv=5, scoring='roc_auc', n_jobs=-1)

print(f"Cross-Validation AUC: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

final_model = SVC(**best_params)
final_model.fit(X_train_scaled, y_train)

y_proba = final_model.predict_proba(X_val_scaled)[:, 1]
val_auc = roc_auc_score(y_val, y_proba)

print(f"\nValidation AUC: {val_auc:.4f}")
print("\nBest Parameters:")
for param, value in best_params.items():
    print(f"{param:>15}: {value if not isinstance(value, float) else f'{value:.6f}'}")

Cross-Validation AUC: 0.6731 (±0.0115)

Validation AUC: 0.6950

Best Parameters:
              C: 11.784268
          gamma: 0.000592
   class_weight: balanced
         kernel: rbf
    probability: True
   random_state: 42
     cache_size: 1000


##NLP

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

train = pd.read_csv('train_Mortgage.csv')
val = pd.read_csv('val_Mortgage.csv')

X_train = train.drop('Mortgage', axis=1).values
y_train = train['Mortgage'].values
X_val = val.drop('Mortgage', axis=1).values
y_val = val['Mortgage'].values

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

class SimpleMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(X_train.shape[1], 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

# Обучение базовой модели
base_model = SimpleMLP()
optimizer = torch.optim.Adam(base_model.parameters(), lr=0.001)
loader = DataLoader(TensorDataset(torch.FloatTensor(X_train_scaled), torch.FloatTensor(y_train).unsqueeze(1)),
                    batch_size=64, shuffle=True)

for _ in range(10):
    for Xb, yb in loader:
        optimizer.zero_grad()
        loss = nn.BCELoss()(base_model(Xb), yb)
        loss.backward()
        optimizer.step()

# ROC-AUC до оптимизации
base_model.eval()
with torch.no_grad():
    base_probs = base_model(torch.FloatTensor(X_val_scaled)).numpy().ravel()
    base_auc = roc_auc_score(y_val, base_probs)

# 3. Оптимизированная модель
class OptimizedMLP(nn.Module):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(X_train.shape[1], 48),
            nn.LeakyReLU(),
            nn.Dropout(0.15),
            nn.Linear(48, 176),
            nn.BatchNorm1d(176),
            nn.LeakyReLU(),
            nn.Dropout(0.57),
            nn.Linear(176, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.net(x)

opt_model = OptimizedMLP()
optimizer = torch.optim.RAdam(opt_model.parameters(), lr=0.0005, weight_decay=0.02)
loader = DataLoader(TensorDataset(torch.FloatTensor(X_train_scaled), torch.FloatTensor(y_train).unsqueeze(1)),
                    batch_size=32, shuffle=True)

for _ in range(100):
    for Xb, yb in loader:
        optimizer.zero_grad()
        loss = nn.BCELoss()(opt_model(Xb), yb)
        loss.backward()
        optimizer.step()

opt_model.eval()
with torch.no_grad():
    opt_probs = opt_model(torch.FloatTensor(X_val_scaled)).numpy().ravel()
    opt_auc = roc_auc_score(y_val, opt_probs)

print(f"Baseline AUC: {base_auc:.4f}")
print(f"Optimized AUC: {opt_auc:.4f}")

Baseline AUC: 0.9391
Optimized AUC: 0.9404


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt

train = pd.read_csv('train_Pension.csv')
val = pd.read_csv('val_Pension.csv')

X_all = pd.concat([train.drop('Pension', axis=1), val.drop('Pension', axis=1)])
y_all = pd.concat([train['Pension'], val['Pension']]).values

BEST_PARAMS = {
    'n_layers': 2,
    'activation': 'ELU',
    'n_units_0': 400,
    'use_bn_0': True,
    'dropout_0': 0.266,
    'n_units_1': 368,
    'use_bn_1': False,
    'dropout_1': 0.1297,
    'lr': 0.000358,
    'batch_size': 16,
    'optimizer': 'Adam',
    'weight_decay': 0.0341,
    'epochs': 14
}

def create_mlp(input_dim):
    layers = []
    in_features = input_dim
    layers.append(nn.Linear(in_features, 400))
    layers.append(nn.ELU())
    layers.append(nn.BatchNorm1d(400))
    layers.append(nn.Dropout(0.266))

    layers.append(nn.Linear(400, 368))
    layers.append(nn.ELU())
    layers.append(nn.Dropout(0.1297))

    layers.append(nn.Linear(368, 1))
    layers.append(nn.Sigmoid())
    return nn.Sequential(*layers)

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

model = create_mlp(X_train.shape[1])
optimizer = optim.Adam(model.parameters(),
                      lr=BEST_PARAMS['lr'],
                      weight_decay=BEST_PARAMS['weight_decay'])

train_loader = DataLoader(
    TensorDataset(torch.FloatTensor(X_train_scaled),
                 torch.FloatTensor(y_train).unsqueeze(1)),
    batch_size=16,
    shuffle=True
)

for _ in range(14):
    model.train()
    for Xb, yb in train_loader:
        optimizer.zero_grad()
        loss = nn.BCELoss()(model(Xb), yb)
        loss.backward()
        optimizer.step()

model.eval()
with torch.no_grad():
    val_probs = model(torch.FloatTensor(X_val_scaled)).numpy().ravel()

print(f"ROC-AUC ДО кросс-валидации: {roc_auc_score(y_val, val_probs):.4f}")

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X_all, y_all)):
    X_train, X_test = X_all.iloc[train_idx], X_all.iloc[test_idx]
    y_train, y_test = y_all[train_idx], y_all[test_idx]

    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    model = create_mlp(X_train.shape[1])
    optimizer = optim.Adam(model.parameters(),
                          lr=BEST_PARAMS['lr'],
                          weight_decay=BEST_PARAMS['weight_decay'])

    train_loader = DataLoader(
        TensorDataset(torch.FloatTensor(X_train),
                        torch.FloatTensor(y_train).unsqueeze(1)),
        batch_size=16,
        shuffle=True
    )

    for _ in range(14):
        model.train()
        for Xb, yb in train_loader:
            optimizer.zero_grad()
            loss = nn.BCELoss()(model(Xb), yb)
            loss.backward()
            optimizer.step()

    model.eval()
    with torch.no_grad():
        probs = model(torch.FloatTensor(X_test)).numpy().ravel()

    auc = roc_auc_score(y_test, probs)
    auc_scores.append(auc)

print(f"\nROC-AUC ПОСЛЕ кросс-валидации: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")

ROC-AUC ДО кросс-валидации: 0.7692

ROC-AUC ПОСЛЕ кросс-валидации: 0.7649 ± 0.0198


In [None]:
train_data = pd.read_csv('train_Savings.csv')
val_data = pd.read_csv('val_Savings.csv')

combined = pd.concat([train_data, val_data])
X = combined.drop('Savings', axis=1).values
y = combined['Savings'].values

X_train = X[:len(train_data)]
y_train = y[:len(train_data)]
X_val = X[len(train_data):]
y_val = y[len(train_data):]

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

FIXED_PARAMS = {
    'lr': 0.0013819558991985145,
    'batch_size': 16,
    'optimizer': 'RAdam',
    'weight_decay': 0.0056240739754509494,
    'architecture': {
        'n_layers': 1,
        'activation': 'LeakyReLU',
        'n_units_0': 352,
        'use_bn_0': False,
        'dropout_0': 0.4246360943397488
    },
    'epochs': 30
}

def create_model(input_size):
    layers = []
    in_features = input_size

    layers.append(nn.Linear(in_features, FIXED_PARAMS['architecture']['n_units_0']))
    layers.append(nn.LeakyReLU(negative_slope=0.1))
    if FIXED_PARAMS['architecture']['dropout_0'] > 0:
        layers.append(nn.Dropout(FIXED_PARAMS['architecture']['dropout_0']))

    layers.append(nn.Linear(FIXED_PARAMS['architecture']['n_units_0'], 1))
    layers.append(nn.Sigmoid())

    return nn.Sequential(*layers)

model = create_model(X_train.shape[1])
optimizer = getattr(optim, FIXED_PARAMS['optimizer'])(
    model.parameters(),
    lr=FIXED_PARAMS['lr'],
    weight_decay=FIXED_PARAMS['weight_decay']
)

train_dataset = TensorDataset(
    torch.FloatTensor(X_train),
    torch.FloatTensor(y_train).reshape(-1, 1)
)
train_loader = DataLoader(
    train_dataset,
    batch_size=FIXED_PARAMS['batch_size'],
    shuffle=True
)

for epoch in range(FIXED_PARAMS['epochs']):
    model.train()
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = nn.BCELoss()(outputs, labels)
        loss.backward()
        optimizer.step()

model.eval()
with torch.no_grad():
    val_probs = model(torch.FloatTensor(X_val)).numpy().ravel()
    auc_raw = roc_auc_score(y_val, val_probs)

print(f"ROC-AUC без кроссвалидации: {auc_raw:.4f}")

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X, y)):
    X_train_fold, X_test_fold = X[train_idx], X[test_idx]
    y_train_fold, y_test_fold = y[train_idx], y[test_idx]

    scaler_fold = StandardScaler().fit(X_train_fold)
    X_train_fold = scaler_fold.transform(X_train_fold)
    X_test_fold = scaler_fold.transform(X_test_fold)

    fold_model = create_model(X_train_fold.shape[1])
    fold_optimizer = getattr(optim, FIXED_PARAMS['optimizer'])(
        fold_model.parameters(),
        lr=FIXED_PARAMS['lr'],
        weight_decay=FIXED_PARAMS['weight_decay']
    )

    fold_loader = DataLoader(
        TensorDataset(
            torch.FloatTensor(X_train_fold),
            torch.FloatTensor(y_train_fold).reshape(-1, 1)
        ),
        batch_size=FIXED_PARAMS['batch_size'],
        shuffle=True
    )

    for epoch in range(FIXED_PARAMS['epochs']):
        fold_model.train()
        for inputs, labels in fold_loader:
            fold_optimizer.zero_grad()
            outputs = fold_model(inputs)
            loss = nn.BCELoss()(outputs, labels)
            loss.backward()
            fold_optimizer.step()

    fold_model.eval()
    with torch.no_grad():
        probs = fold_model(torch.FloatTensor(X_test_fold)).numpy().ravel()
        auc = roc_auc_score(y_test_fold, probs)
        auc_scores.append(auc)

print(f"\nСредний ROC-AUC с кроссвалидацией: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")

ROC-AUC без кроссвалидации: 0.6948

Средний ROC-AUC с кроссвалидацией: 0.6727 ± 0.0063


##log_reg

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, roc_curve

train = pd.read_csv('train_Pension.csv')
val = pd.read_csv('val_Pension.csv')

s = 'Pension'

X_train = train.drop(columns=[s])
y_train = train[s]
X_val = val.drop(columns=[s])
y_val = val[s]

X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size=0.3,
    random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)

FIXED_PARAMS = {
    'C': 0.01,
    'l1_ratio': 0.8,
    'max_iter': 1000,
    'penalty': 'elasticnet',
    'solver': 'saga',
    'random_state': 42
}

model = LogisticRegression(**FIXED_PARAMS)
model.fit(X_train_scaled, y_train)

val_probs = model.predict_proba(X_val_scaled)[:, 1]
val_auc = roc_auc_score(y_val, val_probs)

print(f"ROC-AUC без кроссвалидации: {val_auc:.4f}")

X_full_train = pd.concat([X_train, X_test])
y_full_train = pd.concat([y_train, y_test])
X_full_scaled = scaler.fit_transform(X_full_train)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X_full_scaled, y_full_train)):
    X_tr, X_te = X_full_scaled[train_idx], X_full_scaled[test_idx]
    y_tr, y_te = y_full_train.iloc[train_idx], y_full_train.iloc[test_idx]

    fold_model = LogisticRegression(**FIXED_PARAMS)
    fold_model.fit(X_tr, y_tr)

    fold_probs = fold_model.predict_proba(X_te)[:, 1]
    auc = roc_auc_score(y_te, fold_probs)
    auc_scores.append(auc)

print(f"\nСредний ROC-AUC с кроссвалидацией: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")


ROC-AUC без кроссвалидации: 0.7778

Средний ROC-AUC с кроссвалидацией: 0.7772 ± 0.0039


In [None]:
train = pd.read_csv('train_Savings.csv')
val = pd.read_csv('val_Savings.csv')

target = 'Savings'

X_train = train.drop(columns=[target])
y_train = train[target]
X_val = val.drop(columns=[target])
y_val = val[target]

X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size=0.3,
    random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)

FIXED_PARAMS = {
    'C': 0.1,
    'max_iter': 500,
    'penalty': 'l2',
    'solver': 'lbfgs',
    'random_state': 42
}

model = LogisticRegression(**FIXED_PARAMS)
model.fit(X_train_scaled, y_train)

val_probs = model.predict_proba(X_val_scaled)[:, 1]
val_auc = roc_auc_score(y_val, val_probs)

print(f"ROC-AUC без кроссвалидации: {val_auc:.4f}")

X_full_train = pd.concat([X_train, X_test])
y_full_train = pd.concat([y_train, y_test])
X_full_scaled = scaler.fit_transform(X_full_train)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X_full_scaled, y_full_train)):
    X_tr, X_te = X_full_scaled[train_idx], X_full_scaled[test_idx]
    y_tr, y_te = y_full_train.iloc[train_idx], y_full_train.iloc[test_idx]

    fold_model = LogisticRegression(**FIXED_PARAMS)
    fold_model.fit(X_tr, y_tr)

    fold_probs = fold_model.predict_proba(X_te)[:, 1]
    auc = roc_auc_score(y_te, fold_probs)
    auc_scores.append(auc)

print(f"\nСредний ROC-AUC с кроссвалидацией: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")

ROC-AUC без кроссвалидации: 0.6928

Средний ROC-AUC с кроссвалидацией: 0.6726 ± 0.0182


In [None]:
train = pd.read_csv('train_Mortgage.csv')
val = pd.read_csv('val_Mortgage.csv')

target = 'Mortgage'

X_train = train.drop(columns=[target])
y_train = train[target]
X_val = val.drop(columns=[target])
y_val = val[target]

X_train, X_test, y_train, y_test = train_test_split(
    X_train, y_train,
    test_size=0.3,
    random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
X_val_scaled = scaler.transform(X_val)

FIXED_PARAMS = {
    'C': 0.01,
    'l1_ratio': 0.5,
    'max_iter': 500,
    'penalty': 'elasticnet',
    'solver': 'saga',
    'random_state': 42
}

model = LogisticRegression(**FIXED_PARAMS)
model.fit(X_train_scaled, y_train)

val_probs = model.predict_proba(X_val_scaled)[:, 1]
val_auc = roc_auc_score(y_val, val_probs)

print(f"ROC-AUC без кроссвалидации: {val_auc:.4f}")

X_full_train = pd.concat([X_train, X_test])
y_full_train = pd.concat([y_train, y_test])
X_full_scaled = scaler.fit_transform(X_full_train)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
auc_scores = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X_full_scaled, y_full_train)):
    X_tr, X_te = X_full_scaled[train_idx], X_full_scaled[test_idx]
    y_tr, y_te = y_full_train.iloc[train_idx], y_full_train.iloc[test_idx]

    fold_model = LogisticRegression(**FIXED_PARAMS)
    fold_model.fit(X_tr, y_tr)

    fold_probs = fold_model.predict_proba(X_te)[:, 1]
    auc = roc_auc_score(y_te, fold_probs)
    auc_scores.append(auc)

print(f"\nСредний ROC-AUC с кроссвалидацией: {np.mean(auc_scores):.4f} ± {np.std(auc_scores):.4f}")

ROC-AUC без кроссвалидации: 0.9411

Средний ROC-AUC с кроссвалидацией: 0.9451 ± 0.0038


##random forest

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

train_Mortgage = pd.read_csv('train_Mortgage.csv')
val_Mortgage = pd.read_csv('val_Mortgage.csv')

feature_columns = [col for col in train_Mortgage.columns if col != 'Mortgage']

X_train = train_Mortgage[feature_columns]
y_train = train_Mortgage['Mortgage']
X_val = val_Mortgage[feature_columns]
y_val = val_Mortgage['Mortgage']

model = ExtraTreesClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=10,
    class_weight=None,
    random_state=42
)
model.fit(X_train, y_train)

val_probs = model.predict_proba(X_val)[:, 1]
print(f'Validation ROC-AUC: {roc_auc_score(y_val, val_probs):.4f}')

cv_scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
print(f'Cross-Validation ROC-AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}')

Validation ROC-AUC: 0.9409
Cross-Validation ROC-AUC: 0.9449 ± 0.0059


In [None]:
train_Pension = pd.read_csv('train_Pension.csv')
val_Pension = pd.read_csv('val_Pension.csv')

feature_columns = [col for col in train_Pension.columns if col != 'Pension']

X_train = train_Pension[feature_columns]
y_train = train_Pension['Pension']
X_val = val_Pension[feature_columns]
y_val = val_Pension['Pension']

model = ExtraTreesClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=2,
    min_samples_split=10,
    class_weight='balanced',
    random_state=42
)

model.fit(X_train, y_train)

val_probs = model.predict_proba(X_val)[:, 1]
print(f'Pension Validation ROC-AUC: {roc_auc_score(y_val, val_probs):.4f}')

cv_scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
print(f'Pension Cross-Validation ROC-AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}')

Pension Validation ROC-AUC: 0.7778
Pension Cross-Validation ROC-AUC: 0.7735 ± 0.0103


In [None]:
train_Savings = pd.read_csv('train_Savings.csv')
val_Savings = pd.read_csv('val_Savings.csv')

feature_columns = [col for col in train_Savings.columns if col != 'Savings']

X_train = train_Savings[feature_columns]
y_train = train_Savings['Savings']
X_val = val_Savings[feature_columns]
y_val = val_Savings['Savings']

model = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=5,
    min_samples_leaf=1,
    min_samples_split=2,
    class_weight=None,
    random_state=42
)

model.fit(X_train, y_train)

val_probs = model.predict_proba(X_val)[:, 1]
print(f'Savings Validation ROC-AUC: {roc_auc_score(y_val, val_probs):.4f}')

cv_scores = cross_val_score(
    model,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
print(f'Savings Cross-Validation ROC-AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}')

Savings Validation ROC-AUC: 0.6921
Savings Cross-Validation ROC-AUC: 0.6698 ± 0.0089


##Isolation Forest

In [None]:
train_Mortgage = pd.read_csv('train_Mortgage.csv')
val_Mortgage = pd.read_csv('val_Mortgage.csv')

feature_columns = [col for col in train_Mortgage.columns if col != 'Mortgage']
X_train = train_Mortgage[feature_columns]
y_train = train_Mortgage['Mortgage']
X_val = val_Mortgage[feature_columns]
y_val = val_Mortgage['Mortgage']

anomaly_mortgage_model = IsolationForest(
    n_estimators=100,
    random_state=42,
    contamination='auto'
)
anomaly_mortgage_model.fit(X_train)

X_train['is_anomaly'] = anomaly_mortgage_model.predict(X_train)
X_train['is_anomaly'] = X_train['is_anomaly'].map({1: 0, -1: 1})

X_train_isol = X_train[X_train['is_anomaly'] == 0].drop('is_anomaly', axis=1)
y_train_isol = y_train[X_train['is_anomaly'] == 0]

isol_mortgage_model = RandomForestClassifier(
    class_weight=None,
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=10,
    n_estimators=200,
    random_state=42
)
isol_mortgage_model.fit(X_train_isol, y_train_isol)

# Функция оценки
def evaluate_model(model, X, y):
    probs = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, probs)

# Валидация
val_score = evaluate_model(isol_mortgage_model, X_val, y_val)
print(f'Mortgage Validation ROC-AUC: {val_score:.4f}')

# Кросс-валидация
cv_scores = cross_val_score(
    isol_mortgage_model,
    X_train_isol,
    y_train_isol,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
print(f'Mortgage CV ROC-AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}')

Mortgage Validation ROC-AUC: 0.9381
Mortgage CV ROC-AUC: 0.9419 ± 0.0052


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

train_Pension = pd.read_csv('train_Pension.csv')
val_Pension = pd.read_csv('val_Pension.csv')

feature_columns = [col for col in train_Pension.columns if col != 'Pension']
X_train = train_Pension[feature_columns]
y_train = train_Pension['Pension']
X_val = val_Pension[feature_columns]
y_val = val_Pension['Pension']

anomaly_model = IsolationForest(
    n_estimators=100,
    random_state=42,
    contamination='auto'
)
anomaly_model.fit(X_train)

X_train['is_anomaly'] = anomaly_model.predict(X_train)
X_train['is_anomaly'] = X_train['is_anomaly'].map({1: 0, -1: 1})

clean_X = X_train[X_train['is_anomaly'] == 0].drop('is_anomaly', axis=1)
clean_y = y_train[X_train['is_anomaly'] == 0]

model = RandomForestClassifier(
    class_weight='balanced',
    max_depth=5,
    min_samples_leaf=2,
    min_samples_split=10,
    n_estimators=100,
    random_state=42
)
model.fit(clean_X, clean_y)

val_probs = model.predict_proba(X_val)[:, 1]
print(f'Pension Validation ROC-AUC: {roc_auc_score(y_val, val_probs):.4f}')

cv_scores = cross_val_score(
    model,
    clean_X,
    clean_y,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
print(f'Pension CV ROC-AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}')


Pension Validation ROC-AUC: 0.7727
Pension CV ROC-AUC: 0.7639 ± 0.0144


In [None]:
train_Savings = pd.read_csv('train_Savings.csv')
val_Savings = pd.read_csv('val_Savings.csv')

feature_columns = [col for col in train_Savings.columns if col != 'Savings']
X_train = train_Savings[feature_columns]
y_train = train_Savings['Savings']
X_val = val_Savings[feature_columns]
y_val = val_Savings['Savings']

anomaly_model = IsolationForest(
    n_estimators=100,
    random_state=42,
    contamination='auto'
)
anomaly_model.fit(X_train)

X_train['is_anomaly'] = anomaly_model.predict(X_train)
X_train['is_anomaly'] = X_train['is_anomaly'].map({1: 0, -1: 1})

mask = X_train['is_anomaly'] == 0
X_train_clean = X_train[mask].drop('is_anomaly', axis=1)
y_train_clean = y_train[mask]

savings_model = RandomForestClassifier(
    class_weight=None,
    max_depth=5,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=300,
    random_state=42
)
savings_model.fit(X_train_clean, y_train_clean)

val_probs = savings_model.predict_proba(X_val)[:, 1]
print(f'\nSavings Validation ROC-AUC: {roc_auc_score(y_val, val_probs):.4f}')

cv_scores = cross_val_score(
    savings_model,
    X_train_clean,
    y_train_clean,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
print(f'Savings CV ROC-AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}\n')


Savings Validation ROC-AUC: 0.6857
Savings CV ROC-AUC: 0.6707 ± 0.0089



##Extra Trees

In [None]:
feature_columns = [col for col in train_Mortgage.columns if col != 'Mortgage']
X_train = train_Mortgage[feature_columns]
y_train = train_Mortgage['Mortgage']
X_val = val_Mortgage[feature_columns]
y_val = val_Mortgage['Mortgage']

final_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_leaf': 5,
    'min_samples_split': 2,
    'random_state': 42
}

final_model = ExtraTreesClassifier(**final_params).fit(X_train, y_train)

val_probs = final_model.predict_proba(X_val)[:, 1]
print(f'Validation ROC-AUC: {roc_auc_score(y_val, val_probs):.4f}')

cv_scores = cross_val_score(
    final_model,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
print(f'Cross-Validation ROC-AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}')

Validation ROC-AUC: 0.9402
Cross-Validation ROC-AUC: 0.9455 ± 0.0058


In [None]:
feature_columns = [col for col in train_Pension.columns if col != 'Pension']
X_train = train_Pension[feature_columns]
y_train = train_Pension['Pension']
X_val = val_Pension[feature_columns]
y_val = val_Pension['Pension']

final_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'min_samples_leaf': 3,
    'min_samples_split': 10,
    'random_state': 42
}

final_model = ExtraTreesClassifier(**final_params).fit(X_train, y_train)

val_probs = final_model.predict_proba(X_val)[:, 1]
print(f'Validation ROC-AUC: {roc_auc_score(y_val, val_probs):.4f}')

cv_scores = cross_val_score(
    final_model,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
print(f'Cross-Validation ROC-AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}')

Validation ROC-AUC: 0.7781
Cross-Validation ROC-AUC: 0.7718 ± 0.0099


In [None]:
feature_columns = [col for col in train_Savings.columns if col != 'Savings']
X_train = train_Savings[feature_columns]
y_train = train_Savings['Savings']
X_val = val_Savings[feature_columns]
y_val = val_Savings['Savings']

final_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'min_samples_leaf': 1,
    'min_samples_split': 10,
    'random_state': 42
}

final_model = ExtraTreesClassifier(**final_params).fit(X_train, y_train)

val_probs = final_model.predict_proba(X_val)[:, 1]
print(f'Validation ROC-AUC: {roc_auc_score(y_val, val_probs):.4f}')

cv_scores = cross_val_score(
    final_model,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)
print(f'Cross-Validation ROC-AUC: {np.mean(cv_scores):.4f} ± {np.std(cv_scores):.4f}')

Validation ROC-AUC: 0.6891
Cross-Validation ROC-AUC: 0.6679 ± 0.0109


##combunation of boosting

In [3]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier

all_models_m = {
    "CatBoost": CatBoostClassifier(iterations=839, learning_rate=0.05774795442279825, depth=6,
                                  l2_leaf_reg=0.2880865175456189, random_strength=0.16254277485904814,
                                  bagging_temperature=0.7992786757360438, border_count=32, verbose=0, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=234, max_depth=14, learning_rate=0.13020051661181956,
                            subsample=0.5000183482043092, colsample_bytree=0.9712108470499956,
                            min_child_weight=10, gamma=4.927111346239827, reg_alpha=2.3159564553387995,
                            reg_lambda=1.7788240357504277, random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=891, learning_rate=0.2775178145806826, num_leaves=154, max_depth=4,
                              min_child_samples=5, subsample=0.875834146327149, colsample_bytree=0.9367342318343456,
                              reg_alpha=4.0997572412343555, reg_lambda=1.3718029339702882, verbose=-1, random_state=42),
    "ExtraTrees": ExtraTreesClassifier(max_depth=10, min_samples_leaf=5, min_samples_split=2,
                                      n_estimators=200, random_state=42),
    "LogisticRegression": LogisticRegression(C=0.01, max_iter=1000, solver='lbfgs', random_state=42)
}

all_models_p = {
    "CatBoost": CatBoostClassifier(iterations=200, learning_rate=0.010875325239675968, l2_leaf_reg=0.010817134192675052,
                                  random_strength=4.740509717654505, bagging_temperature=0.9584104224319896,
                                  border_count=75, depth=1, verbose=0, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=238, max_depth=3, learning_rate=0.058427369161116946,
                            subsample=0.9293867219852593, colsample_bytree=0.5238187849054002,
                            min_child_weight=6, gamma=4.7308173402905505, reg_alpha=8.066467399907712,
                            reg_lambda=4.390801165747763, random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=1191, learning_rate=0.11153678049934727, num_leaves=107, max_depth=3,
                              min_child_samples=41, subsample=0.5173705475484477, colsample_bytree=0.7026966245620551,
                              reg_alpha=0.45638892145814025, reg_lambda=5.315359029396548, verbose=-1, random_state=42),
    "ExtraTrees": ExtraTreesClassifier(max_depth=10, min_samples_leaf=3, min_samples_split=10,
                                      n_estimators=100, random_state=42),
    "LogisticRegression": LogisticRegression(C=0.001, max_iter=1000, solver='lbfgs', random_state=42)
}

all_models_s = {
    "CatBoost": CatBoostClassifier(iterations=205, learning_rate=0.041965496183629054, l2_leaf_reg=0.0033091053333706402,
                                  random_strength=0.4267643014670958, bagging_temperature=0.6548287821971729,
                                  border_count=254, depth=5, verbose=0, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=1446, max_depth=3, learning_rate=0.17633766607373635,
                            subsample=0.8802608079409611, colsample_bytree=0.6399123535237496,
                            min_child_weight=10, gamma=4.7135127571247475, reg_alpha=7.307229547015628,
                            reg_lambda=0.5506026977366328, random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=903, learning_rate=0.2620735099496511, num_leaves=198, max_depth=5,
                              min_child_samples=98, subsample=0.5818744149073714, colsample_bytree=0.9979504029037664,
                              reg_alpha=3.2759181752443514, reg_lambda=9.427794042824686, verbose=-1, random_state=42),
    "ExtraTrees": ExtraTreesClassifier(max_depth=10, min_samples_leaf=1, min_samples_split=10,
                                      n_estimators=200, random_state=42),
    "LogisticRegression": LogisticRegression(C=0.1, max_iter=1000, solver='lbfgs', random_state=42)
}

def load_data(target):
    train = pd.read_csv(f'train_{target}.csv')
    val = pd.read_csv(f'val_{target}.csv')

    feature_columns = [col for col in train.columns if col != target]
    X_train = train[feature_columns]
    y_train = train[target]
    X_val = val[feature_columns]
    y_val = val[target]

    return X_train, y_train, X_val, y_val

target_config = {
    'Mortgage': {
        'models': all_models_m,
        'combinations': [
            'LightGBM+ExtraTrees',
            'LightGBM+ExtraTrees+LogisticRegression',
            'LightGBM+LogisticRegression',
            'LightGBM+XGBoost',
            'CatBoost+LightGBM+LogisticRegression',
            'LightGBM+XGBoost+LogisticRegression',
            'CatBoost+XGBoost+ExtraTrees+LogisticRegression',
            'LightGBM+XGBoost+ExtraTrees+LogisticRegression',
            'CatBoost+LightGBM',
            'CatBoost+XGBoost+ExtraTrees'
        ]
    },
    'Pension': {
        'models': all_models_p,
        'combinations': [
            'LightGBM+XGBoost+ExtraTrees',
            'CatBoost+LightGBM+XGBoost+ExtraTrees+LogisticRegression',
            'CatBoost+ExtraTrees',
            'CatBoost+LightGBM+LogisticRegression',
            'CatBoost+XGBoost+ExtraTrees+LogisticRegression',
            'LightGBM+XGBoost+ExtraTrees+LogisticRegression',
            'CatBoost+LightGBM+XGBoost+ExtraTrees',
            'CatBoost+LightGBM+XGBoost',
            'XGBoost+ExtraTrees',
            'CatBoost+LightGBM+ExtraTrees+LogisticRegression'
        ]
    },
    'Savings': {
        'models': all_models_s,
        'combinations': [
            'CatBoost+XGBoost+ExtraTrees+LogisticRegression',
            'CatBoost+XGBoost+LogisticRegression',
            'XGBoost+LogisticRegression',
            'XGBoost+ExtraTrees+LogisticRegression',
            'LightGBM+LogisticRegression',
            'LightGBM+XGBoost+LogisticRegression',
            'CatBoost+LightGBM+XGBoost+ExtraTrees+LogisticRegression',
            'ExtraTrees+LogisticRegression',
            'LightGBM+XGBoost+ExtraTrees+LogisticRegression',
            'CatBoost+LightGBM+XGBoost+LogisticRegression'
        ]
    }
}

def evaluate_combination(models_dict, combination, X_train, y_train, X_val, y_val):
    """Оценка комбинации моделей"""
    models = combination.split('+')

    roc_before = []
    for model_name in models:
        model = models_dict[model_name]
        scores = cross_val_score(model, X_train, y_train,
                                scoring='roc_auc', cv=5, n_jobs=-1)
        roc_before.append(np.mean(scores))
    roc_before = np.mean(roc_before)

    stacking = StackingClassifier(
        estimators=[(name, models_dict[name]) for name in models],
        final_estimator=LogisticRegression(max_iter=1000, random_state=42),
        cv=5,
        n_jobs=-1
    )
    stacking.fit(X_train, y_train)
    y_proba = stacking.predict_proba(X_val)[:, 1]
    roc_after = roc_auc_score(y_val, y_proba)

    return roc_before, roc_after

results = {}
for target in ['Mortgage', 'Pension', 'Savings']:
    X_train, y_train, X_val, y_val = load_data(target)
    config = target_config[target]

    target_results = []
    for combo in config['combinations']:
        roc_before, roc_after = evaluate_combination(
            config['models'], combo,
            X_train, y_train, X_val, y_val
        )
        target_results.append((combo, roc_before, roc_after))

    results[target] = target_results

for target, data in results.items():
    print(f"\n**{target}**")
    print("```")
    print("+---------------------------------------------------------+------------+-----------+")
    print("| Комбинация                                              | ROC before | ROC after |")
    print("+---------------------------------------------------------+------------+-----------+")
    for combo, before, after in data:
        print(f"| {combo.ljust(55)} | {before:.5f}   | {after:.5f}  |")
    print("+---------------------------------------------------------+------------+-----------+")
    print("```\n")


**Mortgage**
```
+---------------------------------------------------------+------------+-----------+
| Комбинация                                              | ROC before | ROC after |
+---------------------------------------------------------+------------+-----------+
| LightGBM+ExtraTrees                                     | 0.94425   | 0.94315  |
| LightGBM+ExtraTrees+LogisticRegression                  | 0.94425   | 0.94312  |
| LightGBM+LogisticRegression                             | 0.94362   | 0.94255  |
| LightGBM+XGBoost                                        | 0.94525   | 0.94239  |
| CatBoost+LightGBM+LogisticRegression                    | 0.94071   | 0.94312  |
| LightGBM+XGBoost+LogisticRegression                     | 0.94492   | 0.94235  |
| CatBoost+XGBoost+ExtraTrees+LogisticRegression          | 0.94303   | 0.94198  |
| LightGBM+XGBoost+ExtraTrees+LogisticRegression          | 0.94506   | 0.94240  |
| CatBoost+LightGBM                                       | 0.9

##Node

In [8]:
pip install torchdiffeq

Collecting torchdiffeq
  Downloading torchdiffeq-0.2.5-py3-none-any.whl.metadata (440 bytes)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.5.0->torchdiffeq)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.5.0->torchdiffeq)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.5.0->torchdiffeq)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.5.0->torchdiffeq)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.5.0->torchdiffeq)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from t

In [14]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

train_Mortgage = pd.read_csv('train_Mortgage.csv')
val_Mortgage = pd.read_csv('val_Mortgage.csv')

feature_columns = train_Mortgage.columns.drop(['Mortgage']).tolist()

X_train = train_Mortgage[feature_columns]
y_train = train_Mortgage[['Mortgage']]
X_val = val_Mortgage[feature_columns]
y_val = val_Mortgage[['Mortgage']]

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_val_encoded = le.transform(y_val)

X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train_encoded, dtype=torch.long)
y_val_tensor = torch.tensor(y_val_encoded, dtype=torch.long)

class ResBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, dim)
        )
    def forward(self, x):
        return x + self.net(x)

class ODEFunc(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.net = nn.Sequential(
            ResBlock(hidden_dim),
            ResBlock(hidden_dim),
        )
    def forward(self, x):
        return self.net(x)

class NODEClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.ode_func = ODEFunc(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        out = torch.relu(self.ode_func(x))
        return self.fc2(out)

input_dim = X_train_tensor.shape[1]
hidden_dim = 64
output_dim = len(le.classes_)
model = NODEClassifier(input_dim, hidden_dim, output_dim)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
num_epochs = 500

val_roc_history = []
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0:
        model.eval()
        with torch.no_grad():
            val_preds = torch.argmax(model(X_val_tensor), dim=1)
            roc_auc = roc_auc_score(y_val_tensor.numpy(), val_preds.numpy())
            val_roc_history.append(roc_auc)

roc_without_cv = val_roc_history[-1]

full_data = pd.concat([train_Mortgage, val_Mortgage])
X_full = full_data[feature_columns]
y_full = full_data['Mortgage']
y_full_encoded = le.fit_transform(y_full)

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_full, y_full_encoded)):
    X_train_fold = X_full.iloc[train_idx]
    X_val_fold = X_full.iloc[val_idx]
    y_train_fold = y_full_encoded[train_idx]
    y_val_fold = y_full_encoded[val_idx]

    scaler_fold = StandardScaler()
    X_train_sc = scaler_fold.fit_transform(X_train_fold)
    X_val_sc = scaler_fold.transform(X_val_fold)

    X_train_ts = torch.tensor(X_train_sc, dtype=torch.float32)
    X_val_ts = torch.tensor(X_val_sc, dtype=torch.float32)
    y_train_ts = torch.tensor(y_train_fold, dtype=torch.long)
    y_val_ts = torch.tensor(y_val_fold, dtype=torch.long)

    model_cv = NODEClassifier(
        input_dim=X_train_sc.shape[1],
        hidden_dim=hidden_dim,
        output_dim=output_dim
    )
    optimizer_cv = optim.Adam(model_cv.parameters(), lr=0.01)

    for epoch in range(num_epochs):
        model_cv.train()
        optimizer_cv.zero_grad()
        outputs = model_cv(X_train_ts)
        loss = criterion(outputs, y_train_ts)
        loss.backward()
        optimizer_cv.step()

    model_cv.eval()
    with torch.no_grad():
        preds = torch.argmax(model_cv(X_val_ts), dim=1)
        roc_auc = roc_auc_score(y_val_ts.numpy(), preds.numpy())
        cv_scores.append(roc_auc)

print(f'\nИтоговые метрики:')
print(f'1. ROC-AUC без кроссвалидации: {roc_without_cv:.4f}')
print(f'2. Средний ROC-AUC с кроссвалидацией: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})')

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)



Итоговые метрики:
1. ROC-AUC без кроссвалидации: 0.7477
2. Средний ROC-AUC с кроссвалидацией: 0.7857 (±0.0283)


In [17]:
train = pd.read_csv('train_Savings.csv')
val = pd.read_csv('val_Savings.csv')
feature_columns = train.columns.drop(['Savings']).tolist()

X_train = train[feature_columns]
y_train = train['Savings']
X_val = val[feature_columns]
y_val = val['Savings']

scaler = StandardScaler()
le = LabelEncoder()
X_train_sc = scaler.fit_transform(X_train)
X_val_sc = scaler.transform(X_val)
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)

X_train_ts = torch.tensor(X_train_sc, dtype=torch.float32)
X_val_ts = torch.tensor(X_val_sc, dtype=torch.float32)
y_train_ts = torch.tensor(y_train_enc, dtype=torch.long)
y_val_ts = torch.tensor(y_val_enc, dtype=torch.long)

model = NODEClassifier(X_train_ts.shape[1], 64, len(le.classes_))
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

for _ in range(500):
    optimizer.zero_grad()
    loss = criterion(model(X_train_ts), y_train_ts)
    loss.backward()
    optimizer.step()

model.eval()
with torch.no_grad():
    val_pred = torch.argmax(model(X_val_ts), dim=1)
    roc_without_cv = roc_auc_score(y_val_ts, val_pred)

full_data = pd.concat([train, val])
X_full = full_data[feature_columns]
y_full = le.fit_transform(full_data['Savings'])
kf = StratifiedKFold(5, shuffle=True, random_state=42)
cv_scores = []

for train_idx, val_idx in kf.split(X_full, y_full):
    X_tr, X_vl = X_full.iloc[train_idx], X_full.iloc[val_idx]
    y_tr, y_vl = y_full[train_idx], y_full[val_idx]

    scaler_cv = StandardScaler()
    X_tr_sc = scaler_cv.fit_transform(X_tr)
    X_vl_sc = scaler_cv.transform(X_vl)

    model_cv = NODEClassifier(X_tr_sc.shape[1], 64, len(le.classes_))
    opt = optim.Adam(model_cv.parameters(), lr=0.01)

    for _ in range(500):
        opt.zero_grad()
        loss = criterion(model_cv(torch.tensor(X_tr_sc, dtype=torch.float32)),
                       torch.tensor(y_tr, dtype=torch.long))
        loss.backward()
        opt.step()

    with torch.no_grad():
        pred = torch.argmax(model_cv(torch.tensor(X_vl_sc, dtype=torch.float32)), dim=1)
        cv_scores.append(roc_auc_score(y_vl, pred.numpy()))

print(f"Результаты для Savings:\n"
      f"1. ROC-AUC без кроссвалидации: {roc_without_cv:.4f}\n"
      f"2. ROC-AUC с кроссвалидацией: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

Результаты для Savings:
1. ROC-AUC без кроссвалидации: 0.5456
2. ROC-AUC с кроссвалидацией: 0.5600 (±0.0117)


In [19]:
train = pd.read_csv('train_Pension.csv')
val = pd.read_csv('val_Pension.csv')
feature_columns = train.columns.drop(['Pension']).tolist()

X_train = train[feature_columns]
y_train = train['Pension']
X_val = val[feature_columns]
y_val = val['Pension']

scaler = StandardScaler()
le = LabelEncoder()
X_train_sc = scaler.fit_transform(X_train)
X_val_sc = scaler.transform(X_val)
y_train_enc = le.fit_transform(y_train)
y_val_enc = le.transform(y_val)

X_train_ts = torch.tensor(X_train_sc, dtype=torch.float32)
X_val_ts = torch.tensor(X_val_sc, dtype=torch.float32)
y_train_ts = torch.tensor(y_train_enc, dtype=torch.long)
y_val_ts = torch.tensor(y_val_enc, dtype=torch.long)

model = NODEClassifier(X_train_ts.shape[1], 64, len(le.classes_))
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

for _ in range(500):
    optimizer.zero_grad()
    loss = criterion(model(X_train_ts), y_train_ts)
    loss.backward()
    optimizer.step()

model.eval()
with torch.no_grad():
    val_pred = torch.argmax(model(X_val_ts), dim=1)
    roc_without_cv = roc_auc_score(y_val_ts, val_pred)

full_data = pd.concat([train, val])
X_full = full_data[feature_columns]
y_full = le.fit_transform(full_data['Pension'])
kf = StratifiedKFold(5, shuffle=True, random_state=42)
cv_scores = []

for train_idx, val_idx in kf.split(X_full, y_full):
    X_tr, X_vl = X_full.iloc[train_idx], X_full.iloc[val_idx]
    y_tr, y_vl = y_full[train_idx], y_full[val_idx]

    scaler_cv = StandardScaler()
    X_tr_sc = scaler_cv.fit_transform(X_tr)
    X_vl_sc = scaler_cv.transform(X_vl)

    model_cv = NODEClassifier(X_tr_sc.shape[1], 64, len(le.classes_))
    opt = optim.Adam(model_cv.parameters(), lr=0.01)

    for _ in range(500):
        opt.zero_grad()
        loss = criterion(model_cv(torch.tensor(X_tr_sc, dtype=torch.float32)),
                       torch.tensor(y_tr, dtype=torch.long))
        loss.backward()
        opt.step()

    with torch.no_grad():
        pred = torch.argmax(model_cv(torch.tensor(X_vl_sc, dtype=torch.float32)), dim=1)
        cv_scores.append(roc_auc_score(y_vl, pred.numpy()))

print(f"Результаты для Pension:\n"
      f"1. ROC-AUC без кроссвалидации: {roc_without_cv:.4f}\n"
      f"2. ROC-AUC с кроссвалидацией: {np.mean(cv_scores):.4f} (±{np.std(cv_scores):.4f})")

Результаты для Pension:
1. ROC-AUC без кроссвалидации: 0.5957
2. ROC-AUC с кроссвалидацией: 0.5860 (±0.0061)
