In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, callbacks
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os

# ==============================================================================
#  HARDCODED WEIGHTS (CHANGE THESE TO TUNE)
# ==============================================================================
# The sum should ideally be 1.0, but the code handles it regardless.
# Since NN and LogReg gave slightly better scores (0.887), we give them more weight.
WEIGHT_NN  = 0.40
WEIGHT_LR  = 0.40
WEIGHT_SVM = 0.20

print(f"Ensemble Configuration:")
print(f"  Neural Network: {WEIGHT_NN}")
print(f"  Logistic Reg:   {WEIGHT_LR}")
print(f"  SVM:            {WEIGHT_SVM}")

# ==============================================================================
# 1. LOAD & FEATURE ENGINEERING
# ==============================================================================
print("\n[1/5] Loading and Engineering Data...")

train_df = pd.read_csv('train_updated.csv')
test_df = pd.read_csv('test_updated.csv')
test_ids = test_df[['ProfileID']]

def create_features(df):
    df = df.copy()
    df['Loan_to_Income'] = df['RequestedSum'] / (df['AnnualEarnings'] + 1)
    df['Income_Stability'] = df['AnnualEarnings'] / (df['WorkDuration'] + 1)
    total_repay = df['RequestedSum'] * (1 + df['OfferRate'] / 100)
    df['Monthly_Burden'] = total_repay / df['RepayPeriod']
    df['Trust_x_Accounts'] = df['TrustMetric'] * (df['ActiveAccounts'] + 1)
    return df

train_df = create_features(train_df)
test_df = create_features(test_df)

y = train_df['RiskFlag'].values
train_X_raw = train_df.drop(['RiskFlag', 'ProfileID'], axis=1)
test_X_raw = test_df.drop(['ProfileID'], axis=1)

# ==============================================================================
# 2. PREPROCESSING
# ==============================================================================
print("[2/5] Preprocessing...")

cat_cols = train_X_raw.select_dtypes(include=['object']).columns
num_cols = train_X_raw.select_dtypes(exclude=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ],
    verbose_feature_names_out=False
)

X = preprocessor.fit_transform(train_X_raw)
X_test = preprocessor.transform(test_X_raw)

# Split for validation (to check if ensemble improves score)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

# ==============================================================================
# 3. TRAIN MODELS
# ==============================================================================

# --- Model A: Neural Network ---
print("\n[3/5] Training Neural Network...")
def build_nn():
    model = keras.Sequential([
        layers.InputLayer(input_shape=(X.shape[1],)),
        layers.Dense(256, activation='swish'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(128, activation='swish'),
        layers.BatchNormalization(),
        layers.Dropout(0.3),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

model_nn = build_nn()
early_stop = callbacks.EarlyStopping(monitor='val_accuracy', patience=10, restore_best_weights=True)

model_nn.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=1024,
    callbacks=[early_stop],
    verbose=0 
)
# Get probabilities
prob_nn_val = model_nn.predict(X_val, verbose=0).flatten()
prob_nn_test = model_nn.predict(X_test, verbose=0).flatten()
print(f"  NN Validation Accuracy: {accuracy_score(y_val, (prob_nn_val > 0.5).astype(int)):.5f}")


# --- Model B: Logistic Regression ---
print("\n[4/5] Training Logistic Regression...")
# Uses CPU (sklearn) for simplicity and stability in ensemble
model_lr = LogisticRegression(max_iter=1000, random_state=42, solver='lbfgs')
model_lr.fit(X_train, y_train)

# Get probabilities
prob_lr_val = model_lr.predict_proba(X_val)[:, 1]
prob_lr_test = model_lr.predict_proba(X_test)[:, 1]
print(f"  LogReg Validation Accuracy: {accuracy_score(y_val, (prob_lr_val > 0.5).astype(int)):.5f}")


# --- Model C: SVM (Calibrated) ---
print("\n[5/5] Training SVM (Calibrated)...")
# LinearSVC doesn't output probabilities, so we wrap it in CalibratedClassifierCV
linear_svc = LinearSVC(dual=False, random_state=42, C=1.0)
model_svm = CalibratedClassifierCV(linear_svc, method='sigmoid', cv=3)
model_svm.fit(X_train, y_train)

# Get probabilities
prob_svm_val = model_svm.predict_proba(X_val)[:, 1]
prob_svm_test = model_svm.predict_proba(X_test)[:, 1]
print(f"  SVM Validation Accuracy: {accuracy_score(y_val, (prob_svm_val > 0.5).astype(int)):.5f}")


# ==============================================================================
# 4. ENSEMBLE AGGREGATION
# ==============================================================================
print("\n--- Calculating Weighted Ensemble ---")

# Combine Probabilities (Weighted Average)
ensemble_prob_val = (
    (prob_nn_val * WEIGHT_NN) + 
    (prob_lr_val * WEIGHT_LR) + 
    (prob_svm_val * WEIGHT_SVM)
)

ensemble_prob_test = (
    (prob_nn_test * WEIGHT_NN) + 
    (prob_lr_test * WEIGHT_LR) + 
    (prob_svm_test * WEIGHT_SVM)
)

# Thresholding
ensemble_pred_val = (ensemble_prob_val > 0.5).astype(int)
ensemble_pred_test = (ensemble_prob_test > 0.5).astype(int)

# Check if Ensemble actually helped on Validation set
acc_ensemble = accuracy_score(y_val, ensemble_pred_val)
print(f"Ensemble Validation Accuracy: {acc_ensemble:.5f}")

# ==============================================================================
# 5. SAVE SUBMISSION
# ==============================================================================
submission = pd.DataFrame({
    'ProfileID': test_ids['ProfileID'],
    'RiskFlag': ensemble_pred_test
})

filename = 'submission_weighted_ensemble.csv'
submission.to_csv(filename, index=False)
print(f"\nSuccess! Saved to '{filename}'")

Ensemble Configuration:
  Neural Network: 0.4
  Logistic Reg:   0.4
  SVM:            0.2

[1/5] Loading and Engineering Data...
[2/5] Preprocessing...

[3/5] Training Neural Network...




  NN Validation Accuracy: 0.88854

[4/5] Training Logistic Regression...
  LogReg Validation Accuracy: 0.88785

[5/5] Training SVM (Calibrated)...
  SVM Validation Accuracy: 0.88658

--- Calculating Weighted Ensemble ---
Ensemble Validation Accuracy: 0.88751

Success! Saved to 'submission_weighted_ensemble.csv'
