In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# ==============================================================================
# 1. LOAD DATA & FEATURE ENGINEERING
# ==============================================================================
print("Loading and Engineering Data...")

train_df = pd.read_csv('train_updated.csv')
test_df = pd.read_csv('test_updated.csv')
test_ids = test_df[['ProfileID']]

def create_features(df):
    df = df.copy()
    # 1. Loan to Income Ratio
    df['Loan_to_Income'] = df['RequestedSum'] / (df['AnnualEarnings'] + 1)
    
    # 2. Income Stability
    df['Income_Stability'] = df['AnnualEarnings'] / (df['WorkDuration'] + 1)
    
    # 3. Monthly Burden
    total_repay = df['RequestedSum'] * (1 + df['OfferRate'] / 100)
    df['Monthly_Burden'] = total_repay / df['RepayPeriod']
    
    # 4. Trust vs Accounts Interaction
    df['Trust_x_Accounts'] = df['TrustMetric'] * (df['ActiveAccounts'] + 1)
    
    return df

# Apply Feature Engineering
train_df = create_features(train_df)
test_df = create_features(test_df)

# Separate Target and Features
y = train_df['RiskFlag'].values
train_X_raw = train_df.drop(['RiskFlag', 'ProfileID'], axis=1)
test_X_raw = test_df.drop(['ProfileID'], axis=1)

# ==============================================================================
# 2. PREPROCESSING
# ==============================================================================
print("Preprocessing...")

cat_cols = train_X_raw.select_dtypes(include=['object']).columns
num_cols = train_X_raw.select_dtypes(exclude=['object']).columns

# Create Transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ],
    verbose_feature_names_out=False
)

# Transform Data
X = preprocessor.fit_transform(train_X_raw)
X_test = preprocessor.transform(test_X_raw)

print(f"Data Shape: {X.shape}")

# ==============================================================================
# 3. TRAIN LOGISTIC REGRESSION
# ==============================================================================
print("Training Logistic Regression...")

# Split for validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Initialize Model
# Note: We use default class_weights to maximize Accuracy (similar to the NN approach)
# If you wanted to catch more fraud at the cost of accuracy, you would use class_weight='balanced'
model = LogisticRegression(
    max_iter=2000,       # Increased to ensure convergence
    solver='lbfgs',      # Standard robust solver
    random_state=42
)

# Fit Model
model.fit(X_train, y_train)

# ==============================================================================
# 4. EVALUATION
# ==============================================================================
val_preds = model.predict(X_val)
acc = accuracy_score(y_val, val_preds)

print(f"\nValidation Accuracy: {acc:.5f}")
print("\nClassification Report:")
print(classification_report(y_val, val_preds))

# ==============================================================================
# 5. SUBMISSION
# ==============================================================================
print("Generating Submission...")

test_preds = model.predict(X_test)

submission = pd.DataFrame({
    'ProfileID': test_ids['ProfileID'],
    'RiskFlag': test_preds
})

filename = 'submission_logistic_regression.csv'
submission.to_csv(filename, index=False)
print(f"Success! Submission saved to '{filename}'")

Loading and Engineering Data...
Preprocessing...
Data Shape: (204277, 35)
Training Logistic Regression...

Validation Accuracy: 0.88665

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.99      0.94     36105
           1       0.60      0.07      0.13      4751

    accuracy                           0.89     40856
   macro avg       0.75      0.53      0.54     40856
weighted avg       0.86      0.89      0.85     40856

Generating Submission...
Success! Submission saved to 'submission_logistic_regression.csv'


In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import accuracy_score
import optuna
import os

# Enable GPU memory growth (prevents crashes)
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print("GPU Detected: Running in Fast Mode")
else:
    print("No GPU found. Running on CPU (Slower)")

# ==============================================================================
# 1. LOAD & ENGINEER DATA
# ==============================================================================
print("Loading Data...")
train_df = pd.read_csv('train_updated.csv')
test_df = pd.read_csv('test_updated.csv')
test_ids = test_df[['ProfileID']]

def create_features(df):
    df = df.copy()
    df['Loan_to_Income'] = df['RequestedSum'] / (df['AnnualEarnings'] + 1)
    df['Income_Stability'] = df['AnnualEarnings'] / (df['WorkDuration'] + 1)
    total_repay = df['RequestedSum'] * (1 + df['OfferRate'] / 100)
    df['Monthly_Burden'] = total_repay / df['RepayPeriod']
    df['Trust_x_Accounts'] = df['TrustMetric'] * (df['ActiveAccounts'] + 1)
    return df

train_df = create_features(train_df)
test_df = create_features(test_df)

y = train_df['RiskFlag'].values
train_X_raw = train_df.drop(['RiskFlag', 'ProfileID'], axis=1)
test_X_raw = test_df.drop(['ProfileID'], axis=1)

# ==============================================================================
# 2. PREPROCESSING
# ==============================================================================
cat_cols = train_X_raw.select_dtypes(include=['object']).columns
num_cols = train_X_raw.select_dtypes(exclude=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ],
    verbose_feature_names_out=False
)

X = preprocessor.fit_transform(train_X_raw)
X_test = preprocessor.transform(test_X_raw)

# ==============================================================================
# 3. OPTUNA TUNING (GPU LOGISTIC REGRESSION)
# ==============================================================================
def objective(trial):
    # --- Tune Hyperparameters ---
    # L2 Regularization (Inverse of 'C' in sklearn)
    l2_reg = trial.suggest_float('l2_reg', 1e-6, 1e-1, log=True)
    lr = trial.suggest_float('lr', 1e-4, 1e-1, log=True)
    batch_size = trial.suggest_categorical('batch_size', [512, 1024, 2048])
    
    # Class Weight (To beat 0.887, we tune this carefully)
    pos_weight = trial.suggest_float('pos_weight', 0.5, 4.0)

    # --- Build Logistic Regression (Single Layer NN) ---
    model = keras.Sequential([
        layers.InputLayer(input_shape=(X.shape[1],)),
        
        # Dense(1) with Sigmoid IS Logistic Regression
        layers.Dense(1, activation='sigmoid', 
                     kernel_regularizer=regularizers.l2(l2_reg)) 
    ])
    
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    # Split for tuning
    X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    
    # Pruning Callback
    pruning_callback = optuna.integration.TFKerasPruningCallback(trial, 'val_accuracy')
    
    history = model.fit(
        X_tr, y_tr,
        validation_data=(X_val, y_val),
        epochs=30, # LogReg needs more epochs than Deep NN to converge
        batch_size=batch_size,
        class_weight={0: 1.0, 1: pos_weight},
        callbacks=[pruning_callback],
        verbose=0
    )
    
    return max(history.history['val_accuracy'])

print("\n--- Starting Optuna Tuning (GPU) ---")
study = optuna.create_study(direction='maximize', pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=30) 

print("Best Params:", study.best_params)

# ==============================================================================
# 4. FINAL TRAINING & SUBMISSION
# ==============================================================================
print("\n--- Training Final Logistic Model ---")

best = study.best_params

# Rebuild Best Model
model = keras.Sequential([
    layers.InputLayer(input_shape=(X.shape[1],)),
    layers.Dense(1, activation='sigmoid', 
                 kernel_regularizer=regularizers.l2(best['l2_reg']))
])

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=best['lr']),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Train on Full Data (with a validation split for monitoring)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

early_stop = callbacks.EarlyStopping(monitor='val_accuracy', patience=15, restore_best_weights=True)

model.fit(
    X_tr, y_tr,
    validation_data=(X_val, y_val),
    epochs=150, # Allow it time to converge
    batch_size=best['batch_size'],
    class_weight={0: 1.0, 1: best['pos_weight']},
    callbacks=[early_stop],
    verbose=0
)

# Predict
val_acc = model.evaluate(X_val, y_val, verbose=0)[1]
print(f"Final Validation Accuracy: {val_acc:.5f}")

test_preds = (model.predict(X_test) > 0.5).astype(int)

submission = pd.DataFrame({
    'ProfileID': test_ids['ProfileID'],
    'RiskFlag': test_preds.flatten()
})

submission.to_csv('submission_gpu_logistic_tuned.csv', index=False)
print("Saved: submission_gpu_logistic_tuned.csv")

  from .autonotebook import tqdm as notebook_tqdm


No GPU found. Running on CPU (Slower)
Loading Data...


[I 2025-11-27 17:08:59,414] A new study created in memory with name: no-name-3224fdca-97e5-41a3-9c90-7bdf2aa25668



--- Starting Optuna Tuning (GPU) ---


[I 2025-11-27 17:09:11,430] Trial 0 finished with value: 0.8320931792259216 and parameters: {'l2_reg': 0.00042372091494208454, 'lr': 0.0032002127239405395, 'batch_size': 1024, 'pos_weight': 3.9319550813902517}. Best is trial 0 with value: 0.8320931792259216.
[I 2025-11-27 17:09:19,516] Trial 1 finished with value: 0.8606079816818237 and parameters: {'l2_reg': 2.277349793735707e-05, 'lr': 0.004121865470915753, 'batch_size': 2048, 'pos_weight': 3.2081108785702406}. Best is trial 1 with value: 0.8606079816818237.
[I 2025-11-27 17:09:27,604] Trial 2 finished with value: 0.8681955933570862 and parameters: {'l2_reg': 2.2351053461703003e-06, 'lr': 0.00047707013575939554, 'batch_size': 2048, 'pos_weight': 2.652086885434692}. Best is trial 2 with value: 0.8681955933570862.
[I 2025-11-27 17:09:35,846] Trial 3 finished with value: 0.8335617780685425 and parameters: {'l2_reg': 0.031913116304582434, 'lr': 0.00010383402738107353, 'batch_size': 2048, 'pos_weight': 3.3467770233268395}. Best is trial 2

Best Params: {'l2_reg': 0.005266989217602594, 'lr': 0.010562998856748962, 'batch_size': 2048, 'pos_weight': 1.087097851134728}

--- Training Final Logistic Model ---


NameError: name 'callbacks' is not defined