In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import os

# ==============================================================================
# 1. LOAD DATA & APPLY FEATURE ENGINEERING
# ==============================================================================
print("Loading Data...")
# Read the labelled training table and the unlabelled test table from the
# working directory.
train_df, test_df = pd.read_csv('train_updated.csv'), pd.read_csv('test_updated.csv')
# Set the identifier column aside as a one-column frame; it is needed again
# when the submission file is assembled.
test_ids = test_df.loc[:, ['ProfileID']]

def create_features(df):
    """
    Return a copy of ``df`` extended with four engineered columns.

    The input frame is left untouched. It must provide the columns
    RequestedSum, AnnualEarnings, WorkDuration, OfferRate, RepayPeriod,
    TrustMetric and ActiveAccounts.

    Added columns (appended in this order):
        Loan_to_Income   -- RequestedSum / (AnnualEarnings + 1)
        Income_Stability -- AnnualEarnings / (WorkDuration + 1)
        Monthly_Burden   -- RequestedSum * (1 + OfferRate / 100) / RepayPeriod
        Trust_x_Accounts -- TrustMetric * (ActiveAccounts + 1)
    """
    requested = df['RequestedSum']
    earnings = df['AnnualEarnings']

    # Rough simple-interest estimate of the total amount to be repaid.
    repay_with_interest = requested * (1 + df['OfferRate'] / 100)

    # The "+ 1" in the denominators/multiplier keeps zero-valued inputs from
    # producing a division by zero or wiping out the interaction term.
    return df.assign(
        Loan_to_Income=requested / (earnings + 1),
        Income_Stability=earnings / (df['WorkDuration'] + 1),
        Monthly_Burden=repay_with_interest / df['RepayPeriod'],
        Trust_x_Accounts=df['TrustMetric'] * (df['ActiveAccounts'] + 1),
    )

print("Engineering Features...")
# Run the shared feature helper over both tables so they carry the same
# engineered columns.
train_df, test_df = create_features(train_df), create_features(test_df)

# Pull out the label vector first, then strip the label and identifier
# columns so that only model inputs remain.
y = train_df['RiskFlag'].values
train_X_raw = train_df.drop(columns=['RiskFlag', 'ProfileID'])
test_X_raw = test_df.drop(columns=['ProfileID'])

# ==============================================================================
# 2. PREPROCESSING
# ==============================================================================
print("Preprocessing...")

cat_cols = train_X_raw.select_dtypes(include=['object']).columns
num_cols = train_X_raw.select_dtypes(exclude=['object']).columns

# Hold out the validation rows BEFORE fitting any preprocessing step.
# Fitting the scaler/encoder on the full training table would let the
# validation rows influence the means, variances and category lists, which
# leaks information into the validation score.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    train_X_raw, y, test_size=0.2, stratify=y, random_state=42
)

# SVMs are strictly distance-based, so Scaling is MANDATORY.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), cat_cols)
    ],
    verbose_feature_names_out=False
)

# Learn the preprocessing statistics from the training portion only, then
# apply the fitted transformer unchanged to every other set of rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_val = preprocessor.transform(X_val_raw)
X_test = preprocessor.transform(test_X_raw)

# Full transformed training matrix; kept so the shape report below (and any
# later code that refers to `X`) continues to work.
X = preprocessor.transform(train_X_raw)

print(f"Data Shape: {X.shape}")

# ==============================================================================
# 3. TRAIN LINEAR SVM
# ==============================================================================
print("Training Linear SVM...")

# Initialize Model
# dual=False is preferred when n_samples > n_features for performance
model = LinearSVC(
    dual=False,
    random_state=42,
    C=1.0,           # Standard regularization (No tuning as requested)
    max_iter=3000    # Increased to ensure convergence on this dataset size
)

# Fit Model (training portion only; validation rows stay unseen)
model.fit(X_train, y_train)

# ==============================================================================
# 4. EVALUATION
# ==============================================================================
print("Evaluating...")
val_preds = model.predict(X_val)
acc = accuracy_score(y_val, val_preds)

# NOTE(review): in the run recorded with this notebook, class 1 has a recall
# of only 0.03, so the headline accuracy mostly reflects the majority class.
# Read the per-class report below rather than accuracy alone.
print(f"\nValidation Accuracy: {acc:.5f}")
print("\nClassification Report:")
print(classification_report(y_val, val_preds))

# ==============================================================================
# 5. SUBMISSION GENERATION
# ==============================================================================
print("Generating Submission...")

test_preds = model.predict(X_test)

# Pair each test identifier with its predicted label, in the original row order.
submission = pd.DataFrame({
    'ProfileID': test_ids['ProfileID'],
    'RiskFlag': test_preds
})

filename = 'submission_svm.csv'
submission.to_csv(filename, index=False)
print(f"Success! Submission saved to '{filename}'")

Loading Data...
Engineering Features...
Preprocessing...
Data Shape: (204277, 35)
Training Linear SVM...
Evaluating...

Validation Accuracy: 0.88575

Classification Report:
              precision    recall  f1-score   support

           0       0.89      1.00      0.94     36105
           1       0.75      0.03      0.05      4751

    accuracy                           0.89     40856
   macro avg       0.82      0.51      0.49     40856
weighted avg       0.87      0.89      0.84     40856

Generating Submission...
Success! Submission saved to 'submission_svm.csv'
