In [1]:
# =============================
# 1. IMPORTS
# =============================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, roc_auc_score, 
    classification_report, confusion_matrix
)

In [2]:
# # =============================
# # 2. LOAD PREPROCESSED FILES
# # =============================
# X = pd.read_csv("/kaggle/input/processed-financial-risk/X_processed.csv")
# y = pd.read_csv("/kaggle/input/processed-financial-risk/y.csv").iloc[:, 0]   # flatten to Series
# test_processed = pd.read_csv("/kaggle/input/processed-financial-risk/test_processed.csv")

# =============================
# 2. LOAD PROCESSED TRAIN & TEST
# =============================
train = pd.read_csv("/kaggle/input/processed-financial-risk/train_processed.csv")
test  = pd.read_csv("/kaggle/input/processed-financial-risk/test_processed.csv")

print("Train shape:", train.shape)
print("Test shape :", test.shape)

print("\nColumns:", train.columns.tolist())



Train shape: (204277, 27)
Test shape : (51070, 26)

Columns: ['ProfileID', 'orig_index', 'ApplicantYears', 'AnnualEarnings', 'RequestedSum', 'TrustMetric', 'WorkDuration', 'ActiveAccounts', 'OfferRate', 'RepayPeriod', 'DebtFactor', 'QualificationLevel_High School', "QualificationLevel_Master's", 'QualificationLevel_PhD', 'WorkCategory_Part-time', 'WorkCategory_Self-employed', 'WorkCategory_Unemployed', 'RelationshipStatus_Married', 'RelationshipStatus_Single', 'OwnsProperty_Yes', 'FamilyObligation_Yes', 'FundUseCase_Business', 'FundUseCase_Education', 'FundUseCase_Home', 'FundUseCase_Other', 'JointApplicant_Yes', 'RiskFlag']


In [3]:
# =============================
# 3. SPLIT FEATURES & TARGET
# =============================
TARGET = "RiskFlag"
IDCOL = "ProfileID"

X = train.drop(columns=[TARGET, IDCOL])
y = train[TARGET]

print("X:", X.shape)
print("y:", y.shape)

# Quick debug
print("\n[DEBUG] y class distribution:")
print(y.value_counts())


X: (204277, 25)
y: (204277,)

[DEBUG] y class distribution:
RiskFlag
0    180524
1     23753
Name: count, dtype: int64


In [4]:
# =============================
# 4. STRATIFIED TRAIN/VALIDATION SPLIT
# =============================
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("y_train distribution:\n", y_train.value_counts())
print("\ny_val distribution:\n", y_val.value_counts())


y_train distribution:
 RiskFlag
0    144419
1     19002
Name: count, dtype: int64

y_val distribution:
 RiskFlag
0    36105
1     4751
Name: count, dtype: int64


In [5]:
# =============================
# 5. TRAIN LOGISTIC REGRESSION
# =============================
log_reg = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
    class_weight="balanced",
    n_jobs=-1,
    random_state=42
)

log_reg.fit(X_train, y_train)

print("Model trained successfully.")


Model trained successfully.


In [6]:
# =============================
# 6. VALIDATION METRICS
# =============================
y_val_pred  = log_reg.predict(X_val)
y_val_proba = log_reg.predict_proba(X_val)[:, 1]

print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("ROC AUC:", roc_auc_score(y_val, y_val_proba))

print("\nClassification Report:\n", classification_report(y_val, y_val_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_val, y_val_pred))


Accuracy: 0.8837135304484042
ROC AUC: 0.5051484142974907

Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.94     36105
           1       0.00      0.00      0.00      4751

    accuracy                           0.88     40856
   macro avg       0.44      0.50      0.47     40856
weighted avg       0.78      0.88      0.83     40856


Confusion Matrix:
 [[36105     0]
 [ 4751     0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# =============================
# 7. FINAL MODEL TRAINING (ALL DATA)
# =============================
final_model = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
    class_weight="balanced",
    n_jobs=-1,
    random_state=42
)

final_model.fit(X, y)
print("Final model trained on all data.")


Final model trained on all data.


In [8]:
# =============================
# 8. PREDICT ON TEST SET
# =============================
X_test = test.drop(columns=[IDCOL])  # exclude ProfileID
test_proba = final_model.predict_proba(X_test)[:, 1]
test_pred  = (test_proba >= 0.5).astype(int)

print(test_proba[:5])
print(test_pred[:5])


[0.5        0.49999999 0.49999998 0.49999996 0.49999995]
[0 0 0 0 0]


In [9]:
# =============================
# 9. CREATE SUBMISSION FILE
# =============================
sample_path = "/kaggle/input/financial-risk-profiling/sample_submission_updated.csv"
sample = pd.read_csv(sample_path)

print("Sample submission structure:")
print(sample.head())

# Probability submission (many competitions allow probability)
sub_proba = sample.copy()
sub_proba["RiskFlag"] = test_proba
sub_proba.to_csv("/kaggle/working/logreg_submission_proba.csv", index=False)

# Binary (0/1) submission
sub_bin = sample.copy()
sub_bin["RiskFlag"] = test_pred
sub_bin.to_csv("/kaggle/working/logreg_submission_binary.csv", index=False)

print("Saved:")
print("/kaggle/working/logreg_submission_proba.csv")
print("/kaggle/working/logreg_submission_binary.csv")


Sample submission structure:
    ProfileID  RiskFlag
0  CKV34LU7V7         1
1  62KTYNH93J         1
2  JGFUSOIUH7         1
3  4538THBHOX         1
4  DXLNA06JHR         1
Saved:
/kaggle/working/logreg_submission_proba.csv
/kaggle/working/logreg_submission_binary.csv


In [10]:
# =============================
# 10. PREVIEW SUBMISSION
# =============================
sub_bin.head()


Unnamed: 0,ProfileID,RiskFlag
0,CKV34LU7V7,0
1,62KTYNH93J,0
2,JGFUSOIUH7,0
3,4538THBHOX,0
4,DXLNA06JHR,0
