In [1]:
# =============================
# A1. IMPORTS
# =============================
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.preprocessing import PowerTransformer


In [2]:
# =============================
# B2. LOAD DATA (fresh copy for skew pipeline)
# =============================
# TRAIN_PROC = "/kaggle/input/processed-financial-risk/train_processed.csv"
# TEST_PROC  = "/kaggle/input/processed-financial-risk/test_processed.csv"
# SAMPLE_SUB = "/kaggle/input/financial-risk-profiling/sample_submission_updated.csv"

train = pd.read_csv("/kaggle/input/processed-financial-risk/train_processed.csv")
test  = pd.read_csv("/kaggle/input/processed-financial-risk/test_processed.csv")
sample = pd.read_csv("/kaggle/input/financial-risk-profiling/sample_submission_updated.csv")

X = train.drop(columns=["ProfileID","RiskFlag"])
y = train["RiskFlag"]
X_test = test.drop(columns=["ProfileID"])

numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
print("Numeric cols:", numeric_cols)
print("\nClass distribution:", y.value_counts())


Numeric cols: ['orig_index', 'ApplicantYears', 'AnnualEarnings', 'RequestedSum', 'TrustMetric', 'WorkDuration', 'ActiveAccounts', 'OfferRate', 'RepayPeriod', 'DebtFactor', 'QualificationLevel_High School', "QualificationLevel_Master's", 'QualificationLevel_PhD', 'WorkCategory_Part-time', 'WorkCategory_Self-employed', 'WorkCategory_Unemployed', 'RelationshipStatus_Married', 'RelationshipStatus_Single', 'OwnsProperty_Yes', 'FamilyObligation_Yes', 'FundUseCase_Business', 'FundUseCase_Education', 'FundUseCase_Home', 'FundUseCase_Other', 'JointApplicant_Yes']

Class distribution: RiskFlag
0    180524
1     23753
Name: count, dtype: int64


In [3]:
# =============================
# B3. SKEW DETECTION & POWER TRANSFORM
# =============================
skews = X[numeric_cols].skew().abs().sort_values(ascending=False)
print("Top skewed numeric features:\n", skews.head(20))

# Choose threshold for skewness
SKEW_THRESH = 0.8
skewed_features = skews[skews > SKEW_THRESH].index.tolist()
print("\nFeatures to power-transform (abs(skew) > 0.8):", skewed_features)

if skewed_features:
    pt = PowerTransformer(method="yeo-johnson")
    X[skewed_features] = pt.fit_transform(X[skewed_features])
    X_test[skewed_features] = pt.transform(X_test[skewed_features])
    print("[B3] Applied PowerTransformer to skewed features.")
else:
    print("[B3] No features exceeded skew threshold; no transform applied.")


Top skewed numeric features:
 FundUseCase_Other                 1.501021
FundUseCase_Education             1.500026
FundUseCase_Home                  1.499147
FundUseCase_Business              1.495101
QualificationLevel_Master's       1.163818
WorkCategory_Self-employed        1.159759
QualificationLevel_PhD            1.157402
WorkCategory_Unemployed           1.156979
QualificationLevel_High School    1.155410
WorkCategory_Part-time            1.142974
RelationshipStatus_Single         0.711072
RelationshipStatus_Married        0.704201
TrustMetric                       0.005914
OfferRate                         0.005100
DebtFactor                        0.002946
RequestedSum                      0.002052
ActiveAccounts                    0.001910
ApplicantYears                    0.001385
WorkDuration                      0.001308
JointApplicant_Yes                0.001126
dtype: float64

Features to power-transform (abs(skew) > 0.8): ['FundUseCase_Other', 'FundUseCase_Education', 

In [4]:
# =============================
# B4. SCALE (after power transform)
# =============================
scaler_b = StandardScaler()
X[numeric_cols] = scaler_b.fit_transform(X[numeric_cols])
X_test[numeric_cols] = scaler_b.transform(X_test[numeric_cols])

print("[B4] Scaling complete.")


[B4] Scaling complete.


In [5]:
# =============================
# B5. STRATIFIED SPLIT
# =============================
X_train_b, X_val_b, y_train_b, y_val_b = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("y_train_b:\n", y_train_b.value_counts())
print("y_val_b:\n", y_val_b.value_counts())


y_train_b:
 RiskFlag
0    144419
1     19002
Name: count, dtype: int64
y_val_b:
 RiskFlag
0    36105
1     4751
Name: count, dtype: int64


In [6]:
# =============================
# B6. TRAIN LOGREG (WITH SKEW REDUCTION)
# =============================
logreg_sk = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
    class_weight="balanced",
    n_jobs=-1,
    random_state=42
)
logreg_sk.fit(X_train_b, y_train_b)
print("Trained logistic regression (with skew reduction).")


Trained logistic regression (with skew reduction).


In [7]:
# =============================
# B7. EVALUATE (WITH SKEW)
# =============================
yv_pred_b = logreg_sk.predict(X_val_b)
yv_proba_b = logreg_sk.predict_proba(X_val_b)[:, 1]

print("Accuracy:", accuracy_score(y_val_b, yv_pred_b))
print("ROC AUC:", roc_auc_score(y_val_b, yv_proba_b))
print("\nClassification Report:\n", classification_report(y_val_b, yv_pred_b))
print("\nConfusion Matrix:\n", confusion_matrix(y_val_b, yv_pred_b))


Accuracy: 0.671455844918739
ROC AUC: 0.7465614612260582

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.67      0.78     36105
           1       0.22      0.69      0.33      4751

    accuracy                           0.67     40856
   macro avg       0.58      0.68      0.56     40856
weighted avg       0.86      0.67      0.73     40856


Confusion Matrix:
 [[24139 11966]
 [ 1457  3294]]


In [8]:
# =============================
# T1. THRESHOLD TUNING (WITH SKEW PIPELINE)
# =============================
from sklearn.metrics import precision_recall_curve, auc

probs_b = yv_proba_b   # <-- use skew pipeline predicted probabilities
precision_b, recall_b, thresholds_b = precision_recall_curve(y_val_b, probs_b)

# compute F1 at each threshold
f1_scores_b = 2 * (precision_b * recall_b) / (precision_b + recall_b + 1e-12)
best_idx_b = np.argmax(f1_scores_b[:-1])  # ignore last nan threshold
best_thr_b = thresholds_b[best_idx_b]

print("\n===== THRESHOLD TUNING (WITH SKEW) =====")
print("Best F1 threshold:", best_thr_b)
print("Precision:", precision_b[best_idx_b])
print("Recall   :", recall_b[best_idx_b])
print("F1 Score :", f1_scores_b[best_idx_b])


# =============================
# T2. CALIBRATION (WITH SKEW PIPELINE)
# =============================
from sklearn.calibration import CalibratedClassifierCV

print("\n===== CALIBRATION (WITH SKEW) =====")

# We calibrate the trained logistic regression (logreg_sk)
calibrator_b = CalibratedClassifierCV(logreg_sk, cv=3, method="sigmoid")
calibrator_b.fit(X_train_b, y_train_b)

cal_proba_b = calibrator_b.predict_proba(X_val_b)[:, 1]
print("Calibrated ROC AUC:", roc_auc_score(y_val_b, cal_proba_b))



===== THRESHOLD TUNING (WITH SKEW) =====
Best F1 threshold: 0.6282070059014074
Precision: 0.28700271806276256
Recall   : 0.4889496948010945
F1 Score : 0.36169715842693756

===== CALIBRATION (WITH SKEW) =====
Calibrated ROC AUC: 0.7465602078364773


In [9]:
# =============================
# B8. RETRAIN FULL & PREDICT (WITH SKEW)
# =============================
final_sk = LogisticRegression(
    max_iter=1000,
    solver="lbfgs",
    class_weight="balanced",
    n_jobs=-1,
    random_state=42
)

# Transform full X using the same scaler/powertransform pipeline:
X_full_b = X.copy()  # X here is already transformed and scaled for the train portion
# Note: X_test is already transformed and scaled above

final_sk.fit(X_full_b, y)

test_proba_sk = final_sk.predict_proba(X_test)[:, 1]
test_pred_sk  = (test_proba_sk >= 0.5).astype(int)

print("Sample probabilities (skew):", test_proba_sk[:5])
print("Sample binary preds (skew):", test_pred_sk[:5])


Sample probabilities (skew): [0.19311441 0.27738883 0.39763814 0.48712687 0.55212333]
Sample binary preds (skew): [0 0 0 0 1]


In [10]:
# =============================
# B9. SAVE SUBMISSIONS (WITH SKEW)
# =============================
sub_proba_sk = sample.copy()
sub_proba_sk["RiskFlag"] = test_proba_sk
sub_proba_sk.to_csv("logreg_submission_with_skew_proba.csv", index=False)

sub_bin_sk = sample.copy()
sub_bin_sk["RiskFlag"] = test_pred_sk
sub_bin_sk.to_csv("logreg_submission_with_skew_binary.csv", index=False)

print("Saved logreg_submission_with_skew_proba.csv")
print("Saved logreg_submission_with_skew_binary.csv")


Saved logreg_submission_with_skew_proba.csv
Saved logreg_submission_with_skew_binary.csv
