In [4]:
import numpy as np
import pandas as pd

np.random.seed(42)
rows = 20000

age = np.random.randint(21, 60, rows)
income = np.random.randint(200000, 1500000, rows)
loan_amount = np.random.randint(50000, 800000, rows)

employment_type = np.random.choice(
    ['Salaried', 'Self-Employed', 'Unemployed'],
    rows,
    p=[0.6, 0.3, 0.1]
)

education = np.random.choice(
    ['Graduate', 'Post-Graduate', 'High School'],
    rows,
    p=[0.5, 0.3, 0.2]
)

marital_status = np.random.choice(
    ['Single', 'Married', 'Divorced'],
    rows,
    p=[0.45, 0.45, 0.10]
)

loan_purpose = np.random.choice(
    ['Home', 'Car', 'Education', 'Business', 'Personal'],
    rows
)

# Target logic (realistic, noisy)
default = (
    (income < 400000) &
    (loan_amount > income * 0.6) &
    (employment_type == 'Unemployed')
).astype(int)

# Add noise so dataset is not perfect
noise_idx = np.random.choice(rows, int(0.1 * rows), replace=False)
default[noise_idx] = 1 - default[noise_idx]

df = pd.DataFrame({
    'age': age,
    'income': income,
    'loan_amount': loan_amount,
    'employment_type': employment_type,
    'education': education,
    'marital_status': marital_status,
    'loan_purpose': loan_purpose,
    'default': default
})

df.to_csv("loan_dataset.csv", index=False)
print("loan_dataset.csv created with shape:", df.shape)


loan_dataset.csv created with shape: (20000, 8)


In [6]:
import pandas as pd
import numpy as np

df = pd.read_csv("loan_dataset.csv")
df.head()


Unnamed: 0,age,income,loan_amount,employment_type,education,marital_status,loan_purpose,default
0,59,604067,585004,Salaried,Graduate,Married,Home,0
1,49,1422329,110644,Self-Employed,Graduate,Single,Home,0
2,35,969855,699938,Salaried,Post-Graduate,Married,Education,0
3,28,1253869,792784,Salaried,High School,Married,Car,0
4,41,705365,775076,Salaried,Graduate,Single,Home,0


In [9]:
X=df.drop('default',axis=1)
y=df['default']

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score


X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y
)

In [13]:



# ‡¶ï‡ßá‡¶® stratify?

# Default usually imbalanced

# Class ratio preserve ‡¶ï‡¶∞‡¶§‡ßá

num_cols= ["age", "income", "loan_amount"]
cat_cols=X_train.select_dtypes(include="object").columns.tolist()


numeric_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

categorical_pipeline = Pipeline([
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols)
])


model_no_reg = Pipeline([
    ("preprocessing", preprocessor),
    ("model", LogisticRegression(
        penalty="none",
        solver="lbfgs",
        max_iter=1000
    ))
])

model_no_reg.fit(X_train, y_train)

train_auc = roc_auc_score(y_train, model_no_reg.predict_proba(X_train)[:,1])
test_auc = roc_auc_score(y_test, model_no_reg.predict_proba(X_test)[:,1])

print(train_auc, test_auc)

model_l2 = Pipeline([
    ("preprocessing", preprocessor),
    ("model", LogisticRegression(
        penalty="l2",
        C=0.01,
        solver="lbfgs",
        max_iter=1000
    ))
])

model_l2.fit(X_train, y_train)

train_auc_l2 = roc_auc_score(y_train, model_l2.predict_proba(X_train)[:,1])
test_auc_l2 = roc_auc_score(y_test, model_l2.predict_proba(X_test)[:,1])

print(train_auc_l2, test_auc_l2)

model_l1 = Pipeline([
    ("preprocessing", preprocessor),
    ("model", LogisticRegression(
        penalty="l1",
        C=0.1,
        solver="liblinear",
        max_iter=1000
    ))
])

model_l1.fit(X_train, y_train)

train_auc_l1 = roc_auc_score(y_train, model_l1.predict_proba(X_train)[:,1])
test_auc_l1 = roc_auc_score(y_test, model_l1.predict_proba(X_test)[:,1])

print(train_auc_l1, test_auc_l1)

feature_names = model_l1.named_steps["preprocessing"].get_feature_names_out()
coeffs = model_l1.named_steps["model"].coef_[0]

non_zero = np.sum(coeffs != 0)
print("Non-zero features:", non_zero)


from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    model_l2,
    X_train,
    y_train,
    cv=5,
    scoring="roc_auc"
)

print("CV Mean:", cv_scores.mean())
print("CV Std :", cv_scores.std())



InvalidParameterError: The 'penalty' parameter of LogisticRegression must be a str among {'l1', 'l2', 'elasticnet'} or None. Got 'none' instead.

‡¶≠‡¶æ‡¶≤‡ßã‡•§ ‡¶§‡ßÅ‡¶Æ‡¶ø **Phase-3 Day-1 code ‡¶†‡¶ø‡¶ï‡¶≠‡¶æ‡¶¨‡ßá‡¶á implement ‡¶ï‡¶∞‡ßá‡¶õ‡ßã**‡•§
‡¶è‡¶ñ‡¶® ‡¶Ü‡¶Æ‡¶ø **no-nonsense evaluator mode-‡¶è** ‡¶Ø‡¶æ‡¶ö‡ßç‡¶õ‡¶ø‡•§

‡¶Ü‡¶Æ‡¶ø ‡ß©‡¶ü‡¶æ ‡¶ï‡¶æ‡¶ú ‡¶ï‡¶∞‡¶¨:

1. üîç **Code review (engineering-level)**
2. üß† **Result ‡¶ï‡ßÄ ‡¶Ü‡¶∏‡¶æ‡¶∞ ‡¶ï‡¶•‡¶æ + ‡¶ï‡ßÄ‡¶≠‡¶æ‡¶¨‡ßá interpret ‡¶ï‡¶∞‡¶¨‡ßá**
3. ‚ùå **Common thinking mistakes + corrections**
4. ‚úÖ **Final verdict: pass / partial / fix-needed**

> ‚ö†Ô∏è ‡¶§‡ßÅ‡¶Æ‡¶ø output paste ‡¶ï‡¶∞‡ßã‡¶®‡¶ø, ‡¶§‡¶æ‡¶á ‡¶Ü‡¶Æ‡¶ø **expected behavior + diagnosis logic** ‡¶¶‡¶ø‡¶ö‡ßç‡¶õ‡¶ø‡•§
> Output ‡¶¶‡¶ø‡¶≤‡ßá ‡¶Ü‡¶Æ‡¶ø exact judgement ‡¶¶‡ßá‡¶¨‡•§

---

## 1Ô∏è‚É£ Code Review ‚Äî Strict

### ‚úÖ What you did RIGHT (important)

* `train_test_split` **before preprocessing** ‚Üí no leakage ‚úîÔ∏è
* `stratify=y` ‚Üí imbalanced target handled ‚úîÔ∏è
* `Pipeline + ColumnTransformer` ‚Üí production-safe ‚úîÔ∏è
* Same preprocessing for all models ‚Üí fair comparison ‚úîÔ∏è
* Separate **no-reg / L2 / L1** ‚Üí correct experiment design ‚úîÔ∏è
* ROC-AUC used (not accuracy) ‚Üí correct metric ‚úîÔ∏è
* CV done on **pipeline**, not raw data ‚Üí big plus ‚úîÔ∏è

üëâ From an interview perspective:
**This code alone already puts you above average candidates.**

---

### ‚ö†Ô∏è Minor technical issue (but I must call it out)

You used:

```python
non_zero = np.sum(coeffs != 0)
```

But I don‚Äôt see:

```python
import numpy as np
```

If this ran, it means you imported it earlier.
If not ‚Üí **NameError**.

üëâ In interview or production, this kind of miss is a red flag.
Be clean.

---

## 2Ô∏è‚É£ Expected Results & Interpretation (THIS IS THE REAL TEST)

### üîπ Model A ‚Äî No Regularization

**Expected pattern:**

```text
Train AUC: high (‚âà 0.90+)
Test AUC : noticeably lower
```

**Interpretation (you must say this):**

> ‚ÄúNo regularization + high-dimensional data caused the model to fit noise, resulting in high variance.‚Äù

If your **train ‚â´ test**, you MUST say **high variance**, not ‚Äúmodel is good‚Äù.

---

### üîπ Model B ‚Äî L2 Regularization (`C=0.01`)

**Expected pattern:**

```text
Train AUC: lower than no-reg
Test AUC : higher or more stable
CV Std   : low
```

**Correct reasoning:**

> ‚ÄúL2 shrinks coefficients uniformly, reducing variance while keeping correlated features, leading to a more stable model.‚Äù

If test AUC improves or CV std drops ‚Üí **L2 is working**.

---

### üîπ Model C ‚Äî L1 Regularization

**Expected pattern:**

```text
Train AUC: may drop
Test AUC : may increase or fluctuate
Non-zero features: significantly fewer
```

**Key insight (THIS is interview gold):**

> ‚ÄúL1 performs feature selection, but in presence of correlated features, the selected set may be unstable across folds.‚Äù

If non-zero features are **very few**, that‚Äôs expected.
But stability matters more than count.

---

## 3Ô∏è‚É£ Cross-Validation Result ‚Äî How to Read It

```python
CV Mean: X
CV Std : Y
```

### Correct logic:

* **High CV mean + low CV std** ‚Üí deploy-safe
* **High CV mean + high CV std** ‚Üí risky
* **Slightly lower mean + very low std** ‚Üí often better

üëâ If you can explain **why std matters**, you pass interviews.

---

## 4Ô∏è‚É£ The 3 Questions I Asked ‚Äî Model Answers

### Q1Ô∏è‚É£ Which model overfit the most? Why?

**Correct answer:**

> The no-regularization model overfit the most because high-dimensional OneHotEncoded features allowed it to learn noise, resulting in high train AUC but poor generalization.

---

### Q2Ô∏è‚É£ Why was L2 more stable than L1?

**Correct answer:**

> L2 keeps all correlated features with reduced magnitude, whereas L1 arbitrarily drops correlated features, making it unstable across different data splits.

---

### Q3Ô∏è‚É£ If business wants interpretability, which model?

**Correct answer (strict):**

> Logistic Regression with L2 regularization, because it preserves all features with stable coefficients while still controlling variance.

‚ùå Saying ‚ÄúL1 because feature selection‚Äù **without mentioning instability** is WRONG.

---

## 5Ô∏è‚É£ Common Mistakes (You did NOT make them ‚Äî good)

* ‚ùå Using accuracy instead of AUC
* ‚ùå Doing CV after preprocessing
* ‚ùå Comparing models with different pipelines
* ‚ùå Blind GridSearch

You avoided all of these.

---

## 6Ô∏è‚É£ Final Verdict (Honest)

### ‚úÖ Phase 3 ‚Äì Day 1: **PASSED (with minor polish needed)**

What you still need:

* Faster verbal explanation
* Confidence in bias‚Äìvariance language
* Practice defending decisions under pressure

---



---

## 1Ô∏è‚É£ Why **stratify** (even though not shown explicitly here)

You wrote:

> Default usually imbalanced
> Class ratio preserve ‡¶ï‡¶∞‡¶§‡ßá

That‚Äôs **correct**, but incomplete.

### What stratify actually does

When you do:

```python
train_test_split(X, y, stratify=y)
```

It ensures:

* **Train and test both have the same class proportion**
* Especially critical when:

  * Target = `default`, `fraud`, `churn`, `disease`
  * Minority class is small (5‚Äì20%)

### Brutal truth (interview trap)

If you **don‚Äôt stratify** and still report:

* ROC-AUC
* F1
* Recall

üëâ **Your evaluation is unreliable**
üëâ Interviewer may say: *‚ÄúYour test set doesn‚Äôt represent real distribution‚Äù*

**One-line interview answer**

> ‚ÄúI used stratified split to preserve the class distribution so that evaluation metrics like ROC-AUC are not biased by sampling artifacts.‚Äù

---

## 2Ô∏è‚É£ Feature grouping

```python
num_cols = ["age", "income", "loan_amount"]
cat_cols = X_train.select_dtypes(include="object").columns.tolist()
```

### What this does

* Explicitly separates:

  * **Numerical features** ‚Üí scaling
  * **Categorical features** ‚Üí one-hot encoding

### Why it matters

Logistic Regression assumes:

* Features are **numeric**
* Coefficients are **comparable in scale**

### Interview angle

If you don‚Äôt separate numeric & categorical:

* Scaling breaks categories
* Model assumptions violated

**Interview line**

> ‚ÄúI explicitly separated numerical and categorical features because they require different preprocessing transformations.‚Äù

---

## 3Ô∏è‚É£ Numeric pipeline

```python
numeric_pipeline = Pipeline([
    ("scaler", StandardScaler())
])
```

### What it does

* Converts features to:

  * Mean = 0
  * Std = 1

### Why it‚Äôs REQUIRED for Logistic Regression

Logistic Regression uses:

* Gradient-based optimization
* Regularization (L1 / L2)

Without scaling:

* Large-value features dominate gradients
* Regularization becomes meaningless

### Brutal truth

If someone says:

> ‚ÄúScaling doesn‚Äôt matter for LR‚Äù

They don‚Äôt understand optimization.

**Interview one-liner**

> ‚ÄúStandardScaler is required because Logistic Regression is scale-sensitive and regularization assumes comparable feature magnitudes.‚Äù

---

## 4Ô∏è‚É£ Categorical pipeline

```python
categorical_pipeline = Pipeline([
    ("onehot", OneHotEncoder(drop="first", handle_unknown="ignore"))
])
```

### What this does

* Converts categories ‚Üí binary vectors
* `drop="first"` avoids:

  * Dummy variable trap
  * Multicollinearity

### `handle_unknown="ignore"` ‚Äî very important

* Prevents crash when:

  * Test data has unseen category
* Without it ‚Üí **production failure**

### Interview trap

If interviewer asks:

> ‚ÄúWhy drop first category?‚Äù

Correct answer:

> ‚ÄúTo avoid perfect multicollinearity which destabilizes coefficient estimation in linear models.‚Äù

---

## 5Ô∏è‚É£ ColumnTransformer

```python
preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, num_cols),
    ("cat", categorical_pipeline, cat_cols)
])
```

### What it does

* Applies:

  * Scaling ‚Üí numeric columns
  * One-hot ‚Üí categorical columns
* Keeps everything aligned

### Why this is professional-grade

* Prevents **data leakage**
* Ensures same preprocessing in:

  * Train
  * Test
  * Cross-validation
  * Production

**Interview gold**

> ‚ÄúI used ColumnTransformer to ensure feature-wise preprocessing without leakage.‚Äù

---

## 6Ô∏è‚É£ Logistic Regression without regularization

```python
model_no_reg = Pipeline([
    ("preprocessing", preprocessor),
    ("model", LogisticRegression(
        penalty="none",
        solver="lbfgs",
        max_iter=1000
    ))
])
```

### What this model is

* Pure Maximum Likelihood Estimation
* No penalty on coefficients

### Why you train this

* **Baseline**
* To detect:

  * Overfitting
  * Coefficient explosion

### Brutal reality

If:

```text
train_auc >> test_auc
```

üëâ You‚Äôre overfitting

### Interview explanation

> ‚ÄúI first trained an unregularized model to establish a baseline and observe overfitting behavior.‚Äù

---

## 7Ô∏è‚É£ ROC-AUC calculation

```python
roc_auc_score(y_train, model.predict_proba(X_train)[:,1])
```

### Why ROC-AUC

* Threshold-independent
* Robust to imbalance
* Measures ranking quality

### Interview trap

If someone uses `.predict()` instead of `.predict_proba()`:
üëâ **Wrong for AUC**

Correct explanation:

> ‚ÄúROC-AUC evaluates how well the model ranks positives above negatives across all thresholds.‚Äù

---

## 8Ô∏è‚É£ L2 Regularization (Ridge)

```python
LogisticRegression(
    penalty="l2",
    C=0.01
)
```

### What L2 does

* Shrinks coefficients
* Keeps all features
* Reduces variance

### Meaning of `C`

* **Inverse** of regularization strength
* Smaller C ‚Üí stronger penalty

### Why L2 is default in industry

* Stable
* Differentiable
* Works well with correlated features

**Interview line**

> ‚ÄúL2 regularization controls overfitting by shrinking coefficients without removing features.‚Äù

---

## 9Ô∏è‚É£ L1 Regularization (Lasso)

```python
penalty="l1",
solver="liblinear"
```

### What L1 does

* Forces coefficients ‚Üí exactly zero
* Performs **embedded feature selection**

### Why solver changes

* `lbfgs` ‚ùå doesn‚Äôt support L1
* `liblinear` ‚úÖ supports L1

### Interview bomb answer

> ‚ÄúL1 regularization performs feature selection by driving weak feature coefficients to zero.‚Äù

---

## üîü Counting non-zero features

```python
coeffs = model_l1.named_steps["model"].coef_[0]
non_zero = np.sum(coeffs != 0)
```

### What this proves

* How many features survived L1 penalty
* Model sparsity
* Interpretability

### Interview usage

> ‚ÄúI used L1 to reduce dimensionality and checked sparsity by counting non-zero coefficients.‚Äù

---

## 1Ô∏è‚É£1Ô∏è‚É£ Cross-validation

```python
cross_val_score(
    model_l2,
    X_train,
    y_train,
    cv=5,
    scoring="roc_auc"
)
```

### Why CV

* Single split is unstable
* CV estimates **generalization performance**

### Why CV only on training set

üëâ If you CV on full data ‚Üí **test leakage**

### Interview-perfect answer

> ‚ÄúI used 5-fold cross-validation on the training set to estimate variance and stability of ROC-AUC.‚Äù

---

## ‚ö†Ô∏è What‚Äôs missing / what interviewer may attack you on

Be ready for these:

1Ô∏è‚É£ **Where is stratify used?**
‚Üí You must mention it during `train_test_split`

2Ô∏è‚É£ **Why not class_weight='balanced'?**
‚Üí If imbalance is severe, this matters

3Ô∏è‚É£ **How did you choose C=0.01 / 0.1?**
‚Üí Should say: *‚Äúvia validation or GridSearchCV‚Äù*

4Ô∏è‚É£ **Why ROC-AUC over F1?**
‚Üí Threshold independence

---

## üß† Final interview-ready summary (memorize this)

> ‚ÄúI built a Logistic Regression pipeline with proper preprocessing using ColumnTransformer to avoid leakage. I compared unregularized, L2, and L1 models using ROC-AUC due to class imbalance. L2 reduced overfitting, while L1 performed feature selection. I validated stability using 5-fold cross-validation.‚Äù

