<a href="https://colab.research.google.com/github/Abhijeetkhade11/KaggleCompetitions/blob/main/ML_Assignment3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install ucimlrepo

In [None]:
from ucimlrepo import fetch_ucirepo

# Download UCI ML Repository data set #222 (bound to ``bank_marketing``).
bank_marketing = fetch_ucirepo(id=222)

# Take independent copies of the feature table and the target table so the
# cells below can modify X / y without touching the fetched object.
uci_data = bank_marketing.data
X, y = uci_data.features.copy(), uci_data.targets.copy()

In [None]:
import pandas as pd

# Encode the target as 0/1.  The Series is derived from the fetched targets
# rather than from ``y`` itself so that re-running this cell is idempotent:
# mapping an already-encoded 0/1 Series through the dict below would turn
# every label into NaN.
y = bank_marketing.data.targets.squeeze().map({"no": 0, "yes": 1})

# Fail fast if a label other than 'no' / 'yes' was silently mapped to NaN.
assert y.notna().all(), "unexpected target label (expected only 'no'/'yes')"

In [None]:
# Preview the first rows of the feature table.
X.head()

In [None]:
# (number of rows, number of columns) of the feature table.
X.shape

In [None]:
# Per-column dtype and non-null count; shows which features have missing values.
X.info()

In [None]:
# Features treated as categorical by the imputation and encoding steps.
cat_cols = [
    "job",
    "education",
    "default",
    "housing",
    "loan",
    "poutcome",
    "month",
    "contact",
]

# Features treated as numeric (mean-imputed, passed through unencoded).
num_cols = ["age", "balance", "campaign", "pdays", "previous", "duration"]

In [None]:
# Impute missing values: column mean for numeric features, most frequent value
# for categorical features.
#
# The result is assigned back to the column instead of calling
# ``X[col].fillna(..., inplace=True)``: that form modifies a temporary Series
# obtained by chained indexing, which is deprecated in pandas 2.x and leaves
# ``X`` unchanged once Copy-on-Write is enabled.
#
# NOTE(review): the mean / mode are computed on the full data set, i.e. before
# the train/test split, so test rows influence the imputed values.  Consider
# moving imputation into the modelling pipeline.
X[num_cols] = X[num_cols].fillna(X[num_cols].mean())

for col in cat_cols:
    most_frequent = X[col].mode().iloc[0]
    X[col] = X[col].fillna(most_frequent)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

# Integer-encode the categorical columns; a category that was not seen while
# fitting is encoded as -1 instead of raising.
categorical_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Encoded categorical columns first, then the numeric columns unchanged.
# No ``remainder`` is given, so the transformer's default applies to any
# column that is listed in neither ``cat_cols`` nor ``num_cols``.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", categorical_encoder, cat_cols),
        ("num", "passthrough", num_cols),
    ]
)

In [None]:
from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler

# Logistic regression on the preprocessed features.
#
# The raw numeric columns are on very different scales from the integer-encoded
# categoricals, and the default lbfgs solver commonly stops at the default
# ``max_iter=100`` without converging on such input (ConvergenceWarning).
# Standardising the features and allowing more iterations lets the optimiser
# converge; scaling also makes the L2 penalty treat all features alike.
lr_model = Pipeline([
    ("prep", preprocess),
    ("scale", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000)),
])

# Left as the last expression so the fitted pipeline is displayed by the cell.
lr_model.fit(X_train, y_train)

In [None]:
# Class predictions of the fitted pipeline on the held-out test split.
y_pred = lr_model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Test-set metrics for the class predictions stored in ``y_pred``.
for label, metric in (
    ("Accuracy :", accuracy_score),
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1 Score :", f1_score),
):
    print(label, metric(y_test, y_pred))


In [None]:
# Second column of predict_proba for every test row: the predicted probability
# of the larger class label, which is 1 ("yes") after the 0/1 encoding of y.
y_prob = lr_model.predict_proba(X_test)[:, 1]

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Area under the ROC curve, computed from the positive-class probabilities.
auc_score = roc_auc_score(y_test, y_prob)

# Points of the ROC curve (false/true positive rate) and the score thresholds
# at which they were evaluated; fpr / tpr are plotted in the next cell.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)

print("AUC Score:", auc_score)

In [None]:
import matplotlib.pyplot as plt

# ROC curve of the classifier, drawn with the object-oriented matplotlib API.
fig, ax = plt.subplots(figsize=(6, 5))

ax.plot(fpr, tpr, label=f"AUC = {auc_score:.4f}")
ax.plot([0, 1], [0, 1], linestyle="--")  # diagonal where TPR == FPR

ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curve")
ax.legend()

plt.show()

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_threshold(threshold, y_true=None, y_scores=None):
    """Compute classification metrics at a given probability cut-off.

    Parameters
    ----------
    threshold : float
        A sample is predicted positive (1) when its score is ``>= threshold``.
    y_true : array-like, optional
        Ground-truth 0/1 labels.  Defaults to the notebook-level ``y_test``.
    y_scores : array-like, optional
        Positive-class scores.  Defaults to the notebook-level ``y_prob``.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` for the thresholded predictions.
    """
    import numpy as np

    # Fall back to the notebook globals so existing one-argument calls keep
    # working; explicit arguments make the function usable on any split.
    if y_true is None:
        y_true = y_test
    if y_scores is None:
        y_scores = y_prob

    predicted = (np.asarray(y_scores) >= threshold).astype(int)

    acc = accuracy_score(y_true, predicted)
    pre = precision_score(y_true, predicted)
    rec = recall_score(y_true, predicted)
    f1 = f1_score(y_true, predicted)

    return acc, pre, rec, f1

In [None]:
# Metrics at a cut-off of 0.5, kept for comparison with the tuned threshold.
acc1, pre1, rec1, f11 = evaluate_threshold(0.5)


In [None]:
import numpy as np
from sklearn.metrics import f1_score

# Candidate cut-offs 0.10, 0.11, ..., 0.89.  Rounding removes the floating-point
# error that np.arange accumulates (e.g. 0.22999999999999998 instead of 0.23),
# so the reported threshold is exactly the value that was evaluated.
thresholds = np.round(np.arange(0.1, 0.9, 0.01), 2)

best_thr = 0
best_f1 = 0

# NOTE(review): the threshold is selected on the test set, so metrics reported
# at ``best_thr`` on that same set are optimistic.  Consider tuning on a
# separate validation split.
for t in thresholds:
    # A dedicated name is used here so the loop does not overwrite the
    # notebook-level ``y_pred`` produced by ``lr_model.predict``.
    y_pred_at_t = (y_prob >= t).astype(int)
    score = f1_score(y_test, y_pred_at_t)

    # Strict comparison: on ties the lowest threshold is kept.
    if score > best_f1:
        best_f1 = score
        best_thr = t

print("Best Threshold:", best_thr)


In [None]:
# Metrics at the cut-off stored in ``best_thr`` (the F1-maximising candidate).
acc2, pre2, rec2, f12 = evaluate_threshold(best_thr)

In [None]:
# Side-by-side report: metrics at 0.5 versus metrics at the tuned threshold.
metric_labels = ("Accuracy :", "Precision:", "Recall   :", "F1 Score :")

reports = (
    (("\nThreshold = 0.5",), (acc1, pre1, rec1, f11)),
    (("\nOptimized Threshold =", best_thr), (acc2, pre2, rec2, f12)),
)

for header, values in reports:
    print(*header)
    for label, value in zip(metric_labels, values):
        print(label, value)


Q1. Why is the ROC curve useful?

Ans. The ROC curve shows how well the model separates the classes across all thresholds by plotting TPR against FPR, and the AUC summarizes overall performance independently of any single threshold.

Q2. What changes when the threshold changes (precision–recall trade-off)?

Ans. Lowering the threshold increases recall but typically reduces precision (more positives are detected, but with more false alarms), while raising the threshold typically increases precision but reduces recall (fewer false alarms, but more missed positives).
