# Introduction to Machine Learning — Companion Notebook
This notebook accompanies `introduction_to_machine_learning.md`. It contains runnable code cells, small exercises, and plots.
Install prerequisites as needed: `scikit-learn`, `numpy`, `scipy`, `matplotlib`, `pandas`, `imbalanced-learn`.


## 1) Data split & leak-free preprocessing

In [None]:

import numpy as np, pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

# Dedicated seeded generator for the synthetic categorical columns. The global
# np.random state is never seeded in this notebook, so drawing from it would
# give different data (and different downstream scores) on every fresh run.
rng = np.random.default_rng(42)

# Synthetic binary classification data with class weights of 0.85 / 0.15.
X, y = make_classification(n_samples=5000, n_features=20, n_informative=8,
                           weights=[0.85, 0.15], random_state=42)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(X.shape[1])])

# Two categorical columns sampled independently of y, so they carry no signal;
# they exist to exercise the categorical branch of the preprocessing.
X["country"] = rng.choice(["US","DE","IN","BR"], size=len(X), p=[.4,.2,.25,.15])
X["device"] = rng.choice(["mobile","desktop","tablet"], size=len(X), p=[.6,.35,.05])

# Stratify on y so the train and test splits keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train.shape, X_test.shape, y_train.mean(), y_test.mean()


## 2) ColumnTransformer + Pipeline

In [None]:

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression

# Partition the columns by dtype: numeric columns are standardised, all other
# columns are one-hot encoded.
num_cols = list(X_train.select_dtypes(include=np.number).columns)
cat_cols = list(X_train.select_dtypes(exclude=np.number).columns)

# handle_unknown="ignore": a category not seen during fit does not raise at
# transform time.
scale_numeric = ("num", StandardScaler(), num_cols)
encode_categorical = ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
pre = ColumnTransformer(transformers=[scale_numeric, encode_categorical])

# Preprocessing is a pipeline step, so it is fitted only on the data passed to
# `fit` (here the training split) and merely applied to the test split.
pipe = Pipeline(steps=[
    ("pre", pre),
    ("model", LogisticRegression(max_iter=1000)),
])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)


## 3) RandomizedSearchCV on LogisticRegression(C)

In [None]:

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from scipy.stats import loguniform

# Draw the regularisation strength C log-uniformly between 1e-3 and 1e2; the
# "model__" prefix routes the parameter to the pipeline step named "model".
param_dist = {"model__C": loguniform(1e-3, 1e2)}

# Shuffled, seeded stratified folds so every candidate sees the same splits.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=30,
    scoring="f1",
    cv=cv,
    n_jobs=-1,
    random_state=42,
)
search.fit(X_train, y_train)
search.best_params_, search.best_score_


## 4) Metrics & calibration

In [None]:

from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, average_precision_score, brier_score_loss
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

best = search.best_estimator_

# CalibratedClassifierCV defaults to method="sigmoid" (Platt scaling), so
# isotonic regression has to be requested explicitly.
cal = CalibratedClassifierCV(best, method="isotonic", cv=3)
cal.fit(X_train, y_train)

# Positive-class probabilities on the held-out split, thresholded at 0.5 for
# the hard-label report.
proba = cal.predict_proba(X_test)[:, 1]
pred = (proba >= 0.5).astype(int)

print(classification_report(y_test, pred))
print("ROC AUC:", roc_auc_score(y_test, proba))
print("Brier:", brier_score_loss(y_test, proba))

# Threshold-free view of the precision/recall trade-off, summarised by
# average precision.
prec, rec, thr = precision_recall_curve(y_test, proba)
ap = average_precision_score(y_test, proba)
ap


In [None]:

# Reliability diagram: mean predicted probability vs. observed positive rate
# per bin, computed on the test split.
import matplotlib.pyplot as plt

# strategy="quantile" puts an equal number of samples in each of the 10 bins.
prob_true, prob_pred = calibration_curve(y_test, proba, n_bins=10, strategy="quantile")

fig, ax = plt.subplots()
# The label is deliberately method-neutral so it stays correct whichever
# calibration method produced `proba`.
ax.plot(prob_pred, prob_true, marker="o", label="calibrated model")
# Diagonal = perfectly calibrated reference.
ax.plot([0, 1], [0, 1], "--", alpha=.5, color="gray", label="perfect calibration")
ax.set(xlabel="Predicted probability", ylabel="Empirical frequency",
       title="Reliability diagram (test set)")
ax.legend()
fig.tight_layout();


## 5) Imbalanced-data tactics: SMOTE + LogisticRegression

In [None]:

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline as make_imb_pipeline
from sklearn.base import clone
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

# Two things are required for this cell to run at all:
#  * SMOTE is a sampler (fit_resample, no transform), so it must live in an
#    imbalanced-learn pipeline; scikit-learn's pipeline rejects it as an
#    intermediate step. The imblearn pipeline resamples during fit only.
#  * X_train still contains the string columns "country" and "device", so the
#    preprocessing has to run before SMOTE and the classifier. `pre` is cloned
#    so the instance already embedded in `pipe` is not refitted here.
# NOTE: plain SMOTE interpolates between neighbours, so synthetic rows may
# carry fractional one-hot values; SMOTENC is the variant aimed at mixed
# numeric/categorical data if that matters for your use case.
smote_logreg = make_imb_pipeline(
    clone(pre),
    SMOTE(k_neighbors=5, random_state=42),
    LogisticRegression(max_iter=1000),
).fit(X_train, y_train)

# Score on the untouched (never resampled) test split.
p = smote_logreg.predict_proba(X_test)[:,1]
roc_auc_score(y_test, p), average_precision_score(y_test, p)


## 6) Model interpretation: permutation importance

In [None]:

from sklearn.inspection import permutation_importance
# Permute each input column of the test split 10 times and record the drop in
# ROC AUC of the calibrated model.
r = permutation_importance(cal, X_test, y_test, n_repeats=10, scoring="roc_auc", random_state=42)

# Tabulate per-feature mean/std and rank by mean importance, so the ten rows
# shown are the most influential features rather than the first ten columns.
perm_importance = (
    pd.DataFrame(
        {"importance_mean": r.importances_mean, "importance_std": r.importances_std},
        index=X_test.columns,
    )
    .sort_values("importance_mean", ascending=False)
)
perm_importance.head(10)


## 7) Unsupervised starter: KMeans

In [None]:

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

# Cluster on the numeric columns only; they are standardised inside the
# pipeline before KMeans sees them.
X_train_numeric = X_train.select_dtypes(include=np.number)

km = make_pipeline(
    StandardScaler(),
    KMeans(n_clusters=8, n_init="auto", random_state=42),
)
labels = km.fit_predict(X_train_numeric)

# The fitted KMeans estimator is the last pipeline step; report its inertia
# together with the size of each of the 8 clusters.
kmeans_step = km.steps[-1][1]
kmeans_step.inertia_, np.bincount(labels)[:8]


## 8) Learning & validation curves

In [None]:

from sklearn.model_selection import learning_curve, validation_curve
# Learning curve of the tuned pipeline: ROC AUC at 5 training-set sizes from
# 10% to 100%. learning_curve only uses random_state when shuffle=True, so
# shuffling is enabled to make the seed take effect.
sizes, tr, va = learning_curve(best, X_train, y_train, cv=3, scoring="roc_auc",
                               train_sizes=np.linspace(0.1,1.0,5), n_jobs=-1,
                               shuffle=True, random_state=42)

# Validation curve of the untuned pipeline over 8 log-spaced values of C
# (1e-3 .. 1e2), scored with F1.
param_range = np.logspace(-3, 2, 8)
tr_v, va_v = validation_curve(pipe, X_train, y_train, param_name="model__C",
                              param_range=param_range, cv=3, scoring="f1", n_jobs=-1)

# Fold-averaged learning-curve scores for the three smallest training sizes.
sizes[:3], tr.mean(axis=1)[:3], va.mean(axis=1)[:3]
