# 11 — Supervised Machine Learning
**Author:** Ebenezer Adjartey

Covers: Linear/Ridge/Lasso regression, Logistic regression, Decision Trees, Random Forests, Gradient Boosting (XGBoost), SVM, KNN, Neural Networks (MLP).

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_text
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                               roc_auc_score, mean_squared_error, root_mean_squared_error,
                               r2_score)
import warnings
# Notebook-wide setup: silence all warnings, apply the seaborn 'whitegrid'
# theme to subsequent plots, and seed NumPy's global random generator.
warnings.filterwarnings('ignore')
sns.set_theme(style='whitegrid')
np.random.seed(42)
print('Libraries loaded.')

## 1. Generate Datasets

In [None]:
# Classification dataset: 600 samples, 10 features (6 informative, 2 redundant),
# split 75/25 into train and test sets.
X_cls, y_cls = make_classification(n_samples=600, n_features=10, n_informative=6,
                                    n_redundant=2, random_state=42)
X_tr, X_te, y_tr, y_te = train_test_split(X_cls, y_cls, test_size=0.25, random_state=42)

# Regression dataset: 500 samples, 10 features, noise=20, split 75/25.
X_reg, y_reg = make_regression(n_samples=500, n_features=10, noise=20, random_state=42)
Xr_tr, Xr_te, yr_tr, yr_te = train_test_split(X_reg, y_reg, test_size=0.25, random_state=42)

# Each dataset gets its own scaler, fitted on its training split only.
# Refitting one shared scaler on the regression data would overwrite the
# classification statistics, so any later `scaler.transform(...)` call on
# classification features would silently use the wrong mean/variance.
scaler = StandardScaler()        # fitted on the classification training split
X_tr_sc = scaler.fit_transform(X_tr)
X_te_sc = scaler.transform(X_te)

scaler_reg = StandardScaler()    # fitted on the regression training split
Xr_tr_sc = scaler_reg.fit_transform(Xr_tr)
Xr_te_sc = scaler_reg.transform(Xr_te)

print(f'Classification: {X_tr.shape[0]} train, {X_te.shape[0]} test')
print(f'Regression:     {Xr_tr.shape[0]} train, {Xr_te.shape[0]} test')

## 2. Linear Regression (OLS, Ridge, Lasso)

In [None]:
# Fit OLS, Ridge and Lasso on the scaled regression features and score each
# model on the held-out test split. Results are collected in `reg_results`
# keyed by model name.
regressors = {
    'OLS': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.5),
}
reg_results = {}

for name, model in regressors.items():
    model.fit(Xr_tr_sc, yr_tr)
    pred = model.predict(Xr_te_sc)

    rmse = root_mean_squared_error(yr_te, pred)
    r2 = r2_score(yr_te, pred)
    # Number of coefficients that are not exactly zero.
    n_nonzero = (model.coef_ != 0).sum()

    reg_results[name] = {'RMSE': round(rmse, 3), 'R2': round(r2, 3)}
    print(f'{name:6}: RMSE={rmse:.3f}, R2={r2:.3f}, coefs_nonzero={n_nonzero}')

print('\nLasso performs automatic feature selection (zeros out coefficients)')

## 3. Logistic Regression

In [None]:
# Logistic regression trained on the standardized classification features.
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_tr_sc, y_tr)

y_pred_lr = lr.predict(X_te_sc)
# Second column of predict_proba is used as the score for AUC.
y_prob_lr = lr.predict_proba(X_te_sc)[:, 1]

acc_lr = accuracy_score(y_te, y_pred_lr)
auc_lr = roc_auc_score(y_te, y_prob_lr)
print(f'Logistic Regression Accuracy: {acc_lr:.4f}')
print(f'AUC: {auc_lr:.4f}')

print('\nClassification Report:')
print(classification_report(y_te, y_pred_lr))

## 4. Decision Trees

In [None]:
# Decision tree capped at depth 5, trained on the unscaled features.
dt = DecisionTreeClassifier(max_depth=5, random_state=42)
dt.fit(X_tr, y_tr)

y_pred_dt = dt.predict(X_te)
y_prob_dt = dt.predict_proba(X_te)[:, 1]  # second column used as the AUC score

acc_dt = accuracy_score(y_te, y_pred_dt)
auc_dt = roc_auc_score(y_te, y_prob_dt)
print(f'Decision Tree Accuracy: {acc_dt:.4f}')
print(f'AUC: {auc_dt:.4f}')
print(f'Tree depth: {dt.get_depth()}')
print(f'Feature importances: {dt.feature_importances_.round(3)}')

## 5. Random Forest

In [None]:
# Random forest of 100 trees (max depth 10), trained on the unscaled features.
rf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_tr, y_tr)
y_pred_rf = rf.predict(X_te)
y_prob_rf = rf.predict_proba(X_te)[:, 1]  # second column used as the AUC score
print(f'Random Forest Accuracy: {accuracy_score(y_te, y_pred_rf):.4f}')
print(f'AUC: {roc_auc_score(y_te, y_prob_rf):.4f}')

# Feature importance, sorted from most to least important. The Series index
# is each feature's column position in X_tr; it is printed alongside the
# value so a reader can tell which feature each importance belongs to
# (printing `.values` alone would discard that mapping after sorting).
fi = pd.Series(rf.feature_importances_, name='Importance').sort_values(ascending=False)
print('\nFeature Importances (sorted):')
print(fi.round(4).to_string())

## 6. Gradient Boosting

In [None]:
# scikit-learn gradient boosting: 100 shallow trees (depth 3), learning rate 0.1.
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb.fit(X_tr, y_tr)
y_pred_gb = gb.predict(X_te)
y_prob_gb = gb.predict_proba(X_te)[:, 1]  # second column used as the AUC score
print(f'GradientBoosting Accuracy: {accuracy_score(y_te, y_pred_gb):.4f}')
print(f'AUC: {roc_auc_score(y_te, y_prob_gb):.4f}')

# Try XGBoost if available. Only the import is guarded, so an ImportError
# raised while fitting or predicting is not misreported as "not installed".
try:
    from xgboost import XGBClassifier
except ImportError:
    print('XGBoost not installed (pip install xgboost)')
else:
    # `use_label_encoder` is intentionally not passed: it is a deprecated
    # XGBoost parameter that newer releases report as unused.
    xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3,
                        eval_metric='logloss', random_state=42)
    xgb.fit(X_tr, y_tr)
    print(f'XGBoost Accuracy: {accuracy_score(y_te, xgb.predict(X_te)):.4f}')
    print(f'XGBoost AUC: {roc_auc_score(y_te, xgb.predict_proba(X_te)[:,1]):.4f}')

## 7. Support Vector Machine (SVM)

In [None]:
# RBF-kernel SVM on the standardized features; probability=True enables
# predict_proba, which is needed for the AUC score below.
svm = SVC(kernel='rbf', C=1.0, probability=True, random_state=42)
svm.fit(X_tr_sc, y_tr)

y_pred_svm = svm.predict(X_te_sc)
y_prob_svm = svm.predict_proba(X_te_sc)[:, 1]

acc_svm = accuracy_score(y_te, y_pred_svm)
auc_svm = roc_auc_score(y_te, y_prob_svm)
print(f'SVM (RBF) Accuracy: {acc_svm:.4f}')
print(f'AUC: {auc_svm:.4f}')

## 8. K-Nearest Neighbors

In [None]:
# 5-nearest-neighbours classifier on the standardized features.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_tr_sc, y_tr)

y_pred_knn = knn.predict(X_te_sc)
y_prob_knn = knn.predict_proba(X_te_sc)[:, 1]

acc_knn = accuracy_score(y_te, y_pred_knn)
auc_knn = roc_auc_score(y_te, y_prob_knn)
print(f'KNN (k=5) Accuracy: {acc_knn:.4f}')
print(f'AUC: {auc_knn:.4f}')

## 9. Neural Network (MLP)

In [None]:
# Multi-layer perceptron with two hidden layers (64 and 32 units, ReLU),
# trained on the standardized features for at most 500 iterations.
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    activation='relu',
    max_iter=500,
    random_state=42,
)
mlp.fit(X_tr_sc, y_tr)

y_pred_mlp = mlp.predict(X_te_sc)
y_prob_mlp = mlp.predict_proba(X_te_sc)[:, 1]

acc_mlp = accuracy_score(y_te, y_pred_mlp)
auc_mlp = roc_auc_score(y_te, y_prob_mlp)
print(f'MLP Accuracy: {acc_mlp:.4f}')
print(f'AUC: {auc_mlp:.4f}')

## 10. Model Comparison

In [None]:
# Test-set predictions and second-column probabilities for every classifier.
# Tree-based models were trained on raw features, the rest on scaled features.
models_cls = {
    'Logistic Reg': (y_pred_lr, y_prob_lr),
    'Decision Tree': (y_pred_dt, dt.predict_proba(X_te)[:, 1]),
    'Random Forest': (y_pred_rf, y_prob_rf),
    'Gradient Boost': (y_pred_gb, y_prob_gb),
    'SVM': (y_pred_svm, svm.predict_proba(X_te_sc)[:, 1]),
    'KNN': (y_pred_knn, knn.predict_proba(X_te_sc)[:, 1]),
    'MLP': (y_pred_mlp, mlp.predict_proba(X_te_sc)[:, 1]),
}

# One row per model with accuracy and AUC rounded to four decimals.
results = [
    {
        'Model': name,
        'Accuracy': round(accuracy_score(y_te, pred), 4),
        'AUC': round(roc_auc_score(y_te, prob), 4),
    }
    for name, (pred, prob) in models_cls.items()
]

# Rank models from highest to lowest AUC.
comp_df = pd.DataFrame(results).sort_values('AUC', ascending=False)
print('Model Comparison:')
print(comp_df.to_string(index=False))

# Plot: grouped bars of accuracy and AUC per model.
fig, ax = plt.subplots(figsize=(10, 5))
comp_df.set_index('Model')[['Accuracy', 'AUC']].plot(kind='bar', ax=ax)
ax.set_title('Classification Model Comparison')
ax.set_ylim(.5, 1)
plt.xticks(rotation=30, ha='right')
plt.tight_layout()

# Make sure the output directory exists before writing the figure.
os.makedirs('11_machine_learning', exist_ok=True)
plt.savefig('11_machine_learning/model_comparison.png', dpi=100, bbox_inches='tight')
plt.show()
print('Saved.')

## Key Takeaways

- **Regularization** (Ridge, Lasso): prevents overfitting; Lasso does feature selection
- **Ensemble methods** (RF, GB): generally outperform single models
- **SVM**: effective in high-dimensional spaces; kernel trick for non-linearity
- **Neural networks**: flexible but need more data and tuning
- **Scale features** for distance-based methods (SVM, KNN) and gradient-trained models (MLP)
