# Lab 4 – Linear and Logistic Regression Models
Build Linear and Logistic Regression models and evaluate their performance using appropriate metrics.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo

In [None]:
# Download dataset id 336 from the UCI repository (the CKD data used below).
ckd = fetch_ucirepo(id=336)

# Features and target arrive as separate frames.
X, y = ckd.data.features, ckd.data.targets

# Join them side by side so that cleaning can be done on a single frame.
df = pd.concat([X, y], axis="columns")
df.head()

In [None]:
# Clean the raw frame: normalise missing-value markers, impute the gaps and
# integer-encode every non-numeric column so scikit-learn can consume it.
# NOTE: the median / mode used for imputation are computed on the whole
# dataset, i.e. before the train/test splits performed in later cells, so
# test rows influence the fill values.

# Replace '?' with NaN so it is treated as a missing value
df.replace('?', np.nan, inplace=True)

num_cols = ['age', 'bp', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wbcc', 'rbcc']
cat_cols = [col for col in df.columns if col not in num_cols + ['class']]

# Defensive normalisation: strip surrounding whitespace from string cells so
# that variants such as 'ckd\t' and 'ckd' collapse into one category instead
# of being encoded as different labels. Non-string cells (numbers, NaN) are
# passed through untouched.
for col in cat_cols + ['class']:
    df[col] = df[col].map(lambda v: v.strip() if isinstance(v, str) else v)

for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
    # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary Series and does not
    # update `df` when pandas Copy-on-Write is active.
    df[col] = df[col].fillna(df[col].median())

for col in cat_cols:
    # mode() may return several values; take the first one.
    df[col] = df[col].fillna(df[col].mode()[0])

# Label encode categorical columns
from sklearn.preprocessing import LabelEncoder

# One encoder per column, kept in a dict, so each mapping can be inspected
# or inverted later (e.g. encoders['class'].classes_).
encoders = {}
for col in cat_cols + ['class']:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

# `le` keeps referring to the encoder fitted on 'class', as before.
le = encoders['class']

print("Shape:", df.shape)
df.head()

---
## Part A – Linear Regression
Predict **hemoglobin (hemo)** from the other numeric features.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Regression inputs: every numeric column apart from the target, 'hemo'.
linear_features = [
    'age', 'bp', 'bgr', 'bu', 'sc',
    'sod', 'pot', 'pcv', 'wbcc', 'rbcc',
]

y_lin = df['hemo']
X_lin = df[linear_features]

# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_lin_train, X_lin_test, y_lin_train, y_lin_test = train_test_split(
    X_lin,
    y_lin,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both partitions.
scaler_lin = StandardScaler().fit(X_lin_train)
X_lin_train = scaler_lin.transform(X_lin_train)
X_lin_test = scaler_lin.transform(X_lin_test)

print("Training set:", X_lin_train.shape)
print("Test set:", X_lin_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the scaled training data (fit() returns the
# estimator itself, so construction and fitting can be chained).
lin_model = LinearRegression().fit(X_lin_train, y_lin_train)

# Predictions for the held-out rows, used by the evaluation cells below.
y_lin_pred = lin_model.predict(X_lin_test)

# Show the fitted parameters: one coefficient per input column, plus the bias.
print("Coefficients:", lin_model.coef_)
print("Intercept:", lin_model.intercept_)

### Linear Regression – Evaluation Metrics

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Error metrics on the held-out set. RMSE is derived from MSE so it is
# expressed in the same units as the target.
mse = mean_squared_error(y_lin_test, y_lin_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_lin_test, y_lin_pred)
r2 = r2_score(y_lin_test, y_lin_pred)

# Emit the whole report with a single call, one metric per line.
print(
    "=== Linear Regression Metrics ===",
    f"Mean Absolute Error (MAE) : {mae:.4f}",
    f"Mean Squared Error  (MSE) : {mse:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"R-squared (R\u00b2)            : {r2:.4f}",
    sep="\n",
)

In [None]:
# Actual vs predicted hemoglobin; points on the dashed diagonal are perfect
# predictions.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_lin_test, y_lin_pred, alpha=0.6, edgecolors='k')

# Reference line y = x spanning the range of the observed target values.
diag = [y_lin_test.min(), y_lin_test.max()]
ax.plot(diag, diag, 'r--', lw=2)

ax.set_xlabel("Actual Hemoglobin")
ax.set_ylabel("Predicted Hemoglobin")
ax.set_title("Linear Regression: Actual vs Predicted")
fig.tight_layout()
plt.show()

In [None]:
# Residuals (actual minus predicted) against the predicted value; a good fit
# scatters evenly around the dashed zero line.
residuals = y_lin_test - y_lin_pred

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_lin_pred, residuals, alpha=0.6, edgecolors='k')
ax.axhline(y=0, color='r', linestyle='--', lw=2)
ax.set_xlabel("Predicted Hemoglobin")
ax.set_ylabel("Residuals")
ax.set_title("Linear Regression: Residual Plot")
fig.tight_layout()
plt.show()

---
## Part B – Logistic Regression
Predict the **CKD class** (ckd / notckd) from all other features.

In [None]:
# Classification target is the encoded 'class' column; every other column is
# used as an input.
y_log = df['class']
X_log = df.drop(columns='class')

# Same 80/20 split settings as in the regression part.
X_log_train, X_log_test, y_log_train, y_log_test = train_test_split(
    X_log,
    y_log,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only and reuse it for the test rows.
scaler_log = StandardScaler().fit(X_log_train)
X_log_train = scaler_log.transform(X_log_train)
X_log_test = scaler_log.transform(X_log_test)

print("Training set:", X_log_train.shape)
print("Test set:", X_log_test.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

# Train the classifier; max_iter is raised to 1000 to give the solver room
# to converge.
log_model = LogisticRegression(max_iter=1000).fit(X_log_train, y_log_train)

# Hard class predictions for the held-out rows.
y_log_pred = log_model.predict(X_log_test)

### Logistic Regression – Evaluation Metrics

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Summary scores on the test set. Precision, recall and F1 are averaged over
# the classes, weighted by how many true samples each class has.
accuracy = accuracy_score(y_log_test, y_log_pred)
precision = precision_score(y_log_test, y_log_pred, average='weighted')
recall = recall_score(y_log_test, y_log_pred, average='weighted')
f1 = f1_score(y_log_test, y_log_pred, average='weighted')

# Emit the whole report with a single call, one metric per line.
print(
    "=== Logistic Regression Metrics ===",
    f"Accuracy  : {accuracy:.4f}",
    f"Precision : {precision:.4f}",
    f"Recall    : {recall:.4f}",
    f"F1 Score  : {f1:.4f}",
    sep="\n",
)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 table for the test-set predictions.
report = classification_report(y_log_test, y_log_pred)

print("=== Classification Report ===")
print(report)

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the test-set predictions (rows: true class, columns:
# predicted class).
cm = confusion_matrix(y_log_test, y_log_pred)

# Create the axes explicitly and hand them to the display. Without `ax`,
# ConfusionMatrixDisplay.plot() opens its own figure, which leaves a blank
# 6x5 figure behind and draws the matrix at the default size instead.
fig, ax = plt.subplots(figsize=(6, 5))

# NOTE(review): the label order assumes the encoded classes are 0 = ckd and
# 1 = notckd -- confirm against the fitted label encoder.
ConfusionMatrixDisplay(cm, display_labels=['CKD', 'Not CKD']).plot(cmap='Blues', ax=ax)
ax.set_title("Logistic Regression: Confusion Matrix")
fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc

# Scores for the ROC analysis: the second probability column, i.e. the
# probability assigned to log_model.classes_[1].
y_log_prob = log_model.predict_proba(X_log_test)[:, 1]

# Sweep the decision threshold and integrate the resulting curve.
fpr, tpr, thresholds = roc_curve(y_log_test, y_log_prob)
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {roc_auc:.4f})')
# Dashed diagonal: the expected curve of a random classifier.
ax.plot([0, 1], [0, 1], 'k--', lw=1)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Logistic Regression: ROC Curve')
ax.legend(loc='lower right')
fig.tight_layout()
plt.show()