# Lab Practice 3: Classification Metrics (Accuracy, Precision, Recall, F1 Score)

**Department of Electrical and Computer Engineering**  
**Pak-Austria Fachhochschule: Institute of Applied Sciences & Technology**  
**Subject: Machine Learning**  
**Subject Teacher: Dr. Abid Ali**  
**Lab Supervisor: Miss. Sana Saleem**

## Objective
Implement linear regression and evaluate it using classification metrics by converting continuous predictions to binary classifications.

## Dataset
- **File**: diabetes.csv
- **Features**: All columns except `Outcome`
- **Target Variable**: Outcome (0 or 1)
- **Threshold**: 0.5 (predictions ≥ 0.5 are classified as 1, < 0.5 as 0)


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import (mean_squared_error, r2_score, 
                           accuracy_score, precision_score, 
                           recall_score, f1_score, confusion_matrix)
from scipy import stats

print("Libraries imported successfully!")


In [None]:
# Load and preprocess the dataset
url = "diabetes.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
           'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
# Passing `names=` makes pandas treat every line of the file as data, so a
# header line (if the CSV has one) is read as an ordinary row of strings.
df = pd.read_csv(url, names=columns)

# Data preprocessing: coerce every cell to a number. Anything non-numeric
# (including a header line that was read as data) becomes NaN, and rows
# containing NaN are then dropped.
df = df.apply(pd.to_numeric, errors='coerce')
df = df.dropna()

# Remove outliers using Z-score, computed on the feature columns only.
# The binary target must not take part in the filter: for a rare class the
# label's own z-score exceeds 3 (it equals sqrt((1 - p) / p) for positive
# rate p), which would silently discard every positive row.
feature_columns = [col for col in columns if col != 'Outcome']
z_scores = np.abs(stats.zscore(df[feature_columns]))
df_clean = df[(z_scores < 3).all(axis=1)]

print(f"Dataset shape after cleaning: {df_clean.shape}")

# Prepare features and target
X = df_clean.drop(columns='Outcome')
y = df_clean['Outcome']

# Train-test split (fixed seed so the reported metrics are reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling: fit on the training split only, then apply the same
# transform to the test split so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")
print(f"Feature names: {list(X.columns)}")


In [None]:
# Train the Linear Regression model on the scaled training features
model = LinearRegression()
model.fit(X_train_scaled, y_train)

# Make predictions (continuous regression outputs, one per test row)
y_pred = model.predict(X_test_scaled)

# Convert continuous predictions to binary classifications:
# outputs >= threshold become class 1, everything else class 0.
CLASSIFICATION_THRESHOLD = 0.5
y_pred_binary = (y_pred >= CLASSIFICATION_THRESHOLD).astype(int)

print("Linear Regression Model Performance:")
print("=" * 40)

# Regression metrics (computed on the continuous outputs)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")

print("\nClassification Metrics:")
print("=" * 40)

# Classification metrics (computed on the thresholded predictions)
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Confusion Matrix
# Passing labels=[0, 1] guarantees a 2x2 matrix even when a class is absent
# from both y_test and the predictions; without it the matrix could be 1x1
# and the cell lookups below would fail.
cm = confusion_matrix(y_test, y_pred_binary, labels=[0, 1])
# Row = actual class, column = predicted class, so ravel() yields
# (TN, FP, FN, TP) in that order.
tn, fp, fn, tp = cm.ravel()
print("\nConfusion Matrix:")
print(f"True Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")


In [None]:
# Comprehensive visualization: a 3x4 grid of diagnostic plots for the
# thresholded linear-regression classifier.
from sklearn.metrics import auc, precision_recall_curve, roc_curve

plt.figure(figsize=(18, 12))

# Plot 1: Actual vs Predicted (Continuous)
plt.subplot(3, 4, 1)
plt.scatter(y_test, y_pred, color='purple', alpha=0.6)
# Reference diagonal: points on this line would be perfect predictions.
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linewidth=2)
plt.xlabel("Actual Outcome")
plt.ylabel("Predicted Outcome")
plt.title("Linear Regression: Actual vs Predicted")
plt.grid(True, alpha=0.3)

# Plot 2: Residuals distribution
plt.subplot(3, 4, 2)
residuals = y_test - y_pred
sns.histplot(residuals, bins=20, kde=True, color='orange')
plt.title('Residuals Distribution')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)

# Plot 3: Confusion Matrix Heatmap
plt.subplot(3, 4, 3)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Predicted 0', 'Predicted 1'],
            yticklabels=['Actual 0', 'Actual 1'])
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

# Plot 4: Classification Report Visualization
plt.subplot(3, 4, 4)
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
values = [accuracy, precision, recall, f1]
metric_colors = ['skyblue', 'lightgreen', 'lightcoral', 'gold']
bars = plt.bar(metrics, values, color=metric_colors, alpha=0.7)
plt.title('Classification Metrics')
plt.ylabel('Score')
plt.ylim(0, 1)
# Annotate each bar with its value, just above the bar top.
for bar, value in zip(bars, values):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f'{value:.3f}', ha='center', va='bottom')
plt.xticks(rotation=45)

# Plot 5: Prediction Distribution
# NOTE: y_pred holds raw linear-regression outputs, not calibrated
# probabilities, so values below 0 or above 1 can appear on this axis.
plt.subplot(3, 4, 5)
plt.hist(y_pred, bins=30, alpha=0.7, color='lightblue', edgecolor='black')
plt.axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='Threshold (0.5)')
plt.xlabel('Predicted Probability')
plt.ylabel('Frequency')
plt.title('Distribution of Predicted Probabilities')
plt.legend()
plt.grid(True, alpha=0.3)

# Plot 6: ROC Curve (simplified)
# The continuous outputs are used as ranking scores for the positive class.
plt.subplot(3, 4, 6)
fpr, tpr, _ = roc_curve(y_test, y_pred)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal = performance of a random classifier.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)

# Plot 7: Precision-Recall Curve
plt.subplot(3, 4, 7)
precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_pred)
plt.plot(recall_curve, precision_curve, color='blue', lw=2)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.grid(True, alpha=0.3)

# Plot 8: Threshold Analysis
# Sweep 100 candidate thresholds in [0, 1] and record the accuracy obtained
# when the continuous outputs are cut at each one.
plt.subplot(3, 4, 8)
thresholds = np.linspace(0, 1, 100)
accuracies = [
    accuracy_score(y_test, (y_pred >= cutoff).astype(int))
    for cutoff in thresholds
]
plt.plot(thresholds, accuracies, color='green', lw=2)
plt.axvline(x=0.5, color='red', linestyle='--', linewidth=2, label='Current Threshold')
plt.xlabel('Threshold')
plt.ylabel('Accuracy')
plt.title('Accuracy vs Threshold')
plt.legend()
plt.grid(True, alpha=0.3)

# Plot 9: Feature Importance
# Coefficients of the model fitted on standardized features, ordered by
# absolute magnitude; red bars are negative, blue bars are non-negative.
plt.subplot(3, 4, 9)
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': model.coef_
}).sort_values('Coefficient', key=abs, ascending=True)
coef_colors = ['red' if coef < 0 else 'blue' for coef in feature_importance['Coefficient']]
plt.barh(feature_importance['Feature'], feature_importance['Coefficient'], color=coef_colors, alpha=0.7)
plt.xlabel('Coefficient Value')
plt.title('Feature Importance')
plt.grid(True, alpha=0.3)

# Plot 10: Prediction vs Actual (Binary)
plt.subplot(3, 4, 10)
plt.scatter(y_test, y_pred_binary, alpha=0.6, color='purple')
plt.xlabel("Actual Outcome")
plt.ylabel("Predicted Outcome (Binary)")
plt.title("Binary Classification Results")
plt.grid(True, alpha=0.3)

# Plot 11: Error Analysis
plt.subplot(3, 4, 11)
errors = np.abs(y_test - y_pred)
plt.scatter(y_pred, errors, alpha=0.6, color='red')
plt.xlabel('Predicted Value')
plt.ylabel('Absolute Error')
plt.title('Error Analysis')
plt.grid(True, alpha=0.3)

# Plot 12: Class Distribution
# value_counts() orders by descending frequency, so the wedges are re-sorted
# by class label and the labels are derived from the index. This keeps each
# label attached to the right wedge regardless of which class is the
# majority, and also works when only one class is present.
plt.subplot(3, 4, 12)
class_counts = pd.Series(y_test).value_counts().sort_index()
pie_labels = [f'Class {int(label)}' for label in class_counts.index]
pie_colors = ['lightcoral', 'lightblue'][:len(class_counts)]
plt.pie(class_counts.values, labels=pie_labels, autopct='%1.1f%%',
        colors=pie_colors)
plt.title('Class Distribution in Test Set')

plt.tight_layout()
plt.show()
