# Lab Practice 2: Multi-Variable Linear Regression

**Department of Electrical and Computer Engineering**  
**Pak-Austria Fachhochschule: Institute of Applied Sciences & Technology**  
**Subject: Machine Learning**  
**Subject Teacher: Dr. Abid Ali**  
**Lab Supervisor: Miss Sana Saleem**

## Objective
Implement multiple linear regression on the Diabetes dataset: clean the data, standardize the features, fit the model, and evaluate it with error metrics and diagnostic plots.

## Dataset
- **File**: diabetes.csv
- **Features**: All features except Outcome (Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age)
- **Target Variable**: Outcome (0 or 1)
- **Preprocessing**: Feature scaling using StandardScaler


In [None]:
# Import necessary libraries
# Data handling
import pandas as pd
import numpy as np
# Plotting
import seaborn as sns
import matplotlib.pyplot as plt
# Modelling: data split, feature standardization, regression model, metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Z-scores for outlier detection and the Q-Q plot of the residuals
from scipy import stats

# Confirmation message once every import above has succeeded
print("Libraries imported successfully!")


In [None]:
# Load the Diabetes dataset
url = "diabetes.csv"  # path of the CSV file to read
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
           'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
df = pd.read_csv(url, names=columns)

# Passing `names=` makes pandas treat the file as header-less, so a header
# line that is present in the file is read as the first data record and every
# column ends up holding strings. Detect that case, drop the row and restore
# numeric dtypes so the shape, head() and describe() output below describe
# real samples only. Files without a header line are left untouched.
if not df.empty and df.iloc[0].astype(str).str.strip().tolist() == columns:
    df = df.iloc[1:].reset_index(drop=True)
    df = df.apply(pd.to_numeric, errors='coerce')

print("Dataset loaded successfully!")
print(f"Dataset shape: {df.shape}")
print("\nFirst 5 rows:")
print(df.head())

print("\nDataset Statistics:")
print(df.describe())

print("\nMissing Values:")
print(df.isnull().sum())


In [None]:
# Data preprocessing: coerce every column to a numeric dtype (unparseable
# values become NaN) and discard the rows that contain any NaN.
df = df.apply(pd.to_numeric, errors='coerce').dropna()

print(f"Dataset shape after cleaning: {df.shape}")

# Pairwise correlations between all columns, drawn as an annotated heatmap
correlation_matrix = df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    square=True,
)
plt.title('Correlation Matrix of All Features')
plt.tight_layout()
plt.show()

# One histogram per column
df.hist(figsize=(15, 12), bins=20, alpha=0.7, color='blue')
plt.suptitle('Feature Distributions', fontsize=16)
plt.tight_layout()
plt.show()


In [None]:
# Outlier detection and removal using Z-score
# Z-scores are computed on the feature columns only. 'Outcome' is a 0/1 label,
# not a measurement: scoring it would delete the whole minority class whenever
# that class is rare enough for its z-score to exceed the threshold.
feature_columns = df.columns.drop('Outcome')
z_scores = np.abs(stats.zscore(df[feature_columns]))

# A single mask drives both the report and the filter so they always agree.
# A NaN z-score (e.g. from a constant column) compares False and is therefore
# not treated as an outlier.
outlier_mask = np.asarray(z_scores > 3)
outlier_rows = outlier_mask.any(axis=1)

print("Outliers (Z > 3):")
outlier_indices = np.where(outlier_mask)
# Counts individual outlying values; one row can contribute several of them
print(f"Number of outliers: {len(outlier_indices[0])}")
print(f"Rows containing at least one outlier: {int(outlier_rows.sum())}")

# Remove outliers (copy so later edits never write into a view of df)
df_clean = df.loc[~outlier_rows].copy()
print(f"Shape after removing outliers: {df_clean.shape}")

# Visualize distributions after outlier removal
df_clean.hist(bins=20, figsize=(15, 12), color='green', alpha=0.7)
plt.suptitle('Distributions After Removing Outliers', fontsize=16)
plt.tight_layout()
plt.show()

# Prepare features and target
X = df_clean.drop(columns='Outcome')
y = df_clean['Outcome']

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"Feature names: {list(X.columns)}")


In [None]:
# Hold out 20% of the samples as a test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# Standardization: the scaler is fitted on the training split only and the
# fitted transform is then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nScaled training set shape: {X_train_scaled.shape}")
print(f"Scaled test set shape: {X_test_scaled.shape}")

# Scaling effect: box plots before/after, then a pooled histogram
plt.figure(figsize=(15, 5))

tick_positions = range(1, len(X.columns) + 1)
boxplot_panels = [
    (1, X_train.values, 'Features Before Scaling', 'Values'),
    (2, X_train_scaled, 'Features After Scaling', 'Scaled Values'),
]
for panel_index, panel_data, panel_title, panel_ylabel in boxplot_panels:
    plt.subplot(1, 3, panel_index)
    plt.boxplot(panel_data)
    plt.title(panel_title)
    plt.ylabel(panel_ylabel)
    plt.xticks(tick_positions, X.columns, rotation=45)

plt.subplot(1, 3, 3)
plt.hist(X_train_scaled.ravel(), bins=50, alpha=0.7, color='green')
plt.title('Distribution of All Scaled Features')
plt.xlabel('Scaled Values')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()


In [None]:
# Fit the linear regression model on the standardized training features
model = LinearRegression().fit(X_train_scaled, y_train)

print("Model trained successfully!")
print(f"Model intercept: {model.intercept_:.4f}")
print(f"Number of features: {len(model.coef_)}")

# Coefficient table, largest magnitude first
feature_importance = pd.DataFrame(
    {'Feature': X.columns, 'Coefficient': model.coef_}
)
feature_importance = feature_importance.sort_values(
    by='Coefficient', key=abs, ascending=False
)

print("\nFeature Coefficients (sorted by absolute value):")
print(feature_importance)

# Predictions for the held-out test split
y_pred = model.predict(X_test_scaled)

# Error metrics on the test split
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = np.abs(y_test - y_pred).mean()

print("\nModel Performance:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")


In [None]:
# Model diagnostics: a 2x3 grid of plots drawn on one figure
plt.figure(figsize=(15, 10))

# Panel 1: actual vs predicted, with the identity line as reference
plt.subplot(2, 3, 1)
plt.scatter(y_test, y_pred, color='purple', alpha=0.6)
actual_range = [y_test.min(), y_test.max()]
plt.plot(actual_range, actual_range, color='red', linewidth=2)
plt.xlabel("Actual Outcome")
plt.ylabel("Predicted Outcome")
plt.title("Linear Regression: Actual vs Predicted")
plt.grid(True, alpha=0.3)

# Panel 2: histogram (with KDE) of the test-set residuals
residuals = y_test - y_pred
plt.subplot(2, 3, 2)
sns.histplot(residuals, bins=20, kde=True, color='orange')
plt.title('Residuals Distribution')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)

# Panel 3: coefficients as horizontal bars, ordered by magnitude;
# negative coefficients are red, the others blue
plt.subplot(2, 3, 3)
feature_importance_sorted = feature_importance.sort_values(
    by='Coefficient', key=abs, ascending=True
)
colors = np.where(
    feature_importance_sorted['Coefficient'] < 0, 'red', 'blue'
).tolist()
bars = plt.barh(
    feature_importance_sorted['Feature'],
    feature_importance_sorted['Coefficient'],
    color=colors,
    alpha=0.7,
)
plt.xlabel('Coefficient Value')
plt.title('Feature Importance in Linear Regression')
plt.grid(True, alpha=0.3)

# Plot 4: Training loss per epoch (batch gradient descent)
# LinearRegression solves the least-squares problem in one non-iterative fit,
# so refitting it inside a loop returns the same MSE every time and draws a
# flat line. To show an actual learning curve, the same linear model is trained
# here with batch gradient descent on the scaled training data, recording the
# training MSE before each parameter update.
plt.subplot(2, 3, 4)
epochs = 500
learning_rate = 0.05  # small step size, chosen for stable updates on standardized inputs

gd_features = np.asarray(X_train_scaled, dtype=float)
gd_target = np.asarray(y_train, dtype=float)
n_samples, n_features = gd_features.shape

weights = np.zeros(n_features)
bias = 0.0
train_errors = []
for _ in range(epochs):
    gd_residuals = gd_features @ weights + bias - gd_target
    train_errors.append(float(np.mean(gd_residuals ** 2)))
    # Gradient of the mean squared error with respect to weights and bias
    weights -= learning_rate * (2.0 / n_samples) * (gd_features.T @ gd_residuals)
    bias -= learning_rate * (2.0 / n_samples) * gd_residuals.sum()

# Training MSE of the closed-form least-squares fit, fitted once: the level
# the gradient-descent curve should approach.
model_temp = LinearRegression()
model_temp.fit(X_train_scaled, y_train)
ols_train_mse = mean_squared_error(y_train, model_temp.predict(X_train_scaled))

plt.plot(range(epochs), train_errors, color='blue', label='Gradient descent')
plt.axhline(y=ols_train_mse, color='red', linestyle='--', label='Least-squares solution')
plt.xlabel('Epochs')
plt.ylabel('Training Error (MSE)')
plt.title('Training Error Over Epochs')
plt.legend()
plt.grid(True, alpha=0.3)

# Panel 5: residuals against predictions, with a dashed zero line
plt.subplot(2, 3, 5)
plt.scatter(y_pred, residuals, color='green', alpha=0.6)
plt.axhline(y=0, linestyle='--', color='red')
plt.title('Residuals vs Predicted')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.grid(True, alpha=0.3)

# Panel 6: normal Q-Q plot of the residuals
from scipy import stats
plt.subplot(2, 3, 6)
stats.probplot(residuals, dist="norm", plot=plt)
# Set after probplot so this title replaces the one probplot draws
plt.title('Q-Q Plot of Residuals')
plt.grid(True, alpha=0.3)

# Lay out and render the whole figure
plt.tight_layout()
plt.show()
