# Lab Practice 1: Single Variable Linear Regression

**Department of Electrical and Computer Engineering**  
**Pak-Austria Fachhochschule: Institute of Applied Sciences & Technology**  
**Subject: Machine Learning**  
**Subject Teacher: Dr. Abid Ali**  
**Lab Supervisor: Miss Sana Saleem**

## Objective
Implement linear regression with one variable using the Diabetes dataset to predict outcomes based on Glucose levels.

## Dataset
- **File**: diabetes.csv
- **Columns**: Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age, Outcome (the first eight are candidate input features; Outcome is the target)
- **Target Variable**: Outcome (0 or 1)
- **Selected Feature**: Glucose (single input variable)


## Step 1: Import Required Libraries


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats

print("Libraries imported successfully!")


## Step 2: Load and Explore the Dataset


In [None]:
# Load the Diabetes dataset.
#
# Passing only `names=columns` makes pandas treat EVERY line of the file as
# data. If the CSV ships with its own header line, that line would become a
# bogus first row, turn every column into strings, and make the statistics and
# missing-value report below meaningless. So we peek at the first line and skip
# it only when it really is the header; header-less files load as before.
url = "diabetes.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
           'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

# Read a single row without interpreting it, purely to inspect its contents.
first_row = pd.read_csv(url, header=None, nrows=1)
first_values = [str(value).strip() for value in first_row.iloc[0]]
has_header = first_values == columns

# header=0 tells pandas to replace the file's own header line with `columns`;
# header=None keeps every line as data (the original behaviour).
df = pd.read_csv(url, header=0 if has_header else None, names=columns)

# Display first few rows of the dataset
print("First 5 rows of the dataset:")
print(df.head())

# Basic statistics of the dataset
print("\nDataset Statistics:")
print(df.describe())

# Check for missing values
print("\nMissing Values:")
print(df.isnull().sum())


## Step 3: Data Preprocessing


In [None]:
# Convert all columns to numeric; anything that cannot be parsed becomes NaN
# and the affected rows are dropped.
df = df.apply(pd.to_numeric, errors='coerce')
df = df.dropna()

print(f"Dataset shape after cleaning: {df.shape}")

# Outlier analysis using Z-score for the selected feature 'Glucose'.
# A single boolean mask drives both the report and the filtering so the two can
# never disagree: previously outliers were reported as Z > 3 but rows were kept
# only for Z < 3, so a row with Z == 3 (or a NaN z-score) was silently dropped
# without being counted as an outlier.
z_scores = np.abs(stats.zscore(df['Glucose']))
is_outlier = np.asarray(z_scores) > 3

print("\nOutliers (Z > 3):")
# np.where yields row POSITIONS within the cleaned frame, not index labels.
outlier_indices = np.where(is_outlier)
print(f"Number of outliers: {len(outlier_indices[0])}")
print(f"Outlier indices: {outlier_indices[0]}")

# Remove outliers based on Z-score for 'Glucose' (exact complement of the mask)
df_clean = df[~is_outlier]
print(f"\nShape after removing outliers: {df_clean.shape}")

# Visualize Glucose distribution before and after outlier removal
plt.figure(figsize=(12, 5))

panels = [
    (df, 'red', 'Glucose Distribution (Before Outlier Removal)'),
    (df_clean, 'green', 'Glucose Distribution (After Outlier Removal)'),
]
for position, (frame, colour, title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(frame['Glucose'], bins=30, alpha=0.7, color=colour, edgecolor='black')
    plt.title(title)
    plt.xlabel('Glucose Level')
    plt.ylabel('Frequency')

plt.tight_layout()
plt.show()


## Step 4: Prepare Data for Single Variable Linear Regression


In [None]:
# Build the design matrix (one column: 'Glucose') and the target vector
# ('Outcome') from the outlier-free frame.
X = df_clean.loc[:, ['Glucose']]  # 2-D: a DataFrame with a single feature column
y = df_clean.loc[:, 'Outcome']    # 1-D: the target Series

for label, data in (("Feature", X), ("Target", y)):
    print(f"{label} shape: {data.shape}")

# Scatter plot of the raw relationship between the feature and the target
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(X['Glucose'], y, color='blue', alpha=0.6)
ax.set_xlabel('Glucose Level')
ax.set_ylabel('Diabetes Outcome (0=No, 1=Yes)')
ax.set_title('Glucose Level vs Diabetes Outcome')
ax.grid(True, alpha=0.3)
plt.show()

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")


## Step 5: Train Linear Regression Model


In [None]:
# Fit ordinary least squares on the training split (fit() returns the estimator)
model = LinearRegression().fit(X_train, y_train)

slope = model.coef_[0]
intercept = model.intercept_

print("Model trained successfully!")
print(f"Model coefficient (slope): {slope:.4f}")
print(f"Model intercept: {intercept:.4f}")

# Score the held-out test split
y_pred = model.predict(X_test)

# Error metrics on the test predictions
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nModel Performance:")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R-squared: {r2:.4f}")

# Mean absolute error, computed by hand from the per-sample absolute errors
absolute_errors = np.abs(y_test - y_pred)
mae = np.mean(absolute_errors)
print(f"Mean Absolute Error: {mae:.4f}")


## Step 6: Visualize Model Performance


In [None]:
# Visualizing model performance: three side-by-side diagnostic panels
plt.figure(figsize=(15, 5))

# Plot 1: Actual vs Predicted (the red diagonal marks perfect predictions)
plt.subplot(1, 3, 1)
plt.scatter(y_test, y_pred, color='purple', alpha=0.6)
actual_range = [y_test.min(), y_test.max()]
plt.plot(actual_range, actual_range, color='red', linewidth=2)
plt.xlabel("Actual Outcome")
plt.ylabel("Predicted Outcome")
plt.title("Linear Regression: Actual vs Predicted")
plt.grid(True, alpha=0.3)

# Plot 2: Residuals distribution (actual minus predicted on the test split)
plt.subplot(1, 3, 2)
residuals = y_test - y_pred
sns.histplot(residuals, bins=20, kde=True, color='orange')
plt.title('Residuals Distribution')
plt.xlabel('Residuals')
plt.ylabel('Frequency')
plt.grid(True, alpha=0.3)

# Plot 3: Regression line on scatter plot
plt.subplot(1, 3, 3)
plt.scatter(X_test['Glucose'], y_test, color='blue', alpha=0.6, label='Actual')
plt.scatter(X_test['Glucose'], y_pred, color='red', alpha=0.6, label='Predicted')

# Plot regression line over an evenly spaced grid spanning the test glucose range.
# The grid is wrapped in a DataFrame carrying the training column name before
# predicting: the model was fitted on a DataFrame, and handing it a bare array
# makes scikit-learn warn that "X does not have valid feature names".
X_line = np.linspace(X_test['Glucose'].min(), X_test['Glucose'].max(), 100).reshape(-1, 1)
y_line = model.predict(pd.DataFrame(X_line, columns=X_test.columns))
plt.plot(X_line, y_line, color='green', linewidth=2, label='Regression Line')

plt.xlabel('Glucose Level')
plt.ylabel('Diabetes Outcome')
plt.title('Regression Line Visualization')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## Step 7: Model Interpretation and Conclusions


In [None]:
# Model interpretation: print the fitted equation, a plain-language reading of
# its parameters, a few sample predictions, and a coarse performance summary.
slope = model.coef_[0]
intercept = model.intercept_

print("SINGLE VARIABLE LINEAR REGRESSION ANALYSIS")
print("=" * 50)
print(f"Model Equation: Outcome = {intercept:.4f} + {slope:.4f} * Glucose")
print("\nInterpretation:")
print(f"• For every 1 unit increase in Glucose level, the predicted outcome increases by {slope:.4f}")
print(f"• When Glucose level is 0, the predicted outcome is {intercept:.4f}")
print(f"• R² = {r2:.4f} means the model explains {r2*100:.2f}% of the variance in the outcome")
print(f"• Mean Squared Error: {mse:.4f}")

# Sample predictions.
# Predict all samples in one call, using a DataFrame that carries the training
# column name; a bare nested list makes scikit-learn warn that
# "X does not have valid feature names" because the model was fitted on a DataFrame.
print("\nSample Predictions:")
sample_glucose = [100, 150, 200]
sample_frame = pd.DataFrame({'Glucose': sample_glucose})
for glucose, prediction in zip(sample_glucose, model.predict(sample_frame)):
    print(f"Glucose Level: {glucose} → Predicted Outcome: {prediction:.4f}")

# Coarse qualitative labels derived from fixed thresholds
if r2 > 0.5:
    fit_quality = 'good'
elif r2 > 0.3:
    fit_quality = 'moderate'
else:
    fit_quality = 'poor'

# A small MSE means LOW error; the first branch previously said 'High', which
# made the best and the worst case print the same label.
if mse < 0.1:
    error_level = 'Low'
elif mse < 0.2:
    error_level = 'Moderate'
else:
    error_level = 'High'

# NOTE(review): the slope's magnitude depends on the units of Glucose, so the
# 0.01 cut-off is only a rough heuristic for "strong" vs "weak" — confirm intent.
relationship = 'strong' if abs(slope) > 0.01 else 'weak'

print("\nModel Performance Summary:")
print(f"• The model shows {fit_quality} performance")
print(f"• {error_level} prediction error")
print(f"• The relationship between Glucose and Diabetes outcome is {relationship}")
