# Lab 03: Linear Regression on Insurance Dataset

**Subject:** Machine Learning  
**Subject Teacher:** Dr. Abid Ali  
**Lab Supervisor:** Miss. Sana Saleem  
**Student:** [Your Name]

## Objectives
- Implement linear regression to predict insurance charges using a real-world dataset
- Explore and preprocess datasets, including handling missing values and outliers
- Apply data visualization techniques to understand data distributions and correlations
- Train and evaluate a linear regression model using performance metrics
- Use gradient descent to optimize model parameters and understand the cost function
- Analyze model performance using regression metrics (MSE, R²) and, for a thresholded high-vs-low charges task, classification metrics such as accuracy, precision, recall, and F1-score
- Visualize the importance of features in the model and analyze training loss over time


## Phase 1: Data Analysis and Preprocessing

### 1.1 Import Libraries and Load Dataset


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score
from scipy import stats
import warnings
# Suppress every warning category for the rest of the session to keep the
# notebook output clean.
# NOTE(review): this also hides deprecation and plotting warnings that can
# point at real problems — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set style for better plots
# NOTE(review): the 'seaborn-v0_8' style name is presumably only registered by
# newer Matplotlib releases — confirm it exists in the lab environment.
plt.style.use('seaborn-v0_8')
# Use seaborn's "husl" palette as the default colour cycle.
sns.set_palette("husl")


In [None]:
# Load the Insurance dataset from the CSV file next to the notebook
df = pd.read_csv('insurance[1].csv')

# Show the overall size and a preview of the first rows
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("\nDataset Info:")
df.info()

# Summary statistics of the numeric columns
print("\nBasic Statistics:")
print(df.describe())


### 1.2 Data Quality Assessment


In [None]:
# Data-quality pass: nulls, dtypes, duplicate rows, categorical levels.

# Per-column count of missing entries
print("Missing Values:")
null_counts = df.isnull().sum()
print(null_counts)

# Storage type pandas inferred for each column
print("\nData Types:")
print(df.dtypes)

# Number of rows that exactly repeat an earlier row
n_duplicate_rows = df.duplicated().sum()
print(f"\nDuplicate rows: {n_duplicate_rows}")

# Distinct labels present in each categorical column
# NOTE(review): assumes the CSV has a column literally named 'gender' —
# confirm against the file header.
print("\nUnique values in categorical columns:")
categorical_cols = ['gender', 'smoker', 'region']
for col in categorical_cols:
    levels = df[col].unique()
    print(f"{col}: {levels}")


### 1.3 Data Preprocessing


In [None]:
# Label-encode the categorical columns. Each column gets its own encoder so
# its label -> integer mapping stays available for inspection afterwards.
le_gender, le_smoker, le_region = LabelEncoder(), LabelEncoder(), LabelEncoder()

for source_col, encoder in (
    ('gender', le_gender),
    ('smoker', le_smoker),
    ('region', le_region),
):
    # Fit on the raw labels and store the integer codes in "<column>_encoded".
    df[f'{source_col}_encoded'] = encoder.fit_transform(df[source_col])

# Report the mapping each encoder learned (original label -> integer code).
print("Encoded values:")
for heading, encoder in (
    ("Gender", le_gender),
    ("Smoker", le_smoker),
    ("Region", le_region),
):
    mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    print(f"{heading}: {mapping}")

# Model inputs: numeric columns plus the encoded categoricals; target: charges.
feature_cols = ['age', 'gender_encoded', 'bmi', 'children', 'smoker_encoded', 'region_encoded']
X = df[feature_cols]
y = df['charges']

print(f"\nFeature matrix shape: {X.shape}")
print(f"Target variable shape: {y.shape}")


### 1.4 Outlier Analysis using Z-score


In [None]:
# Outlier analysis using Z-score: absolute z-score of every value in X.
z_scores = np.abs(stats.zscore(X))

print("Outliers (Z > 3):")
# np.where returns one (row, column) index pair per flagged cell, so this
# counts flagged cells rather than distinct rows.
outlier_indices = np.where(z_scores > 3)
print(f"Number of outliers: {outlier_indices[0].size}")

# Keep only the rows in which every feature lies within 3 standard deviations.
keep_row = (z_scores < 3).all(axis=1)
df_clean = df[keep_row]

retained_fraction = df_clean.shape[0] / df.shape[0]
print(f"Original shape: {df.shape}")
print(f"Shape after removing outliers: {df_clean.shape}")
print(f"Percentage of data retained: {retained_fraction * 100:.2f}%")


## Phase 2: Data Visualization

### 2.1 Correlation Matrix and Feature Distributions


In [None]:
# Feature matrix and target restricted to the outlier-free rows
X_clean = df_clean[feature_cols]
y_clean = df_clean['charges']

# Correlate the features together with the target so the heatmap also shows
# how strongly each feature tracks charges.
correlation_data = X_clean.copy()
correlation_data['charges'] = y_clean
corr_matrix = correlation_data.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, fmt='.3f')
plt.title('Correlation Matrix - Insurance Dataset')
plt.tight_layout()
plt.show()


In [None]:
# Feature distributions before and after outlier removal.
#
# DataFrame.hist() draws one subplot per column. Handing it a single Axes
# (ax=axes[0] / ax=axes[1]) makes pandas clear the whole figure to rebuild its
# own grid, so a second call wipes out the "before" panel and set_title() ends
# up targeting discarded axes. Each DataFrame therefore gets its own figure,
# titled with suptitle().
hist_cols = feature_cols + ['charges']
hist_panels = [
    (df, 'blue', 'Feature Distributions - Before Outlier Removal'),
    (df_clean, 'green', 'Feature Distributions - After Outlier Removal'),
]

for frame, color, title in hist_panels:
    # hist() creates a new figure of the requested size and returns its Axes grid.
    hist_axes = frame[hist_cols].hist(bins=20, color=color, alpha=0.7, figsize=(15, 10))
    fig = hist_axes.flat[0].get_figure()
    fig.suptitle(title)
    # Leave headroom at the top so the suptitle does not overlap the subplots.
    fig.tight_layout(rect=(0, 0, 1, 0.96))

plt.show()


## Phase 3: Linear Regression Implementation

### Lab Practice 1: Single Variable Linear Regression (Age vs Charges)


In [None]:
# Single variable linear regression: predict charges from age alone.
X_single = X_clean[['age']]  # double brackets keep a 2-D, one-column frame
y_single = y_clean

# Hold out 20% of the rows for testing; fixed seed for reproducibility.
split_single = train_test_split(X_single, y_single, test_size=0.2, random_state=42)
X_train_single, X_test_single, y_train_single, y_test_single = split_single

# Fit ordinary least squares on the training rows (fit() returns the model).
model_single = LinearRegression().fit(X_train_single, y_train_single)

# Score the held-out rows.
y_pred_single = model_single.predict(X_test_single)
mse_single = mean_squared_error(y_test_single, y_pred_single)
r2_single = r2_score(y_test_single, y_pred_single)

age_coefficient = model_single.coef_[0]
print("Single Variable Linear Regression (Age vs Charges):")
print(f"Mean Squared Error: {mse_single:.2f}")
print(f"R-squared: {r2_single:.4f}")
print(f"Coefficient (Age): {age_coefficient:.2f}")
print(f"Intercept: {model_single.intercept_:.2f}")


In [None]:
# Diagnostic plots for the single-variable model: fit quality and residuals.
fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: actual vs predicted, with the y = x line marking a perfect fit.
charge_range = [y_test_single.min(), y_test_single.max()]
ax_fit.scatter(y_test_single, y_pred_single, color='purple', alpha=0.6)
ax_fit.plot(charge_range, charge_range, color='red', linewidth=2, linestyle='--')
ax_fit.set_xlabel("Actual Charges")
ax_fit.set_ylabel("Predicted Charges")
ax_fit.set_title("Single Variable LR: Actual vs Predicted")
ax_fit.grid(True, alpha=0.3)

# Right panel: residuals (actual - predicted) against the prediction.
residuals_single = y_test_single - y_pred_single
ax_resid.scatter(y_pred_single, residuals_single, color='orange', alpha=0.6)
ax_resid.axhline(y=0, color='red', linestyle='--')
ax_resid.set_xlabel("Predicted Charges")
ax_resid.set_ylabel("Residuals")
ax_resid.set_title("Residuals Plot")
ax_resid.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


### Lab Practice 2: Multi-Variable Linear Regression with Feature Scaling


In [None]:
# Multi-variable linear regression on every feature, with standardization.
X_multi = X_clean
y_multi = y_clean

# Hold out 20% of the rows for testing; fixed seed for reproducibility.
split_multi = train_test_split(X_multi, y_multi, test_size=0.2, random_state=42)
X_train_multi, X_test_multi, y_train_multi, y_test_multi = split_multi

# Standardize using statistics learned from the training rows only, then
# apply that same transform to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_multi)
X_test_scaled = scaler.transform(X_test_multi)

# Fit on the scaled training data and score the held-out rows.
model_multi = LinearRegression().fit(X_train_scaled, y_train_multi)
y_pred_multi = model_multi.predict(X_test_scaled)

mse_multi = mean_squared_error(y_test_multi, y_pred_multi)
r2_multi = r2_score(y_test_multi, y_pred_multi)

print("Multi-Variable Linear Regression:")
print(f"Mean Squared Error: {mse_multi:.2f}")
print(f"R-squared: {r2_multi:.4f}")
print("\nFeature Coefficients:")
# Coefficients come back in the same order as the feature columns.
for feature, coef in zip(feature_cols, model_multi.coef_):
    print(f"{feature}: {coef:.4f}")
print(f"Intercept: {model_multi.intercept_:.2f}")


In [None]:
# Diagnostic plots for the multi-variable model: fit quality and residuals.
fig, (ax_fit, ax_resid) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: actual vs predicted, with the y = x line marking a perfect fit.
charge_range = [y_test_multi.min(), y_test_multi.max()]
ax_fit.scatter(y_test_multi, y_pred_multi, color='purple', alpha=0.6)
ax_fit.plot(charge_range, charge_range, color='red', linewidth=2, linestyle='--')
ax_fit.set_xlabel("Actual Charges")
ax_fit.set_ylabel("Predicted Charges")
ax_fit.set_title("Multi-Variable LR: Actual vs Predicted")
ax_fit.grid(True, alpha=0.3)

# Right panel: residuals (actual - predicted) against the prediction.
residuals_multi = y_test_multi - y_pred_multi
ax_resid.scatter(y_pred_multi, residuals_multi, color='orange', alpha=0.6)
ax_resid.axhline(y=0, color='red', linestyle='--')
ax_resid.set_xlabel("Predicted Charges")
ax_resid.set_ylabel("Residuals")
ax_resid.set_title("Residuals Plot")
ax_resid.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


### Lab Practice 3: Performance Metrics (Accuracy, Precision, Recall, F1-Score)


In [None]:
# Classification metrics need discrete labels, so charges are turned into a
# binary target: 1 when above the median charge, 0 otherwise.
charges_median = y_clean.median()
y_binary = (y_clean > charges_median).astype(int)

print(f"Median charges: ${charges_median:.2f}")
print(f"High charges (>median): {y_binary.sum()} samples")
print(f"Low charges (<=median): {y_binary.eq(0).sum()} samples")

# Same 80/20 split settings as the regression experiments.
split_binary = train_test_split(X_clean, y_binary, test_size=0.2, random_state=42)
X_train_binary, X_test_binary, y_train_binary, y_test_binary = split_binary

# Standardize with statistics from the training rows only.
scaler_binary = StandardScaler()
X_train_binary_scaled = scaler_binary.fit_transform(X_train_binary)
X_test_binary_scaled = scaler_binary.transform(X_test_binary)

# Fit linear regression directly on the 0/1 labels.
model_binary = LinearRegression().fit(X_train_binary_scaled, y_train_binary)

# The model outputs unbounded continuous scores (not true probabilities);
# scores of 0.5 or more are labelled as the "high charges" class.
y_pred_proba = model_binary.predict(X_test_binary_scaled)
y_pred_binary = (y_pred_proba >= 0.5).astype(int)

# Label-based metrics use the thresholded predictions; MSE and R-squared use
# the raw scores.
accuracy = accuracy_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary)
recall = recall_score(y_test_binary, y_pred_binary)
f1 = f1_score(y_test_binary, y_pred_binary)
mse_binary = mean_squared_error(y_test_binary, y_pred_proba)
r2_binary = r2_score(y_test_binary, y_pred_proba)

print("\nBinary Classification Metrics (High vs Low Charges):")
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("Mean Squared Error", mse_binary),
    ("R-squared", r2_binary),
):
    print(f"{metric_name}: {metric_value:.4f}")


### Lab Practice 4: Custom Gradient Descent and Cost Function


In [None]:
# Custom Gradient Descent Implementation
def compute_cost(X, y, theta, bias):
    """Return the halved mean squared error of a linear model.

    Computes ``J = 1 / (2 * m) * sum((X @ theta + bias - y) ** 2)`` with
    ``m = len(y)``. Note the 1/2 factor: this is half the plain MSE, which
    makes the gradient of ``J`` equal to ``(1 / m) * X.T @ (pred - y)``.

    Args:
        X: Feature matrix with one row per sample.
        y: Target values, one per sample.
        theta: Weight vector, one entry per feature column of ``X``.
        bias: Scalar intercept.

    Returns:
        The scalar cost ``J``.
    """
    m = len(y)
    residuals = np.dot(X, theta) + bias - y
    return (1 / (2 * m)) * np.sum(residuals ** 2)


def gradient_descent(X, y, theta, bias, learning_rate, epochs):
    """Fit linear-model parameters with batch gradient descent.

    Every epoch uses all samples to compute the gradient of the cost from
    ``compute_cost`` and takes one step of size ``learning_rate``. Progress
    is printed every 100 epochs.

    Args:
        X: Feature matrix with one row per sample.
        y: Target values, one per sample.
        theta: Initial weights. Left untouched; a float copy is optimized.
        bias: Initial scalar intercept.
        learning_rate: Step size applied to each gradient.
        epochs: Number of full passes over the data.

    Returns:
        Tuple ``(theta, bias, cost_history)``: the optimized weights, the
        optimized intercept, and the cost recorded after each epoch.
    """
    m = len(y)
    # Work on a private float copy. Updating the argument with `-=` would
    # silently overwrite the caller's initial weights, and would raise a
    # casting error if the caller passed an integer array.
    theta = np.array(theta, dtype=float)
    cost_history = []

    for epoch in range(epochs):
        # Prediction error of the current parameters on every sample
        errors = np.dot(X, theta) + bias - y

        # Gradients of the cost with respect to the weights and the bias
        d_theta = (1 / m) * np.dot(X.T, errors)
        d_bias = (1 / m) * np.sum(errors)

        # Step against the gradient
        theta -= learning_rate * d_theta
        bias -= learning_rate * d_bias

        # Record the cost after the update so it can be plotted later
        cost = compute_cost(X, y, theta, bias)
        cost_history.append(cost)

        if epoch % 100 == 0:
            print(f"Epoch {epoch}: Cost = {cost:.4f}")

    return theta, bias, cost_history

# Hyper-parameters and starting point for the custom optimizer
np.random.seed(42)  # reproducible random initial weights
m, n = X_train_scaled.shape
theta = np.random.randn(n)  # one random starting weight per feature
bias = 0.0
learning_rate = 0.01
epochs = 1000

# Cost of the untrained parameters, reported as a baseline.
initial_cost = compute_cost(X_train_scaled, y_train_multi, theta, bias)
print(f"Starting Gradient Descent with {epochs} epochs...")
print(f"Learning rate: {learning_rate}")
print(f"Initial cost: {initial_cost:.4f}")
print("\nTraining progress:")

# Optimize on the standardized training split of the multi-variable model.
gd_result = gradient_descent(
    X_train_scaled, y_train_multi, theta, bias, learning_rate, epochs
)
theta_optimized, bias_optimized, cost_history = gd_result

final_cost = cost_history[-1]
print(f"\nFinal cost: {final_cost:.4f}")
print(f"Final weights: {theta_optimized}")
print(f"Final bias: {bias_optimized:.4f}")


In [None]:
# Apply the parameters learned by the custom gradient descent to both splits.
y_pred_train_gd = np.dot(X_train_scaled, theta_optimized) + bias_optimized
y_pred_test_gd = np.dot(X_test_scaled, theta_optimized) + bias_optimized

# Score the held-out rows with the same metrics as the sklearn models.
mse_test_gd = mean_squared_error(y_test_multi, y_pred_test_gd)
r2_test_gd = r2_score(y_test_multi, y_pred_test_gd)

print("Custom Gradient Descent Results:")
print(f"Test MSE: {mse_test_gd:.2f}")
print(f"Test R-squared: {r2_test_gd:.4f}")

fig, (ax_cost, ax_fit) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: training cost recorded after each epoch.
ax_cost.plot(range(epochs), cost_history, color='blue', linewidth=2)
ax_cost.set_xlabel('Epochs')
ax_cost.set_ylabel('Cost (MSE)')
ax_cost.set_title('Cost Over Epochs (Gradient Descent)')
ax_cost.grid(True, alpha=0.3)

# Right panel: actual vs predicted, with the y = x line marking a perfect fit.
charge_range = [y_test_multi.min(), y_test_multi.max()]
ax_fit.scatter(y_test_multi, y_pred_test_gd, color='purple', alpha=0.6)
ax_fit.plot(charge_range, charge_range, color='red', linewidth=2, linestyle='--')
ax_fit.set_xlabel("Actual Charges")
ax_fit.set_ylabel("Predicted Charges")
ax_fit.set_title("Gradient Descent: Actual vs Predicted")
ax_fit.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## Phase 4: Results Summary and Analysis


In [None]:
# Summary of all models
separator = "=" * 60

print(separator)
print("LINEAR REGRESSION MODELS COMPARISON")
print(separator)
print("\n1. Single Variable (Age only):")
print(f"   MSE: {mse_single:.2f}")
print(f"   R²:  {r2_single:.4f}")

print("\n2. Multi-Variable (All features):")
print(f"   MSE: {mse_multi:.2f}")
print(f"   R²:  {r2_multi:.4f}")

print("\n3. Custom Gradient Descent:")
print(f"   MSE: {mse_test_gd:.2f}")
print(f"   R²:  {r2_test_gd:.4f}")

print("\n4. Binary Classification (High vs Low Charges):")
print(f"   Accuracy:  {accuracy:.4f}")
print(f"   Precision: {precision:.4f}")
print(f"   Recall:    {recall:.4f}")
print(f"   F1-Score:  {f1:.4f}")

# Index of the coefficient with the largest magnitude. The model was fit on
# standardized features, so coefficient magnitudes are comparable.
top_idx = np.argmax(np.abs(model_multi.coef_))
top_feature = feature_cols[top_idx]
top_coef = model_multi.coef_[top_idx]

print("\n" + separator)
print("KEY INSIGHTS:")
print(separator)
# Report the number of model inputs. df.shape[1] would also count the target,
# the raw categorical columns and the added *_encoded columns.
print(f"• Dataset contains {df.shape[0]} samples with {len(feature_cols)} features")
print(f"• After outlier removal: {df_clean.shape[0]} samples ({df_clean.shape[0]/df.shape[0]*100:.1f}% retained)")
print(f"• Most important features: {top_feature} (coef: {top_coef:.4f})")
print(f"• Multi-variable model performs {'better' if r2_multi > r2_single else 'worse'} than single variable model")
print(f"• Custom gradient descent {'matches' if abs(r2_test_gd - r2_multi) < 0.01 else 'differs from'} sklearn implementation")
print(separator)
