In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import os
import sys

# Make the sibling "scripts" directory importable; the import that follows
# (linear_regression) is presumably resolved from there.
project_root = os.path.abspath("..")
scripts_dir = os.path.join(project_root, "scripts")
# Guard the append so re-running this cell does not stack duplicate
# entries on sys.path.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)
from linear_regression import LinearRegressionGD

In [None]:
# Load Dataset
data_path = "../data/insurance.csv"
df = pd.read_csv(data_path)

# Display dataset information
print("Data loaded successfully!")
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
df.info()
print("Columns:", df.columns)

# Check for missing values
print("\nMissing Values:\n", df.isnull().sum())

# Drop index column if it exists (errors='ignore' makes this a no-op otherwise).
# No `df is not None` guard is needed: read_csv either returns a DataFrame or
# raises, and df has already been used above.
df = df.drop(columns=['index'], errors='ignore')

In [None]:
# Drop index column if it exists (no-op when absent, so the cell is re-runnable)
df = df.drop(columns=['index'], errors='ignore')

# Feature Engineering
# BMI Categories
# right=False makes every bin closed on the left and open on the right:
# [0, 18.5), [18.5, 25), [25, 30), [30, inf). With pandas' default
# (right=True) a BMI of exactly 18.5, 25 or 30 would be assigned to the lower
# category, which is off by one against the usual BMI cut-offs.
df['bmi_category'] = pd.cut(
    df['bmi'],
    bins=[0, 18.5, 25, 30, float('inf')],
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
    right=False,
)

# Age × BMI interaction
df['age_bmi_interaction'] = df['age'] * df['bmi']

# Define features
categorical_features = ['sex', 'smoker', 'region', 'bmi_category']
# Not passed to the transformer directly: every column that is not listed in
# categorical_features reaches the output through remainder='passthrough'.
# NOTE(review): this assumes the frame holds no other columns besides
# 'charges' -- confirm against the CSV schema.
numerical_features = ['age', 'bmi', 'children', 'age_bmi_interaction']

# One-Hot Encoding
# Encoded columns come first in the output, passthrough columns after them.
# sparse_threshold=0 forces a dense ndarray regardless of how sparse the
# one-hot block turns out to be; downstream code assigns into column slices
# of the matrix, which needs a dense array.
ohe = ColumnTransformer(
    [('encoder', OneHotEncoder(drop='first'), categorical_features)],
    remainder='passthrough',
    sparse_threshold=0,
)

# Extract features and target
features = ohe.fit_transform(df.drop(columns=['charges']))
# The model is trained on log(charges); all downstream metrics are on that scale.
target = np.log(df['charges'].values)

# Print results
print(f"Features Shape: {features.shape}, Target Shape: {target.shape}")
print("\nFirst 5 Log-Transformed Target Values (y):")
print(target[:5])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# The one-hot columns occupy the leading positions of `features`; everything
# from this offset onwards is treated as numeric and gets standardized.
num_features_start = len(ohe.named_transformers_['encoder'].get_feature_names_out())
numeric_cols = slice(num_features_start, None)

# Fit the scaler on the training rows only, then reuse its statistics on the
# test rows so no test-set information leaks into the transform.
scaler = StandardScaler()
X_train[:, numeric_cols] = scaler.fit_transform(X_train[:, numeric_cols])
X_test[:, numeric_cols] = scaler.transform(X_test[:, numeric_cols])

# Report the resulting shapes
print(f"Training Set Shape: {X_train.shape}, Testing Set Shape: {X_test.shape}")

In [None]:
# Build the custom gradient-descent regressor.
# n_iter is the number of gradient-descent iterations/epochs.
try:
    lr = LinearRegressionGD(learning_rate=0.001, n_iter=5000)
except NameError:
    # Raised when the class name is not in scope (e.g. the import cell was not run).
    print("Error: LinearRegressionGD is not defined. Ensure it's correctly imported.")
else:
    print("Custom Linear Regression model imported successfully!")



In [None]:
# Train the custom gradient-descent model and scikit-learn's LinearRegression
# on the same split and compare their test-set MSE (on the log-charges scale).
try:
    # Train the custom model
    print("Training and evaluating custom model...")
    lr_custom = LinearRegressionGD(learning_rate=0.001, n_iter=5000)
    # Fit custom model
    lr_custom.fit(X_train, y_train)
    # Predict using custom model
    y_pred_custom = lr_custom.predict(X_test)
    # Calculate MSE for custom model
    mse_custom = mean_squared_error(y_test, y_pred_custom)

    # Train the scikit-learn model
    print("Training and evaluating scikit-learn model...")
    lr_sklearn = LinearRegression()
    # Fit scikit-learn model
    lr_sklearn.fit(X_train, y_train)
    # Predict using scikit-learn model
    y_pred_sklearn = lr_sklearn.predict(X_test)
    # Calculate MSE for scikit-learn model
    mse_sklearn = mean_squared_error(y_test, y_pred_sklearn)

    # Print comparison of model performances
    print("\nModel Performance Comparison:")
    print(f"Custom Model MSE on Test Set:            {mse_custom:.4f}")
    print(f"scikit-learn Linear Regression MSE:      {mse_sklearn:.4f}")

except Exception as e:
    print(f"An error occurred during model training/evaluation: {str(e)}")
    # Re-raise after reporting: swallowing the error would leave the
    # predictions/metrics undefined and make later cells fail with an
    # unrelated NameError instead of the real cause.
    raise
    


In [None]:
# Inspect the model that was actually fitted. `lr` is only constructed in this
# notebook and never trained; `lr_custom` is the instance fit() was called on.
# NOTE(review): assumes LinearRegressionGD records its per-iteration cost in a
# `costs` attribute -- confirm against scripts/linear_regression.py.
recorded_costs = getattr(lr_custom, 'costs', None)
if recorded_costs is not None and len(recorded_costs) > 0:
    print("Cost values exist, proceeding with plotting.")
else:
    print("No cost values recorded. Ensure the model has been trained.")


In [None]:
# Actual vs predicted (log scale) for both models on a single set of axes
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred_custom, alpha=0.5, label="Custom Model Predictions", color="blue")
ax.scatter(y_test, y_pred_sklearn, alpha=0.5, label="SKLearn Model Predictions", color="green")

# Diagonal y = x spanning the joint range of targets and predictions;
# points on this line are perfect predictions.
plotted_series = (y_test, y_pred_custom, y_pred_sklearn)
min_val = np.min([series.min() for series in plotted_series])
max_val = np.max([series.max() for series in plotted_series])
ax.plot([min_val, max_val], [min_val, max_val], linestyle='--', color='red', label="Perfect Predictions")

ax.set_xlabel('Actual Charges (log-transformed)')
ax.set_ylabel('Predicted Charges (log-transformed)')
ax.set_title('Actual vs Predicted Values: Custom vs SKLearn Models')
ax.legend()
plt.show()


In [None]:


# Side-by-side metrics for the custom and scikit-learn models
# (MSE values were computed when the models were trained; R² is computed here once
# per model and reused in the written report further down).
print("\nComparison of Models:")
print(f"Custom Model MSE: {mse_custom:.4f}")
print(f"scikit-learn Model MSE: {mse_sklearn:.4f}")
r2_custom = r2_score(y_test, y_pred_custom)
print(f"Custom Model R² Score: {r2_custom:.4f}")
r2_sklearn = r2_score(y_test, y_pred_sklearn)
print(f"scikit-learn R² Score: {r2_sklearn:.4f}")

# Predictions of both models against the actual (log-transformed) targets
fig_pred, ax_pred = plt.subplots(figsize=(10, 6))
ax_pred.scatter(y_test, y_pred_custom, alpha=0.5, label="Custom Model Predictions", color="blue")
ax_pred.scatter(y_test, y_pred_sklearn, alpha=0.5, label="scikit-learn Predictions", color="green")

# Line of perfect prediction (y = x) over the joint value range
plotted_series = (y_test, y_pred_custom, y_pred_sklearn)
min_val = np.min([series.min() for series in plotted_series])
max_val = np.max([series.max() for series in plotted_series])
ax_pred.plot([min_val, max_val], [min_val, max_val], linestyle='--', color='red', label="Perfect Prediction")

ax_pred.set_xlabel('Actual Charges (log-transformed)')
ax_pred.set_ylabel('Predicted Charges (log-transformed)')
ax_pred.set_title('Comparison of Predictions: Custom vs scikit-learn')
ax_pred.legend()
plt.show()

# Residuals (actual minus predicted) against predictions, one panel per model
residuals_custom = y_test - y_pred_custom
residuals_sklearn = y_test - y_pred_sklearn

fig_resid, (ax_custom, ax_sklearn) = plt.subplots(1, 2, figsize=(12, 4))
residual_panels = (
    (ax_custom, y_pred_custom, residuals_custom, 'Residual Plot - Custom Model'),
    (ax_sklearn, y_pred_sklearn, residuals_sklearn, 'Residual Plot - Scikit-learn Model'),
)
for panel, predicted, residuals, panel_title in residual_panels:
    panel.scatter(predicted, residuals, alpha=0.5)
    # Zero line: residuals above/below it are under-/over-predictions
    panel.axhline(y=0, color='r', linestyle='--')
    panel.set_xlabel('Predicted Values')
    panel.set_ylabel('Residuals')
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

# Analysis & Reporting: emitted line by line, in order
report_lines = (
    "\n--- Analysis & Reporting ---",
    "1. Performance Metrics:",
    "   - The custom Linear Regression model using gradient descent achieved:",
    "     * MSE: {:.4f}".format(mse_custom),
    "     * R² Score: {:.4f}".format(r2_custom),
    "   - The scikit-learn Linear Regression model achieved:",
    "     * MSE: {:.4f}".format(mse_sklearn),
    "     * R² Score: {:.4f}".format(r2_sklearn),
    "\n2. Visual Analysis:",
    "   - The prediction scatter plot shows both models' predictions against actual log-transformed charges.",
    "   - The residual plots help visualize the distribution of prediction errors and identify any patterns.",
    "   - Both models' predictions lie close to the perfect prediction line, indicating good performance.",
    "\n3. Discussion:",
    "   - The close MSE and R² values between models validate the custom implementation.",
    "   - The residual plots show whether the models' errors are randomly distributed or show systematic patterns.",
    "   - Minor differences may be due to differences in optimization details between implementations.",
    "   - The log-transformation of the target variable helps stabilize variance and improve model performance.",
    "\n4. Conclusion:",
    "   - Both models demonstrate similar performance across multiple metrics.",
    "   - The residual analysis provides additional confidence in the models' predictions.",
    "   - The exercise successfully validates the custom implementation against an established library.",
)
for report_line in report_lines:
    print(report_line)