# Lab 02: Linear Regression Analysis on Insurance Dataset

**Department Of Electrical & Computer Engineering**  
**Machine Learning Lab**  
**Lab Coordinator: Ms. Sana Saleem**  
**Course Instructor: Dr. Abid Ali**  
**Program: BS(AI) -F23, Semester: 5th**  
**Deadline: 13th September 2025**

## Objective
Implement a linear regression model on an insurance dataset to predict medical charges based on various features. Analyze the dataset, implement the model, evaluate its performance, and visualize the results.

## Dataset Information
- **Dataset**: insurance.csv
- **Target Variable**: charges (medical insurance charges)
- **Features**: age, gender, bmi, children, smoker, region
- **Total Records**: 1,338


## 1. Import Required Libraries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Set style for better plots
# NOTE: the 'seaborn-v0_8' style-sheet name is only available in Matplotlib >= 3.6
# (older releases call it 'seaborn') -- confirm the lab environment's version.
plt.style.use('seaborn-v0_8')
# Set the palette *after* the style sheet: presumably the style sheet also defines a
# colour cycle, so applying it second would override the "husl" palette chosen here.
sns.set_palette("husl")

# Simple confirmation that the import/setup cell ran to completion
print("Libraries imported successfully!")


## 2. Load and Explore the Dataset


In [None]:
# Read the insurance records from the CSV file in the working directory
df = pd.read_csv('insurance.csv')

# Report the (rows, columns) shape, then preview the first records.
# The bare expression on the last line is what the notebook renders as a table.
print(f"Dataset Shape: {df.shape}")
print("\nFirst 5 rows:")
df.head(n=5)


In [None]:
# Structural overview: column names, dtypes and non-null counts (printed by info())
print("Dataset Information:", "=" * 50, sep="\n")
df.info()

# Summary statistics of the numeric columns; left as the final expression so
# the notebook displays the resulting table.
print("\n\nDataset Description:", "=" * 50, sep="\n")
df.describe()


In [None]:
# Count missing entries per column and express them as a share of all rows
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

# Print both tables under matching banner headings
for heading, table in (
    ("Missing Values:", missing_values),
    ("\nMissing Values Percentage:", missing_percentage),
):
    print(heading)
    print("=" * 30)
    print(table)


## 3. Data Visualization and Analysis


In [None]:
# Target variable (charges): raw histogram next to its log-transformed histogram
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 5))

ax_raw.hist(df['charges'], bins=30, alpha=0.7, color='skyblue', edgecolor='black')
ax_raw.set_title('Distribution of Medical Charges')
ax_raw.set_xlabel('Charges ($)')
ax_raw.set_ylabel('Frequency')

# The natural log compresses the long right tail of the charges
ax_log.hist(np.log(df['charges']), bins=30, alpha=0.7, color='lightcoral', edgecolor='black')
ax_log.set_title('Distribution of Log-Transformed Charges')
ax_log.set_xlabel('Log(Charges)')
ax_log.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Headline statistics of the target, all formatted as dollar amounts
charges = df['charges']
print("Charges Statistics:")
for label, value in (
    ("Mean", charges.mean()),
    ("Median", charges.median()),
    ("Standard Deviation", charges.std()),
    ("Min", charges.min()),
    ("Max", charges.max()),
):
    print(f"{label}: ${value:.2f}")


In [None]:
# Bar charts of the category counts, one panel per column in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

panels = [
    # (column, grid position, bar colour, title, x-axis label)
    ('gender', (0, 0), 'lightblue', 'Gender Distribution', 'Gender'),
    ('smoker', (0, 1), 'lightgreen', 'Smoker Distribution', 'Smoker'),
    ('region', (1, 0), 'lightcoral', 'Region Distribution', 'Region'),
    ('children', (1, 1), 'lightyellow', 'Children Distribution', 'Number of Children'),
]

for column, position, colour, title, xlabel in panels:
    ax = axes[position]
    counts = df[column].value_counts()
    if column == 'children':
        # Show 0, 1, 2, ... children in numeric order rather than by frequency
        counts = counts.sort_index()
    counts.plot(kind='bar', ax=ax, color=colour)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    if column == 'region':
        # Region names are long; tilt the tick labels
        ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Numerical features: histograms on the top row, scatter plots against charges below
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top row -- distribution of age and BMI
for ax, column, label, colour in zip(axes[0], ('age', 'bmi'), ('Age', 'BMI'), ('skyblue', 'lightgreen')):
    ax.hist(df[column], bins=20, alpha=0.7, color=colour, edgecolor='black')
    ax.set_title(f'{label} Distribution')
    ax.set_xlabel(label)
    ax.set_ylabel('Frequency')

# Bottom row -- each feature plotted against the target
for ax, column, label, colour in zip(axes[1], ('age', 'bmi'), ('Age', 'BMI'), ('coral', 'purple')):
    ax.scatter(df[column], df['charges'], alpha=0.6, color=colour)
    ax.set_title(f'{label} vs Charges')
    ax.set_xlabel(label)
    ax.set_ylabel('Charges ($)')

plt.tight_layout()
plt.show()


In [None]:
# Correlation analysis on a scratch copy so the original frame stays untouched
df_corr = df.copy()

# Integer-code the three text columns into new *_encoded columns so that
# they can take part in the correlation matrix
le_gender, le_smoker, le_region = LabelEncoder(), LabelEncoder(), LabelEncoder()
for column, encoder in (('gender', le_gender), ('smoker', le_smoker), ('region', le_region)):
    df_corr[f'{column}_encoded'] = encoder.fit_transform(df_corr[column])

# Pairwise correlations between the numeric/encoded columns and the target
numerical_cols = ['age', 'bmi', 'children', 'gender_encoded', 'smoker_encoded', 'region_encoded', 'charges']
correlation_matrix = df_corr[numerical_cols].corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
)
plt.title('Correlation Matrix of All Variables')
plt.tight_layout()
plt.show()

# Rank every column by its correlation with the target, strongest positive first
print("\nCorrelation with Charges (Target Variable):", "=" * 50, sep="\n")
charges_corr = correlation_matrix['charges'].sort_values(ascending=False)
print(charges_corr)


## 4. Data Preprocessing


In [None]:
# Work on a copy so the raw DataFrame remains available for later reporting
df_processed = df.copy()

# One encoder per text column; the encoders are kept in module-level names
# because later cells use them to decode the values for display.
le_gender = LabelEncoder()
le_smoker = LabelEncoder()
le_region = LabelEncoder()
encoders = {'gender': le_gender, 'smoker': le_smoker, 'region': le_region}

# Replace each text column in place with its integer codes.
# NOTE(review): integer-coding 'region' gives a nominal category an arbitrary
# numeric order that a linear model will treat as meaningful -- consider
# one-hot encoding; confirm against the lab requirements.
for column, encoder in encoders.items():
    df_processed[column] = encoder.fit_transform(df_processed[column])

print("Categorical variables encoded successfully!")
print("\nEncoded values:")
for column, encoder in encoders.items():
    # Map every original class label to the integer code assigned to it
    mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    print(f"{column.capitalize()}: {mapping}")

print("\nFirst 5 rows after encoding:")
df_processed.head()


In [None]:
# Separate the target column (charges) from the predictor columns
y = df_processed['charges']
X = df_processed.drop(columns='charges')

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"\nFeature columns: {list(X.columns)}")

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"\nTraining set size: {len(X_train)} samples")
print(f"Test set size: {len(X_test)} samples")
print(f"Training set features: {X_train.shape[1]} features")


In [None]:
# Standardize the features (not required for ordinary least squares, but it puts
# the coefficients on a comparable scale). The scaler is fitted on the training
# rows only and then applied to both splits, so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Features scaled successfully!")
for split_name, matrix in (("training", X_train_scaled), ("test", X_test_scaled)):
    print(f"Scaled {split_name} set shape: {matrix.shape}")


## 5. Linear Regression Model Implementation


In [None]:
# Fit an ordinary least-squares model on the standardized training features
lr_model = LinearRegression().fit(X_train_scaled, y_train)

print("Linear Regression model trained successfully!")
print(f"Model coefficients: {lr_model.coef_}")
print(f"Model intercept: {lr_model.intercept_:.2f}")

# Pair each feature name with its coefficient, then order the table by
# absolute coefficient size (largest effect first)
feature_importance = pd.DataFrame({'Feature': X.columns, 'Coefficient': lr_model.coef_})
feature_importance = feature_importance.sort_values('Coefficient', key=abs, ascending=False)

print("\nFeature Importance (by coefficient magnitude):", "=" * 50, sep="\n")
print(feature_importance)


In [None]:
# Predict charges for both splits with the fitted model
y_train_pred, y_test_pred = (
    lr_model.predict(features) for features in (X_train_scaled, X_test_scaled)
)

print("Predictions made successfully!")
for split_name, predictions in (("Training", y_train_pred), ("Test", y_test_pred)):
    print(f"{split_name} predictions shape: {predictions.shape}")


## 6. Model Evaluation


In [None]:
# Calculate evaluation metrics
def calculate_metrics(y_true, y_pred, dataset_name):
    """Compute, print and return regression metrics for one data split.

    Args:
        y_true: Observed target values.
        y_pred: Model predictions aligned with ``y_true``.
        dataset_name: Label used in the printed heading (e.g. "Training").

    Returns:
        Tuple ``(mse, rmse, mae, r2)``.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"\n{dataset_name} Metrics:")
    print("=" * 30)
    # MSE is expressed in squared target units, so it must not carry a plain
    # currency prefix like RMSE and MAE do.
    print(f"Mean Squared Error (MSE): {mse:.2f} ($²)")
    print(f"Root Mean Squared Error (RMSE): ${rmse:.2f}")
    print(f"Mean Absolute Error (MAE): ${mae:.2f}")
    print(f"R-squared (R²): {r2:.4f}")

    return mse, rmse, mae, r2

# Score the model on the rows it was fitted on ...
train_mse, train_rmse, train_mae, train_r2 = calculate_metrics(y_train, y_train_pred, "Training")

# ... and on the held-out rows
test_mse, test_rmse, test_mae, test_r2 = calculate_metrics(y_test, y_test_pred, "Test")

# Side-by-side summary; the R² gap between the splits is reported as the overfitting check
r2_gap = train_r2 - test_r2
print("\n\nModel Performance Summary:", "=" * 50, sep="\n")
print(f"Training R²: {train_r2:.4f}", f"Test R²: {test_r2:.4f}", sep="\n")
print(f"Overfitting Check (Train R² - Test R²): {r2_gap:.4f}")


## 7. Visualization of Results


In [None]:
# Actual-vs-predicted scatter for each split, with the y = x reference line
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

panels = (
    # (axes, heading, observed values, fitted values, R², marker colour)
    (axes[0], 'Training Set', y_train, y_train_pred, train_r2, 'blue'),
    (axes[1], 'Test Set', y_test, y_test_pred, test_r2, 'green'),
)

for ax, heading, observed, fitted, score, colour in panels:
    ax.scatter(observed, fitted, alpha=0.6, color=colour)
    # Dashed diagonal spanning the observed range: points on it are perfect predictions
    low, high = observed.min(), observed.max()
    ax.plot([low, high], [low, high], 'r--', lw=2)
    ax.set_xlabel('Actual Charges ($)')
    ax.set_ylabel('Predicted Charges ($)')
    ax.set_title(f'{heading}: Actual vs Predicted\nR² = {score:.4f}')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Residuals (actual minus predicted) for each split
train_residuals = y_train - y_train_pred
test_residuals = y_test - y_test_pred

# Residuals plotted against the predicted value, one panel per split
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

panels = (
    # (axes, heading, fitted values, residuals, marker colour)
    (axes[0], 'Training Set', y_train_pred, train_residuals, 'blue'),
    (axes[1], 'Test Set', y_test_pred, test_residuals, 'green'),
)

for ax, heading, fitted, residuals, colour in panels:
    ax.scatter(fitted, residuals, alpha=0.6, color=colour)
    # Zero line: residuals above it are under-predictions, below it over-predictions
    ax.axhline(y=0, color='r', linestyle='--')
    ax.set_xlabel('Predicted Charges ($)')
    ax.set_ylabel('Residuals ($)')
    ax.set_title(f'{heading}: Residual Plot')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


In [None]:
# Horizontal bar chart of the model coefficients
plt.figure(figsize=(10, 6))

# Ascending absolute size, so the largest bar is drawn last (top of the chart)
feature_importance_sorted = feature_importance.sort_values('Coefficient', key=abs, ascending=True)
coefficients = feature_importance_sorted['Coefficient']

# Negative coefficients in red, all others in blue
colors = ['red' if coefficient < 0 else 'blue' for coefficient in coefficients]

bars = plt.barh(feature_importance_sorted['Feature'], coefficients, color=colors, alpha=0.7)
plt.xlabel('Coefficient Value')
plt.title('Feature Importance in Linear Regression Model')
plt.grid(True, alpha=0.3)

# Write each coefficient just beyond the end of its bar, on the side the bar points to
for bar in bars:
    width = bar.get_width()
    points_right = width >= 0
    plt.text(
        width + (0.1 if points_right else -0.1),
        bar.get_y() + bar.get_height() / 2,
        f'{width:.2f}',
        ha='left' if points_right else 'right',
        va='center',
    )

plt.tight_layout()
plt.show()


## 8. Model Interpretation and Analysis


In [None]:
# Detailed model interpretation
#
# The model was fitted on standardized features, so lr_model.coef_ measures the
# change in charges per ONE STANDARD DEVIATION of a feature, not per unit.
# Convert back to the original feature units so that the printed equation and
# the "per unit increase" statements are literally true:
#   y = b0 + sum_j c_j * (x_j - mean_j) / scale_j
#     = (b0 - sum_j c_j * mean_j / scale_j) + sum_j (c_j / scale_j) * x_j
unit_coefs = lr_model.coef_ / scaler.scale_
unit_intercept = lr_model.intercept_ - np.sum(unit_coefs * scaler.mean_)

print("Linear Regression Model Interpretation:")
print("=" * 50)
print(f"Model Equation (original feature units): charges = {unit_intercept:.2f}")

for feature, coef in zip(X.columns, unit_coefs):
    print(f" + ({coef:.2f}) * {feature}")

print("\n\nFeature Impact Analysis:")
print("=" * 50)
print("(categorical features are measured in their integer codes)")
for feature, coef in zip(X.columns, unit_coefs):
    impact = "increases" if coef > 0 else "decreases"
    print(f"• {feature}: {impact} charges by ${abs(coef):.2f} per unit increase")

print("\n\nModel Performance Analysis:")
print("=" * 50)
print(f"• The model explains {test_r2*100:.2f}% of the variance in medical charges")
# RMSE is the root of the mean squared error, not the average error (that is MAE)
print(f"• Root mean squared prediction error (RMSE): ${test_rmse:.2f}")
print(f"• Mean absolute error: ${test_mae:.2f}")

# A train R² more than 0.1 above the test R² is flagged as possible overfitting
if train_r2 - test_r2 > 0.1:
    print("• Warning: Model may be overfitting (large difference between train and test R²)")
else:
    print("• Model shows good generalization (similar train and test performance)")


## 9. Sample Predictions


In [None]:
# Sample predictions on test set
# A seeded generator keeps the sampled rows identical between runs (matching the
# seeded train/test split), and the sample size is capped so that a test set
# with fewer than 10 rows does not make choice(..., replace=False) raise.
rng = np.random.default_rng(42)
n_samples = min(10, len(X_test))
sample_indices = rng.choice(len(X_test), size=n_samples, replace=False)

# sample_indices are positions within the test split
sample_X = X_test.iloc[sample_indices]
sample_y_actual = y_test.iloc[sample_indices]
sample_y_pred = y_test_pred[sample_indices]

print("Sample Predictions on Test Set:")
print("=" * 60)
print(f"{'Index':<6} {'Actual':<12} {'Predicted':<12} {'Error':<12} {'Error %':<10}")
print("-" * 60)

# Print the DataFrame index label of each row (not its position in the test
# split) so the rows can be matched with the feature table printed below.
for row_label, actual, pred in zip(sample_X.index, sample_y_actual, sample_y_pred):
    error = actual - pred
    error_pct = (error / actual) * 100
    print(f"{row_label:<6} ${actual:<11.2f} ${pred:<11.2f} ${error:<11.2f} {error_pct:<9.2f}%")

print("\n\nFeature Values for Sample Predictions:")
print("=" * 60)
sample_X_display = sample_X.copy()
# Decode categorical variables for display
sample_X_display['gender'] = le_gender.inverse_transform(sample_X_display['gender'])
sample_X_display['smoker'] = le_smoker.inverse_transform(sample_X_display['smoker'])
sample_X_display['region'] = le_region.inverse_transform(sample_X_display['region'])
print(sample_X_display)


## 10. Conclusions and Insights


In [None]:
# Final written summary of the lab. Sections 2, 4, 5 and 6 are fixed narrative
# text; sections 1 and 3 are filled in from the data and the computed metrics.
print("LAB 02 ANALYSIS CONCLUSIONS")
print("=" * 50)

print("\n1. DATASET OVERVIEW:")
print(f"   • Total records: {len(df)} insurance claims")
print(f"   • Features: {len(X.columns)} (age, gender, bmi, children, smoker, region)")
print(f"   • Target: Medical charges (mean: ${df['charges'].mean():.2f})")
# Report missing values from the data instead of asserting that there are none
total_missing = int(df.isnull().sum().sum())
if total_missing == 0:
    print("   • No missing values found")
else:
    print(f"   • Missing values found: {total_missing}")

print("\n2. KEY FINDINGS:")
print("   • Smoking status has the strongest impact on charges")
print("   • Age and BMI are significant predictors")
print("   • Gender and region have smaller but measurable effects")
print("   • Number of children has minimal impact")

print("\n3. MODEL PERFORMANCE:")
print(f"   • R² Score: {test_r2:.4f} ({test_r2*100:.2f}% variance explained)")
# RMSE is the root of the mean squared error; the average absolute error is MAE
print(f"   • RMSE: ${test_rmse:.2f} (root mean squared prediction error)")
print(f"   • MAE: ${test_mae:.2f} (mean absolute error)")

print("\n4. BUSINESS INSIGHTS:")
print("   • Smokers pay significantly more for insurance")
print("   • Older individuals and those with higher BMI have higher charges")
print("   • The model can help predict insurance costs for new customers")
print("   • Risk assessment can be improved using these factors")

print("\n5. MODEL LIMITATIONS:")
print("   • Linear regression assumes linear relationships")
print("   • May not capture complex interactions between features")
print(f"   • R² of {test_r2:.4f} means {100-test_r2*100:.2f}% variance unexplained")
print("   • Could benefit from feature engineering or non-linear models")

print("\n6. RECOMMENDATIONS:")
print("   • Consider polynomial features for non-linear relationships")
print("   • Try other algorithms (Random Forest, XGBoost) for comparison")
print("   • Collect more data to improve model accuracy")
print("   • Feature engineering could improve performance")

print("\n" + "=" * 50)
print("LAB 02 ANALYSIS COMPLETED SUCCESSFULLY!")
print("=" * 50)
