# Height, Weight, and Age Dataset with Multiple Regression

This notebook demonstrates the relationship between height, weight, and age using a synthetic dataset with multiple regression analysis and 3D visualization.


## Configuration Parameters

This section contains adjustable parameters for the dataset generation and visualization.


In [None]:
# Tunable settings for dataset generation and plotting.
# Every later cell reads its parameters from this single dictionary.
config = dict(
    # --- Dataset ---
    n_samples=200,  # number of synthetic samples to generate
    random_seed=42,  # seed for NumPy's RNG (reproducibility)
    # --- Height distribution ---
    height_mean=170,  # mean height in cm
    height_std=10,  # standard deviation of height
    # --- Age distribution ---
    age_min=18,  # minimum age
    age_max=80,  # maximum age
    # --- Weight model ---
    base_weight=-80,  # constant (base) component of weight
    height_factor=0.8,  # weight contribution per cm of height
    age_factor=0.15,  # weight contribution per year of age
    weight_noise_std=5,  # standard deviation of the noise added to weight
    # --- Plotting ---
    plot_figsize=(12, 8),  # figure size
    plot_figsize_3d=(14, 10),  # figure size for 3D plots
    scatter_alpha=0.6,  # transparency of scatter points
    scatter_color="blue",  # colour of scatter points
    line_color="red",  # colour of regression line
    line_width=2,  # width of regression line
    grid_alpha=0.3,  # transparency of grid lines
    cmap="viridis",  # colormap for 3D scatter
)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Apply the seaborn "whitegrid" theme and seed NumPy's global RNG so the
# synthetic dataset is reproducible.
# `sns.set_theme` is the preferred interface; `sns.set` is only an alias for it.
sns.set_theme(style="whitegrid")
np.random.seed(config["random_seed"])

In [None]:
# Seed the global RNG at the top of this cell so that re-running the cell on
# its own regenerates exactly the same dataset, rather than drawing new values
# from an RNG that earlier executions have already advanced.
np.random.seed(config["random_seed"])

# Heights in cm (normally distributed)
heights = np.random.normal(config["height_mean"], config["height_std"], config["n_samples"])

# Ages in years (uniformly distributed)
ages = np.random.uniform(config["age_min"], config["age_max"], config["n_samples"])

# Weight is a linear function of height and age plus Gaussian noise:
#   weight = base_weight + height_factor * height + age_factor * age + noise
noise = np.random.normal(0, config["weight_noise_std"], config["n_samples"])
weights = (
    config["base_weight"]
    + (config["height_factor"] * heights)
    + (config["age_factor"] * ages)
    + noise
)

# Collect the three variables in a single DataFrame used by every later cell
data = pd.DataFrame({"Height (cm)": heights, "Age (years)": ages, "Weight (kg)": weights})

# Display the first few rows
data.head()

## Exploratory Data Analysis

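This section summarizes the dataset with descriptive statistics and correlations, then visualizes each variable's distribution, the pairwise relationships, and a 3D scatter plot of all three variables.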


In [None]:
# Summary tables: descriptive statistics, missing-value counts per column,
# and the pairwise correlation matrix. Each table is rendered with rich display
# under its own heading.
summary_tables = (
    ("Descriptive Statistics:", data.describe()),
    ("\nMissing Values:", data.isna().sum()),
    ("\nCorrelation Matrix:", data.corr()),
)

for heading, table in summary_tables:
    print(heading)
    display(table)

In [None]:
# Distribution analysis: one histogram (with KDE overlay) per variable, each
# annotated with a dashed red line at the sample mean.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (column, panel title, bar colour, unit shown in the legend)
distribution_panels = [
    ("Height (cm)", "Height Distribution", "skyblue", "cm"),
    ("Age (years)", "Age Distribution", "orange", "years"),
    ("Weight (kg)", "Weight Distribution", "green", "kg"),
]

for panel_ax, (column, panel_title, bar_color, unit) in zip(axes, distribution_panels):
    column_mean = data[column].mean()
    sns.histplot(x=data[column], kde=True, ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title)
    panel_ax.axvline(
        column_mean,
        color="red",
        linestyle="--",
        label=f"Mean: {column_mean:.2f} {unit}",
    )
    panel_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Pairwise scatter plots of all variables, with KDE curves on the diagonal.
sns.pairplot(data, diag_kind="kde")

# The pairplot figure is the current figure; title it and tidy the layout.
pair_figure = plt.gcf()
pair_figure.suptitle("Pairwise Relationships", y=1.02, fontsize=16)
pair_figure.tight_layout()
plt.show()

In [None]:
# 3D scatter of height, age and weight, with points coloured by weight.
fig = plt.figure(figsize=config["plot_figsize_3d"])
ax = fig.add_subplot(111, projection="3d")

height_values = data["Height (cm)"]
age_values = data["Age (years)"]
weight_values = data["Weight (kg)"]

scatter = ax.scatter(
    height_values,
    age_values,
    weight_values,
    c=weight_values,  # colour encodes weight
    cmap=config["cmap"],
    alpha=config["scatter_alpha"],
    s=50,  # marker size
)

# Axis labels share one font size; the title is slightly larger
axis_labels = (
    (ax.set_xlabel, "Height (cm)"),
    (ax.set_ylabel, "Age (years)"),
    (ax.set_zlabel, "Weight (kg)"),
)
for set_axis_label, label_text in axis_labels:
    set_axis_label(label_text, fontsize=12)
ax.set_title("3D Relationship: Height, Age, and Weight", fontsize=14)

# Colour bar explaining the weight colouring
cbar = fig.colorbar(scatter, ax=ax, pad=0.1)
cbar.set_label("Weight (kg)", rotation=270, labelpad=20)

# Camera position chosen for a clearer view of the point cloud
ax.view_init(elev=30, azim=45)

fig.tight_layout()
plt.show()

## Multiple Linear Regression Analysis

This section fits a multiple linear regression model that predicts weight from height and age, then visualizes the fitted regression plane together with the observed data.


In [None]:
# Design matrix (height, age) and target (weight) as plain NumPy arrays
feature_columns = ["Height (cm)", "Age (years)"]
X = data[feature_columns].to_numpy()  # independent variables
y = data["Weight (kg)"].to_numpy()  # dependent variable

# Fit ordinary least squares; `fit` returns the estimator itself
model = LinearRegression().fit(X, y)

# Coefficients come back in the same order as `feature_columns`
height_coef, age_coef = model.coef_
intercept = model.intercept_

# In-sample predictions and goodness-of-fit metrics
y_pred = model.predict(X)
mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y, y_pred)

# Report the fitted equation and the headline metrics
print(
    "Multiple Linear Regression Model:",
    f"Weight = {height_coef:.4f} × Height + {age_coef:.4f} × Age + {intercept:.4f}",
    f"R-squared: {r2:.4f}",
    f"RMSE: {rmse:.4f}",
    sep="\n",
)

In [None]:
# Visualise the fitted regression plane together with the observed data points.
fig = plt.figure(figsize=config["plot_figsize_3d"])
ax = fig.add_subplot(111, projection="3d")

# 20 x 20 grid spanning the observed ranges of height and age
height_range = np.linspace(data["Height (cm)"].min(), data["Height (cm)"].max(), 20)
age_range = np.linspace(data["Age (years)"].min(), data["Age (years)"].max(), 20)
height_mesh, age_mesh = np.meshgrid(height_range, age_range)

# The model was fitted on (height, age) columns, so grid points are stacked in
# that same order before predicting; predictions are reshaped back to the grid.
X_mesh = np.column_stack((height_mesh.ravel(), age_mesh.ravel()))
weight_pred = model.predict(X_mesh).reshape(height_mesh.shape)

# Regression plane; the colormap comes from the shared config so this plot stays
# consistent with the 3D scatter plot when the setting is changed.
surface = ax.plot_surface(
    height_mesh, age_mesh, weight_pred, alpha=0.5, cmap=config["cmap"], linewidth=0
)

# Observed data, drawn in red on top of the plane
scatter = ax.scatter(
    data["Height (cm)"],
    data["Age (years)"],
    data["Weight (kg)"],
    c="red",
    s=30,
    alpha=0.7,
    label="Actual data",
)

# Labels and title
ax.set_xlabel("Height (cm)", fontsize=12)
ax.set_ylabel("Age (years)", fontsize=12)
ax.set_zlabel("Weight (kg)", fontsize=12)
ax.set_title("Multiple Regression: Weight based on Height and Age", fontsize=14)

# Colour bar for the surface (predicted weight)
cbar = fig.colorbar(surface, ax=ax, pad=0.1, shrink=0.5)
cbar.set_label("Predicted Weight (kg)", rotation=270, labelpad=20)

ax.legend()
fig.tight_layout()
plt.show()

## Advanced Regression Analysis

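This section examines the model's residuals to assess the fit: summary error metrics first, followed by diagnostic plots of the residuals against the fitted values and against each predictor.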


In [None]:
# Residuals: observed weight minus the model's in-sample prediction
residuals = y - y_pred

# Mean absolute error, computed directly from the residuals
mean_abs_error = np.abs(residuals).mean()

# Error metrics for the fitted model
print(
    "Regression Metrics:",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"R-squared (R²): {r2:.4f}",
    f"Mean Absolute Error: {mean_abs_error:.4f}",
    sep="\n",
)

# Quick check of the residuals' mean and spread (population standard deviation)
print(
    "\nResidual Statistics:",
    f"Mean of Residuals: {residuals.mean():.6f}",
    f"Standard Deviation of Residuals: {residuals.std():.4f}",
    sep="\n",
)

In [None]:
# Residual diagnostics on a 2 x 2 grid:
#   top-left:     residuals vs fitted values
#   top-right:    histogram of residuals
#   bottom row:   residuals vs each predictor (height, age)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Transparency settings come from the shared config rather than repeated literals
point_alpha = config["scatter_alpha"]
grid_line_alpha = config["grid_alpha"]

# (target axes, x values, x-axis label, panel title, point colour)
residual_scatter_panels = [
    (axes[0, 0], y_pred, "Predicted Weight (kg)", "Residuals vs Fitted Values", "blue"),
    (axes[1, 0], data["Height (cm)"], "Height (cm)", "Residuals vs Height", "purple"),
    (axes[1, 1], data["Age (years)"], "Age (years)", "Residuals vs Age", "orange"),
]

for panel_ax, x_values, x_label, panel_title, point_color in residual_scatter_panels:
    panel_ax.scatter(x_values, residuals, alpha=point_alpha, color=point_color)
    # Zero line: residuals should scatter evenly around it
    panel_ax.axhline(y=0, color="r", linestyle="-")
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel("Residuals")
    panel_ax.set_title(panel_title)
    panel_ax.grid(alpha=grid_line_alpha)

# Histogram of residuals with KDE overlay and a reference line at zero
hist_ax = axes[0, 1]
sns.histplot(residuals, kde=True, ax=hist_ax, color="green")
hist_ax.axvline(x=0, color="r", linestyle="-")
hist_ax.set_xlabel("Residual Value")
hist_ax.set_title("Distribution of Residuals")
hist_ax.grid(alpha=grid_line_alpha)

fig.tight_layout()
plt.show()

## Prediction Example

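Here the fitted model is used to predict weights for example combinations of height and age. Each prediction is shown with an approximate 95% range of ±1.96 × RMSE around the predicted value.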


In [None]:
# Example inputs: heights 160-190 cm and ages 25-55 years, both in steps of 5
example_data = pd.DataFrame(
    {
        "Height (cm)": list(range(160, 191, 5)),
        "Age (years)": list(range(25, 56, 5)),
    }
)

# Predict a weight for every (height, age) row
example_X = example_data.to_numpy()
predicted_weights = model.predict(example_X)

# Half-width of the reported range: 1.96 x in-sample RMSE.
# NOTE(review): this is an approximation; it uses the RMSE alone and does not
# account for uncertainty in the fitted coefficients.
interval_half_width = 1.96 * rmse

# Attach the prediction and its bounds as new columns
example_data = example_data.assign(
    **{
        "Predicted Weight (kg)": predicted_weights,
        "Lower Bound (kg)": predicted_weights - interval_half_width,
        "Upper Bound (kg)": predicted_weights + interval_half_width,
    }
)

# Display the predictions
example_data

In [None]:
# Interactive prediction using ipywidgets
from ipywidgets import interactive, FloatSlider, VBox, HBox, Label, Output
import ipywidgets as widgets

# Output widget that holds the most recent prediction text
output = Output()


def predict_weight(height, age):
    """Predict weight for one (height, age) pair and show it in `output`.

    Parameters
    ----------
    height : float
        Height in cm.
    age : float
        Age in years.

    The result is printed into the module-level `output` widget together with
    a range of +/- 1.96 x RMSE around the prediction. Nothing is returned.

    NOTE(review): the range is labelled a 95% prediction interval but is an
    approximation; it uses the in-sample RMSE only.
    """
    # wait=True defers clearing until new output arrives, so the text is
    # replaced in one step instead of flickering while a slider is dragged.
    output.clear_output(wait=True)

    # The model expects a 2D array with columns (height, age)
    input_data = np.array([[height, age]])
    weight_prediction = model.predict(input_data)[0]
    interval_half_width = 1.96 * rmse
    lower_bound = weight_prediction - interval_half_width
    upper_bound = weight_prediction + interval_half_width

    # Display results
    with output:
        print(f"Predicted Weight: {weight_prediction:.2f} kg")
        print(f"95% Prediction Interval: ({lower_bound:.2f}, {upper_bound:.2f}) kg")


# Height slider extends 10 cm beyond the observed range on both sides
height_slider = FloatSlider(
    min=data["Height (cm)"].min() - 10,
    max=data["Height (cm)"].max() + 10,
    step=1,
    value=170,
    description="Height (cm):",
)

# Age slider is limited to the observed age range
age_slider = FloatSlider(
    min=data["Age (years)"].min(),
    max=data["Age (years)"].max(),
    step=1,
    value=40,
    description="Age (years):",
)

# Re-run `predict_weight` whenever either slider changes
interactive_widget = interactive(predict_weight, height=height_slider, age=age_slider)

# Show the sliders with the prediction text underneath
display(VBox([interactive_widget, output]))

# Populate the output once with the sliders' initial values
predict_weight(height_slider.value, age_slider.value)