# Height vs Weight Dataset with Linear Regression

This notebook demonstrates the relationship between height and weight using a synthetic dataset with linear regression analysis.


## Configuration Parameters

This section contains adjustable parameters for the dataset generation and visualization.


In [None]:
# Tunable settings shared by the data-generation and plotting cells below.
config = dict(
    # --- Synthetic dataset ---
    n_samples=200,  # how many (height, weight) pairs to draw
    random_seed=42,  # seed handed to NumPy so the draws are repeatable
    # --- Height distribution (normal) ---
    height_mean=170,  # centre of the height distribution, in cm
    height_std=10,  # spread of the height distribution, in cm
    # --- Weight model: weight = base_weight + height_factor * height + noise ---
    base_weight=-70,  # intercept term of the linear relationship
    height_factor=0.8,  # weight added per extra cm of height
    weight_noise_std=5,  # standard deviation of the Gaussian noise on weight
    # --- Plot styling ---
    plot_figsize=(12, 8),  # figure size passed to matplotlib
    scatter_alpha=0.6,  # opacity of the scatter markers
    scatter_color="blue",  # colour of the scatter markers
    line_color="red",  # colour of the regression line
    line_width=2,  # thickness of the regression line
    grid_alpha=0.3,  # opacity of the background grid
)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Fix NumPy's global random state so the synthetic data is identical on every run
np.random.seed(seed=config["random_seed"])

In [None]:
# Heights in cm, drawn from the normal distribution described in the config
heights = np.random.normal(
    loc=config["height_mean"],
    scale=config["height_std"],
    size=config["n_samples"],
)

# Zero-mean Gaussian noise, one value per sample (drawn after the heights)
noise = np.random.normal(
    loc=0,
    scale=config["weight_noise_std"],
    size=config["n_samples"],
)

# Weight = base_weight + (height_factor * height) + noise
expected_weights = config["base_weight"] + config["height_factor"] * heights
weights = expected_weights + noise

# Collect both variables into a two-column DataFrame
columns = {"Height (cm)": heights, "Weight (kg)": weights}
data = pd.DataFrame(columns)

# Preview the first rows as the cell output
data.head(5)

## Exploratory Data Analysis

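Before fitting a model, let's examine the dataset: summary statistics, missing values, the correlation between height and weight, and the distribution of each variable.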


In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for both columns
print("Descriptive Statistics:")
summary_table = data.describe()
display(summary_table)

# Number of missing entries in each column
print("\nMissing Values:")
missing_per_column = data.isna().sum()
display(missing_per_column)

# Correlation between the two variables (pandas' default method)
height_series = data["Height (cm)"]
weight_series = data["Weight (kg)"]
correlation = height_series.corr(weight_series)
print(f"\nCorrelation between Height and Weight: {correlation:.4f}")

In [None]:
# Side-by-side histograms (with KDE) of each variable, with the mean marked
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))

# One entry per panel: (column, panel title, histogram colour, unit for the legend)
panels = [
    ("Height (cm)", "Height Distribution", "skyblue", "cm"),
    ("Weight (kg)", "Weight Distribution", "green", "kg"),
]

for ax, (column, title, hist_color, unit) in zip(axes, panels):
    values = data[column]
    mean_value = values.mean()

    sns.histplot(x=values, kde=True, ax=ax, color=hist_color)
    ax.set_title(title)
    # Dashed vertical line at the sample mean, labelled in the legend
    ax.axvline(
        mean_value,
        color="red",
        linestyle="--",
        label=f"Mean: {mean_value:.2f} {unit}",
    )
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Fit a least-squares line of weight on height and plot it over the data.

# scikit-learn expects a 2-D feature matrix, hence the reshape to (n_samples, 1)
X = data["Height (cm)"].to_numpy().reshape(-1, 1)  # Independent variable
y = data["Weight (kg)"].to_numpy()  # Dependent variable

# Create and fit the linear regression model
model = LinearRegression()
model.fit(X, y)

# Coefficient (slope) and intercept of the fitted line
slope = model.coef_[0]
intercept = model.intercept_

# Format the intercept with an explicit sign so a negative value is shown as
# "... x - 68.00" instead of the awkward "... x + -68.00".
intercept_sign = "+" if intercept >= 0 else "-"
intercept_abs = abs(intercept)

# Scatter plot of the raw observations
plt.figure(figsize=config["plot_figsize"])
plt.scatter(
    data["Height (cm)"],
    data["Weight (kg)"],
    alpha=config["scatter_alpha"],
    color=config["scatter_color"],
    label="Data points",
)

# Evaluate the fitted line on an evenly spaced grid spanning the observed
# heights; the range is taken from X so this cell only depends on `data`.
height_range = np.linspace(X.min(), X.max(), 100)
plt.plot(
    height_range,
    model.predict(height_range.reshape(-1, 1)),
    color=config["line_color"],
    linewidth=config["line_width"],
    label=f"Linear regression: y = {slope:.2f}x {intercept_sign} {intercept_abs:.2f}",
)

# Customize the plot
plt.title("Height vs Weight Relationship with Linear Regression", fontsize=14)
plt.xlabel("Height (cm)", fontsize=12)
plt.ylabel("Weight (kg)", fontsize=12)
plt.grid(True, alpha=config["grid_alpha"])
plt.legend()

# Display the plot
plt.tight_layout()
plt.show()

# Print the fitted equation and the in-sample R-squared
print(
    f"Linear Regression Model: Weight = {slope:.4f} × Height "
    f"{intercept_sign} {intercept_abs:.4f}"
)
print(f"R-squared: {model.score(X, y):.4f}")

## Advanced Regression Analysis

This section evaluates the fitted model further: error metrics, residual diagnostics, and an approximate 95% prediction band.


In [None]:
# In-sample predictions and the residuals (observed minus fitted)
y_pred = model.predict(X)
residuals = np.subtract(y, y_pred)

# Goodness-of-fit and error metrics on the training data
r2 = r2_score(y_true=y, y_pred=y_pred)
mse = mean_squared_error(y_true=y, y_pred=y_pred)
rmse = np.sqrt(mse)
mean_abs_error = np.abs(residuals).mean()

# Report the metrics, one per line
print(
    "Regression Metrics:",
    f"Mean Squared Error (MSE): {mse:.4f}",
    f"Root Mean Squared Error (RMSE): {rmse:.4f}",
    f"R-squared (R²): {r2:.4f}",
    f"Mean Absolute Error: {mean_abs_error:.4f}",
    sep="\n",
)

# Summary of the residuals (population standard deviation, ddof=0)
print(
    "\nResidual Statistics:",
    f"Mean of Residuals: {residuals.mean():.6f}",
    f"Standard Deviation of Residuals: {residuals.std():.4f}",
    sep="\n",
)

In [None]:
# Two residual diagnostics side by side
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))
fitted_ax, hist_ax = axes

# Left panel: residuals against fitted values, with a zero reference line
fitted_ax.scatter(y_pred, residuals, alpha=0.6)
fitted_ax.axhline(y=0, color="r", linestyle="-")
fitted_ax.set(
    xlabel="Predicted Weight (kg)",
    ylabel="Residuals",
    title="Residuals vs Fitted Values",
)
fitted_ax.grid(alpha=0.3)

# Right panel: histogram (with KDE) of the residuals, with a zero reference line
sns.histplot(residuals, kde=True, ax=hist_ax, color="green")
hist_ax.axvline(x=0, color="r", linestyle="-")
hist_ax.set(xlabel="Residual Value", title="Distribution of Residuals")
hist_ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Plot the data, the fitted line, and an approximate 95% prediction band.
plt.figure(figsize=config["plot_figsize"])

# Original data points (X is a single-column matrix, so flatten it for plotting)
heights_1d = X.flatten()
plt.scatter(heights_1d, y, alpha=config["scatter_alpha"], label="Data")

# Sort by height so the line and the band are drawn left to right
sort_idx = np.argsort(heights_1d)
X_sorted = X[sort_idx]
y_pred_sorted = y_pred[sort_idx]

# Regression line, styled from the shared config like the main regression plot
plt.plot(
    X_sorted.flatten(),
    y_pred_sorted,
    color=config["line_color"],
    linewidth=config["line_width"],
    label="Regression Line",
)

# Approximate 95% prediction band: +/- 1.96 * RMSE around the fitted line.
# 1.96 is the two-sided 95% quantile of the standard normal distribution.
# This is a constant-width approximation that treats the residuals as normal
# with constant variance and ignores uncertainty in the fitted coefficients.
z_95 = 1.96
interval = z_95 * rmse
plt.fill_between(
    X_sorted.flatten(),
    y_pred_sorted - interval,
    y_pred_sorted + interval,
    color="gray",
    alpha=0.2,
    label=f"95% Prediction Interval (±{interval:.2f} kg)",
)

plt.title("Height vs Weight with Prediction Intervals")
plt.xlabel("Height (cm)")
plt.ylabel("Weight (kg)")
plt.grid(alpha=config["grid_alpha"])
plt.legend()
plt.tight_layout()
plt.show()

## Prediction Example

Here we use the fitted model to predict weights for new height values, together with approximate 95% lower and upper bounds.


In [None]:
# Heights to predict for: 160, 165, ..., 190 cm
example_heights = np.arange(160, 191, 5)
# The model expects a 2-D column of features
example_heights_reshaped = example_heights[:, np.newaxis]
predicted_weights = model.predict(example_heights_reshaped)

# Bounds reuse the +/- `interval` half-width computed for the prediction band
lower_bound = predicted_weights - interval
upper_bound = predicted_weights + interval

# Tabulate the predictions together with their bounds
predictions_df = pd.DataFrame(
    {
        "Height (cm)": example_heights,
        "Predicted Weight (kg)": predicted_weights,
        "Lower Bound (kg)": lower_bound,
        "Upper Bound (kg)": upper_bound,
    }
)

# Render the table as the cell output
display(predictions_df)