# Height-Weight BMI Category Classification

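This notebook demonstrates classification of BMI categories (Underweight, Normal, Overweight, Obese) from height and weight using a synthetic dataset. Each sample is labelled with the category whose height/weight distribution it was drawn from, so the BMI computed for an individual sample can fall outside the nominal range of its label.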


## Configuration Parameters

This section contains adjustable parameters for the dataset generation and visualization.


In [None]:
# Notebook configuration: every tunable value for data generation and plotting
# is collected in this single dictionary so later cells only read from it.
config = dict(
    # --- Dataset ---
    n_samples=1000,  # total number of synthetic samples to draw
    random_seed=42,  # seed used for NumPy and for the scikit-learn random_state arguments
    # Share of the samples assigned to each category; later cells iterate over
    # this mapping, so the key order determines the generation order.
    category_distribution=dict(
        Underweight=0.15,
        Normal=0.45,
        Overweight=0.3,
        Obese=0.1,
    ),
    # Normal-distribution parameters per category (heights in cm, weights in kg).
    # Each key is the lowercase form of a category name above.
    underweight=dict(height_mean=170, height_std=8, weight_mean=50, weight_std=5),
    normal=dict(height_mean=172, height_std=9, weight_mean=65, weight_std=7),
    overweight=dict(height_mean=175, height_std=10, weight_mean=85, weight_std=8),
    obese=dict(height_mean=173, height_std=9, weight_mean=105, weight_std=10),
    # --- Plotting ---
    plot_figsize=(12, 8),  # (width, height) passed to matplotlib as figsize
    scatter_alpha=0.6,  # opacity of scatter markers
    # Marker colour per category
    category_colors=dict(
        Underweight="skyblue",
        Normal="green",
        Overweight="orange",
        Obese="red",
    ),
    grid_alpha=0.3,  # opacity of grid lines
    contour_alpha=0.2,  # opacity of the filled decision-region contour
)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.inspection import DecisionBoundaryDisplay

# Seed NumPy's global random generator from the config so the synthetic data
# drawn with np.random.* is the same on every run.
np.random.seed(seed=config["random_seed"])

In [None]:
def _allocate_counts(n_total, distribution):
    """Split ``n_total`` samples across categories in proportion to ``distribution``.

    Plain ``int(n_total * share)`` truncates float products such as
    ``100 * 0.29 == 28.999999999999996`` down to 28, so the per-category counts
    can add up to fewer samples than requested. This helper floors each exact
    count (with a tiny tolerance for such float error) and then hands the
    remaining samples to the categories with the largest fractional parts
    (largest-remainder method).

    Parameters
    ----------
    n_total : int
        Total number of samples requested.
    distribution : dict
        Mapping of category name -> share of the samples.

    Returns
    -------
    dict
        Mapping of category name -> integer sample count, in the same key order
        as ``distribution``. The counts sum to ``round(n_total * sum(shares))``,
        i.e. to ``n_total`` when the shares sum to 1.
    """
    exact = {name: n_total * share for name, share in distribution.items()}
    # The epsilon keeps values that are an integer up to float error from being floored one too low.
    counts = {name: int(np.floor(value + 1e-9)) for name, value in exact.items()}
    shortfall = int(round(sum(exact.values()))) - sum(counts.values())
    # sorted() is stable, so ties keep the order of the distribution mapping.
    by_remainder = sorted(exact, key=lambda name: exact[name] - counts[name], reverse=True)
    for name in by_remainder[: max(shortfall, 0)]:
        counts[name] += 1
    return counts


# Number of samples to draw for each BMI category
category_counts = _allocate_counts(config["n_samples"], config["category_distribution"])

# Accumulators for the generated columns
heights = []
weights = []
categories = []

# Generate synthetic data for each BMI category
for category, n_category_samples in category_counts.items():
    # Distribution parameters are stored under the lowercase category name
    category_params = config[category.lower()]

    # Heights are drawn before weights for every category; keeping this order
    # keeps the sequence of random draws (and so the data) reproducible.
    category_heights = np.random.normal(
        category_params["height_mean"], category_params["height_std"], n_category_samples
    )
    category_weights = np.random.normal(
        category_params["weight_mean"], category_params["weight_std"], n_category_samples
    )

    heights.extend(category_heights)
    weights.extend(category_weights)
    # The label is the generating category, not a value derived from BMI
    categories.extend([category] * n_category_samples)

# Create a DataFrame
data = pd.DataFrame({"Height (cm)": heights, "Weight (kg)": weights, "BMI_Category": categories})

# Calculate actual BMI for reference: weight (kg) / height (m)^2
data["BMI"] = data["Weight (kg)"] / ((data["Height (cm)"] / 100) ** 2)

# Display the first few rows
data.head()

## Exploratory Data Analysis

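This section summarizes the dataset: descriptive statistics, per-category box plots of height, weight and BMI, and a height–weight scatter plot with the BMI = 18.5, 25 and 30 reference curves.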


In [None]:
# Overall summary of the numeric columns
print("Descriptive Statistics:")
display(data.describe())

# Number of samples per category
print("\nBMI Category Distribution:")
category_counts_table = data["BMI_Category"].value_counts()
display(category_counts_table)

# The same four statistics for each measurement, computed per category
print("\nStatistics by BMI Category:")
summary_stats = ["mean", "std", "min", "max"]
per_category_summary = data.groupby("BMI_Category").agg(
    {column: summary_stats for column in ("Height (cm)", "Weight (kg)", "BMI")}
)
display(per_category_summary)

In [None]:
# Box plots of each measurement grouped by BMI category, one panel per measurement
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# (column to plot, panel title) for the panels, left to right
panels = [
    ("Height (cm)", "Height Distribution by BMI Category"),
    ("Weight (kg)", "Weight Distribution by BMI Category"),
    ("BMI", "BMI Distribution by Category"),
]

# Reference lines drawn on the BMI panel only: (BMI value, line style, legend label)
bmi_thresholds = [
    (18.5, "--", "Underweight Threshold (18.5)"),
    (25, "-.", "Normal Threshold (25)"),
    (30, ":", "Overweight Threshold (30)"),
]

for ax, (column, title) in zip(axes, panels):
    sns.boxplot(x="BMI_Category", y=column, data=data, ax=ax, palette=config["category_colors"])
    ax.set_title(title)
    if column == "BMI":
        for value, style, label in bmi_thresholds:
            ax.axhline(y=value, color="black", linestyle=style, label=label)
        ax.legend()
    ax.grid(alpha=config["grid_alpha"])

plt.tight_layout()
plt.show()

In [None]:
# Height vs. weight, one colour per BMI category, with constant-BMI reference curves
fig, ax = plt.subplots(figsize=config["plot_figsize"])

# One scatter series per category so each gets its own legend entry
for name, point_color in config["category_colors"].items():
    subset = data[data["BMI_Category"] == name]
    ax.scatter(
        subset["Height (cm)"],
        subset["Weight (kg)"],
        alpha=config["scatter_alpha"],
        color=point_color,
        label=name,
    )

# Constant-BMI curves: weight = BMI * (height in m)^2, over the observed height span +/- 5 cm
height_range = np.linspace(data["Height (cm)"].min() - 5, data["Height (cm)"].max() + 5, 100)
height_m_squared = (height_range / 100) ** 2
for bmi_value, line_format, line_label in [
    (18.5, "k--", "BMI = 18.5"),
    (25, "k-.", "BMI = 25"),
    (30, "k:", "BMI = 30"),
]:
    ax.plot(height_range, bmi_value * height_m_squared, line_format, label=line_label)

# Titles, axis labels, grid and legend
ax.set_title("Height vs Weight by BMI Category", fontsize=14)
ax.set_xlabel("Height (cm)", fontsize=12)
ax.set_ylabel("Weight (kg)", fontsize=12)
ax.grid(True, alpha=config["grid_alpha"])
ax.legend(fontsize=10)

fig.tight_layout()
plt.show()

## Classification Modeling

Train a random forest classifier to predict BMI category from height and weight, using a stratified 80/20 train/test split and standardized features.


In [None]:
# Feature matrix (height, weight) and string labels
X = data[["Height (cm)", "Weight (kg)"]].to_numpy()
y = data["BMI_Category"].to_numpy()

# Map the string labels to integer codes; classes_ holds the label for each code
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
category_mapping = dict(enumerate(label_encoder.classes_))
print("Category mapping:", category_mapping)

# Hold out 20% of the samples for testing, keeping class proportions equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=config["random_seed"],
    stratify=y_encoded,
)

# Fit the scaler on the training part only, then apply it to both parts
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Random forest with 100 trees, seeded from the config
clf = RandomForestClassifier(n_estimators=100, random_state=config["random_seed"])
clf.fit(X_train_scaled, y_train)

# Predictions for the held-out samples
y_pred = clf.predict(X_test_scaled)

# Evaluation metrics on the held-out samples
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report_text = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

print(f"Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(report_text)

In [None]:
# Visualize the confusion matrix as a heatmap with category names on both axes
# (rows: true category, columns: predicted category)
plt.figure(figsize=(10, 8))
conf_df = pd.DataFrame(conf_matrix, index=label_encoder.classes_, columns=label_encoder.classes_)
sns.heatmap(conf_df, annot=True, fmt="d", cmap="Blues")
plt.title("Confusion Matrix")
plt.ylabel("True BMI Category")
plt.xlabel("Predicted BMI Category")
plt.tight_layout()
plt.show()

# Feature importance reported by the fitted random forest
plt.figure(figsize=(8, 6))
importances = clf.feature_importances_
# Same order as the feature columns used for training: height first, then weight
feature_names = ["Height", "Weight"]
feature_importance = pd.DataFrame({"Feature": feature_names, "Importance": importances})
sns.barplot(x="Importance", y="Feature", data=feature_importance)
plt.title("Feature Importance")
# Use the configured grid opacity, like the other plots, instead of a hard-coded 0.3
plt.grid(alpha=config["grid_alpha"])
plt.tight_layout()
plt.show()

In [None]:
# Visualize the decision boundary
plt.figure(figsize=config["plot_figsize"])

# Create a meshgrid covering the observed height/weight span plus a 5-unit margin
h = 0.5  # step size in the mesh
x_min, x_max = data["Height (cm)"].min() - 5, data["Height (cm)"].max() + 5
y_min, y_max = data["Weight (kg)"].min() - 5, data["Weight (kg)"].max() + 5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# The classifier was trained on standardized features, so scale the grid the same way
grid_points = np.c_[xx.ravel(), yy.ravel()]
grid_points_scaled = scaler.transform(grid_points)

# Predicted class code for every grid point, reshaped back to the grid
Z = clf.predict(grid_points_scaled)
Z = Z.reshape(xx.shape)

# Plot the decision regions. Z holds integer class codes, so the contour levels
# sit at the half-integers between codes and each code gets the colour
# configured for its category. The encoder's class order differs from the
# config order, so the colours are looked up by name.
region_levels = np.arange(len(label_encoder.classes_) + 1) - 0.5
region_colors = [config["category_colors"][name] for name in label_encoder.classes_]
plt.contourf(
    xx, yy, Z, levels=region_levels, colors=region_colors, alpha=config["contour_alpha"]
)

# Plot the original data points
for category, color in config["category_colors"].items():
    # Integer code the encoder assigned to this category
    category_idx = np.where(label_encoder.classes_ == category)[0][0]
    mask = y_encoded == category_idx
    plt.scatter(
        X[mask, 0],
        X[mask, 1],
        alpha=config["scatter_alpha"],
        color=color,
        edgecolors="k",
        label=category,
    )

# Customize the plot
plt.title("Decision Boundaries for BMI Categories", fontsize=14)
plt.xlabel("Height (cm)", fontsize=12)
plt.ylabel("Weight (kg)", fontsize=12)
plt.grid(True, alpha=config["grid_alpha"])
plt.legend(fontsize=10)

# Display the plot
plt.tight_layout()
plt.show()

## Prediction Example

Using the model to predict BMI categories for new height and weight values.


In [None]:
# Example predictions for different heights and weights
example_data = pd.DataFrame(
    {
        "Height (cm)": [165, 170, 175, 180, 175, 165, 180],
        "Weight (kg)": [45, 60, 75, 90, 100, 85, 70],
    }
)

# Calculate BMI for reference: weight (kg) / height (m)^2
example_data["BMI"] = example_data["Weight (kg)"] / ((example_data["Height (cm)"] / 100) ** 2)

# Scale the example data. The scaler was fitted on a plain array, so pass an
# array here as well; passing the DataFrame makes scikit-learn warn about
# feature names that were not present at fit time.
example_features = example_data[["Height (cm)", "Weight (kg)"]].to_numpy()
example_scaled = scaler.transform(example_features)

# Predict BMI categories and convert the integer codes back to category names
example_data["Predicted_Category"] = label_encoder.inverse_transform(clf.predict(example_scaled))

# Calculate prediction probabilities; column i belongs to label_encoder.classes_[i]
probabilities = clf.predict_proba(example_scaled)
for i, category in enumerate(label_encoder.classes_):
    example_data[f"Prob_{category}"] = probabilities[:, i]

# Display the predictions
display(example_data)

# Visualize predictions on the decision boundary
plt.figure(figsize=config["plot_figsize"])

# Plot the decision regions from the grid prediction (xx, yy, Z) computed in
# the decision-boundary cell. Z holds integer class codes, so the levels sit at
# the half-integers between codes and each region gets its category's colour.
region_levels = np.arange(len(label_encoder.classes_) + 1) - 0.5
region_colors = [config["category_colors"][name] for name in label_encoder.classes_]
plt.contourf(
    xx, yy, Z, levels=region_levels, colors=region_colors, alpha=config["contour_alpha"]
)

# Plot example data points with predicted categories
for category in label_encoder.classes_:
    mask = example_data["Predicted_Category"] == category
    # Skip categories that no example was assigned to, so the legend stays clean
    if mask.any():
        plt.scatter(
            example_data.loc[mask, "Height (cm)"],
            example_data.loc[mask, "Weight (kg)"],
            s=100,
            marker="*",
            color=config["category_colors"][category],
            edgecolors="black",
            label=f"Predicted {category}",
        )

# Draw BMI lines (for reference); height_range comes from the scatter-plot cell
plt.plot(height_range, 18.5 * ((height_range / 100) ** 2), "k--", label="BMI = 18.5")
plt.plot(height_range, 25 * ((height_range / 100) ** 2), "k-.", label="BMI = 25")
plt.plot(height_range, 30 * ((height_range / 100) ** 2), "k:", label="BMI = 30")

# Customize the plot
plt.title("Predictions for Example Data Points", fontsize=14)
plt.xlabel("Height (cm)", fontsize=12)
plt.ylabel("Weight (kg)", fontsize=12)
plt.grid(True, alpha=config["grid_alpha"])
plt.legend(fontsize=10)

# Display the plot
plt.tight_layout()
plt.show()