# Regularized Linear Regression Project - U.S. County-Level Socio-Demographic and Health Data (2018–2019)

In [None]:
# -------------------------------------
# Import libraries
# -------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns

## 1. Load the dataset

In [None]:
# Location of the raw CSV; pandas downloads and parses it in one call.
url = 'https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv'
df = pd.read_csv(url)

# Confirm the load and report (rows, columns), one message per line.
print("Dataset loaded successfully!", f"Shape: {df.shape}", sep="\n")

> We load the dataset directly from the provided GitHub link to ensure we are using the correct data source.  

> This dataset contains socio-demographic and health-related data for U.S. counties.


## 2. Define target variable

In [None]:
# Name of the column the models are trained to predict.
target = 'Obesity_prevalence'

# Fail fast if the loaded frame has no such column.
if target not in df.columns.tolist():
    raise ValueError("Target variable not found in dataset!")

> We select **Obesity Prevalence** as the target variable because it is a key health indicator we aim to predict based on socio-demographic features.

## 3. Exploratory Data Analysis (EDA)

In [None]:
# Basic dataset information.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nDataset info:")
df.info()

# Count the missing values in every column.
print("\nMissing values per column:")
print(df.isnull().sum())

### Correlation

In [None]:
# Correlation heatmap over the numeric columns only.
correlation_matrix = df.corr(numeric_only=True)

# Draw on an explicit 12x10 axes; cell annotations are off.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, cmap='coolwarm', annot=False, ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

> We plot a correlation heatmap to visualize the relationships between variables. This helps us identify which features are most related to our target and whether there is multicollinearity.

In [None]:
# Correlation of every numeric column with the target; the target's own
# entry is removed.
cor_target = correlation_matrix.loc[:, target].drop(labels=target)

# Rank by absolute correlation, strongest first, and keep the first ten.
ranked_by_strength = cor_target.abs().sort_values(ascending=False)
top_features = ranked_by_strength.head(10)

print("\nTop 10 correlated features with target:")
print(top_features)

## 4. Data Cleaning

In [None]:
# Drop identifier columns (such as county names) that do not help prediction.
# errors='ignore' skips any listed name that is not present in the frame.
id_cols = ['fips', 'COUNTY_NAME', 'STATE_NAME', 'STATE_FIPS', 'CNTY_FIPS']
df = df.drop(columns=id_cols, errors='ignore')

# Keep only the columns whose missing-value count is at most 30% of the rows.
threshold = len(df) * 0.3
df = df.loc[:, df.isna().sum().le(threshold)]

# Drop leakage columns: confidence-interval bounds of the target and the
# "any condition" measures, which are too directly related to the target.
leakage_cols = [
    'Obesity_Upper 95% CI', 'Obesity_Lower 95% CI',
    'anycondition_Upper 95% CI', 'anycondition_Lower 95% CI',
    'anycondition_prevalence',
]
df = df.drop(columns=leakage_cols, errors='ignore')

# Fill the remaining gaps in numeric columns with that column's mean, which
# keeps every row while providing a reasonable estimate.
# NOTE(review): the means are computed over all rows, before the train/test
# split, so test rows contribute to the imputed values.
df = df.fillna(df.mean(numeric_only=True))

## 5. Prepare features and target

> We separate our features (X) from the target variable (y) to prepare for model training.


In [None]:
# Target column on its own; every remaining column forms the feature matrix.
y = df[target]
X = df.drop(columns=target)

## 6. Train-Test Split

> We split the dataset into training and testing sets to evaluate model performance on unseen data. This helps prevent overfitting and gives us a better idea of real-world accuracy.

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 7. Linear Regression Model

> We train a **Linear Regression model** as our baseline. This model predicts obesity prevalence based on the input features without any regularization.

In [None]:
# Unregularized baseline fitted on the training split (fit() returns the
# estimator itself, so construction and fitting can be chained).
lr = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows and score them with R².
y_pred_lr = lr.predict(X_test)
r2_lr = r2_score(y_true=y_test, y_pred=y_pred_lr)

print(f"\nLinear Regression R² score: {r2_lr:.4f}")

## 8. Lasso Regression Model

> **Lasso Regression** is a linear model that includes regularization, which shrinks some feature coefficients and can even reduce some to zero, simplifying the model.

### What is Alpha?

> **Alpha controls the regularization strength:**

> Low alpha → minimal penalty → the model behaves like ordinary (unregularized) linear regression.

> High alpha → stronger penalty → model becomes simpler but may lose accuracy.


In [None]:
# 50 evenly spaced regularization strengths from 0.01 to 20 (inclusive).
alphas = np.linspace(0.01, 20, num=50)

# Test-set R² for each alpha, in the same order as `alphas`.
r2_scores = []
for alpha in alphas:
    # Fit a fresh Lasso at this strength; fit() returns the estimator.
    lasso = Lasso(alpha=alpha, max_iter=10000).fit(X_train, y_train)
    y_pred_lasso = lasso.predict(X_test)
    r2_scores.append(r2_score(y_true=y_test, y_pred=y_pred_lasso))

> We test a **range of alpha values** to see how model performance changes.

In [None]:
# Plot R² vs Alpha
plt.figure(figsize=(10, 6))
plt.plot(alphas, r2_scores, marker='o')
plt.xlabel('Alpha')
plt.ylabel('R² Score')
plt.title('Lasso Regression: R² Score vs Alpha')
plt.grid(True)
plt.show()

> As **alpha increases**, the R² score typically **decreases**: the stronger penalty shrinks more coefficients toward zero, so the model becomes overly simple and starts underfitting the data.


## 9. Optimize Lasso with Cross-Validation

In [None]:
# Search the same alpha grid with 5-fold cross-validation, using only the
# training split so the test rows stay unseen during model selection.
param_grid = {'alpha': alphas}
grid = GridSearchCV(Lasso(max_iter=10000), param_grid, cv=5, scoring='r2')
grid.fit(X_train, y_train)

best_alpha = grid.best_params_['alpha']
print(f"\nBest alpha from cross-validation: {best_alpha:.2f}")
print(f"Best cross-validated R² score: {grid.best_score_:.4f}")

# Final Lasso Model
# GridSearchCV refits the best configuration on all of X_train by default
# (refit=True), so the fitted model is already available as best_estimator_;
# reusing it avoids training an identical Lasso a second time.
lasso_final = grid.best_estimator_
y_pred_final = lasso_final.predict(X_test)
r2_final = r2_score(y_test, y_pred_final)
print(f"Final Lasso Model R² score on test set: {r2_final:.4f}")

## 10. Cross-Validation for Linear Regression

In [None]:
# Cross-validate the baseline with the same protocol used for Lasso: 5-fold CV
# on the training split only. Scoring on the full X, y would reuse the held-out
# test rows and make the mean incomparable with the Lasso CV score.
# cross_val_score fits clones of `lr`, so the already-fitted baseline is left
# unchanged.
cv_scores_lr = cross_val_score(lr, X_train, y_train, cv=5, scoring='r2')
print(f"\nCross-validated R² scores (Linear Regression): {cv_scores_lr}")
print(f"Mean CV R² score: {cv_scores_lr.mean():.4f}")