# Explore here

In [2]:
import pandas as pd

url = "https://breathecode.herokuapp.com/asset/internal-link?id=418&path=demographic_health_data.csv"
df = pd.read_csv(url)

df.head()

Unnamed: 0,fips,TOT_POP,0-9,0-9 y/o % of total pop,19-Oct,10-19 y/o % of total pop,20-29,20-29 y/o % of total pop,30-39,30-39 y/o % of total pop,...,COPD_number,diabetes_prevalence,diabetes_Lower 95% CI,diabetes_Upper 95% CI,diabetes_number,CKD_prevalence,CKD_Lower 95% CI,CKD_Upper 95% CI,CKD_number,Urban_rural_code
0,1001,55601,6787,12.206615,7637,13.735364,6878,12.370281,7089,12.749771,...,3644,12.9,11.9,13.8,5462,3.1,2.9,3.3,1326,3
1,1003,218022,24757,11.355276,26913,12.344167,23579,10.814964,25213,11.564429,...,14692,12.0,11.0,13.1,20520,3.2,3.0,3.5,5479,4
2,1005,24881,2732,10.980266,2960,11.896628,3268,13.13452,3201,12.865239,...,2373,19.7,18.6,20.6,3870,4.5,4.2,4.8,887,6
3,1007,22400,2456,10.964286,2596,11.589286,3029,13.522321,3113,13.897321,...,1789,14.1,13.2,14.9,2511,3.3,3.1,3.6,595,2
4,1009,57840,7095,12.266598,7570,13.087828,6742,11.656293,6884,11.901798,...,4661,13.5,12.6,14.5,6017,3.4,3.2,3.7,1507,2


In [None]:
from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)



Train shape: (2512, 108)
Test shape: (628, 108)


In [None]:
import numpy as np
import pandas as pd

print("Rows, Columns:", df.shape)
print("Duplicate rows:", df.duplicated().sum())






Rows, Columns: (3140, 108)
Duplicate rows: 0
                        Missing Count  Missing %
fips                                0        0.0
TOT_POP                             0        0.0
0-9                                 0        0.0
0-9 y/o % of total pop              0        0.0
19-Oct                              0        0.0
...                               ...        ...
CKD_prevalence                      0        0.0
CKD_Lower 95% CI                    0        0.0
CKD_Upper 95% CI                    0        0.0
CKD_number                          0        0.0
Urban_rural_code                    0        0.0

[108 rows x 2 columns]


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

target = "diabetes_prevalence"

X = df.drop(columns=[target])
y = df[target]

X = X.select_dtypes(include=["number"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

linear_model = LinearRegression()
linear_model.fit(X_train, y_train)

y_pred = linear_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Target: {target}")
print(f"Features used: {X.shape[1]}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")

Target: diabetes_prevalence
Features used: 105
MAE: 0.0351
RMSE: 0.0439
R²: 0.9998


In [11]:
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Baseline model: Linear Regression
baseline_model = LinearRegression()
baseline_model.fit(X_train, y_train)
y_pred_baseline = baseline_model.predict(X_test)

# Lasso model with automatic alpha selection
lasso_model = Pipeline([
    ("scaler", StandardScaler()),
    ("lasso", LassoCV(cv=5, random_state=42, max_iter=10000))
])
lasso_model.fit(X_train, y_train)
y_pred_lasso = lasso_model.predict(X_test)

comparison_df = pd.DataFrame({
    "Model": ["Baseline LinearRegression", "LassoCV"],
    "MAE": [
        mean_absolute_error(y_test, y_pred_baseline),
        mean_absolute_error(y_test, y_pred_lasso),
    ],
    "RMSE": [
        np.sqrt(mean_squared_error(y_test, y_pred_baseline)),
        np.sqrt(mean_squared_error(y_test, y_pred_lasso)),
    ],
    "R2": [
        r2_score(y_test, y_pred_baseline),
        r2_score(y_test, y_pred_lasso),
    ],
})

comparison_df

Unnamed: 0,Model,MAE,RMSE,R2
0,Baseline LinearRegression,0.035132,0.043885,0.999761
1,LassoCV,0.034841,0.046936,0.999726


In [12]:
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import pandas as pd

# Optimized linear model (Ridge with CV)
alphas = np.logspace(-4, 2, 40)
optimized_model = Pipeline([
    ("scaler", StandardScaler()),
    ("ridge", RidgeCV(alphas=alphas, cv=5))
])

optimized_model.fit(X_train, y_train)
y_pred_optimized = optimized_model.predict(X_test)

optimized_results = pd.DataFrame({
    "Model": ["Baseline LinearRegression", "Optimized RidgeCV"],
    "MAE": [
        mean_absolute_error(y_test, y_pred_baseline),
        mean_absolute_error(y_test, y_pred_optimized),
    ],
    "RMSE": [
        np.sqrt(mean_squared_error(y_test, y_pred_baseline)),
        np.sqrt(mean_squared_error(y_test, y_pred_optimized),),
    ],
    "R2": [
        r2_score(y_test, y_pred_baseline),
        r2_score(y_test, y_pred_optimized),
    ],
})

best_alpha = optimized_model.named_steps["ridge"].alpha_
print(f"Best alpha (RidgeCV): {best_alpha:.6f}")
optimized_results

Best alpha (RidgeCV): 0.041246


Unnamed: 0,Model,MAE,RMSE,R2
0,Baseline LinearRegression,0.035132,0.043885,0.999761
1,Optimized RidgeCV,0.035028,0.043798,0.999762


In [13]:
# Final model comparison (bottom)
final_comparison = pd.DataFrame({
    "Model": ["Baseline LinearRegression", "LassoCV", "Optimized RidgeCV"],
    "MAE": [
        mean_absolute_error(y_test, y_pred_baseline),
        mean_absolute_error(y_test, y_pred_lasso),
        mean_absolute_error(y_test, y_pred_optimized),
    ],
    "RMSE": [
        np.sqrt(mean_squared_error(y_test, y_pred_baseline)),
        np.sqrt(mean_squared_error(y_test, y_pred_lasso)),
        np.sqrt(mean_squared_error(y_test, y_pred_optimized)),
    ],
    "R2": [
        r2_score(y_test, y_pred_baseline),
        r2_score(y_test, y_pred_lasso),
        r2_score(y_test, y_pred_optimized),
    ],
})

final_comparison = final_comparison.sort_values(by="RMSE", ascending=True).reset_index(drop=True)
print("Best model by RMSE:", final_comparison.loc[0, "Model"])
final_comparison

Best model by RMSE: Optimized RidgeCV


Unnamed: 0,Model,MAE,RMSE,R2
0,Optimized RidgeCV,0.035028,0.043798,0.999762
1,Baseline LinearRegression,0.035132,0.043885,0.999761
2,LassoCV,0.034841,0.046936,0.999726


## Model Comparison Summary

We compared three regression models on the same test set:

- **Optimized RidgeCV** achieved the best overall balance and the **lowest RMSE (0.043798)**.
- **Baseline LinearRegression** performed very similarly, with slightly higher RMSE.
- **LassoCV** had the lowest MAE, but higher RMSE and slightly lower R² than RidgeCV.

### Conclusion
Based on RMSE (our primary selection metric), **Optimized RidgeCV** is selected as the final model.