# Exploratory Data Analysis (EDA) & Regression Analysis

## Load & Inspect Data
This step loads the dataset, checks its shape, and provides a basic overview.

In [11]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Load the dataset from the remote CSV
DATA_URL = 'https://raw.githubusercontent.com/4GeeksAcademy/regularized-linear-regression-project-tutorial/main/demographic_health_data.csv'
health_data = pd.read_csv(DATA_URL)

# Dataset overview: shape plus the column/dtype/memory summary
print("\n🔹 Dataset Shape:", health_data.shape)
print("\n🔹 Dataset Info:\n")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
health_data.info()


🔹 Dataset Shape: (3140, 108)

🔹 Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3140 entries, 0 to 3139
Columns: 108 entries, fips to Urban_rural_code
dtypes: float64(61), int64(45), object(2)
memory usage: 2.6+ MB
None


# Explore here

### Handle Missing Values & Duplicates
This step removes duplicate rows and fills missing values using the median for numeric columns.

## Data Preprocessing
- Handles missing values
- Removes duplicates
- Encodes categorical variables
- Drops highly correlated features

In [12]:
# Drop exact duplicate rows, then fill the remaining gaps in numeric columns
# with that column's median. Columns without a median entry (non-numeric ones)
# are not touched by the fill.
health_data = health_data.drop_duplicates()
numeric_medians = health_data.median(numeric_only=True)
health_data = health_data.fillna(numeric_medians)

### Encode Categorical Variables
Factorizes categorical columns into numerical representations.

In [13]:
# Integer-encode every object-dtype column with factorize() (one code per
# distinct value) instead of expanding it into dummy columns.
categorical_cols = health_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    # factorize() returns (codes, uniques); only the codes are kept.
    health_data[col] = pd.factorize(health_data[col])[0]

### Drop Highly Correlated Features
Drops every feature whose absolute correlation with any column appearing before it exceeds 0.9, to reduce multicollinearity.

In [14]:
# Reduce multicollinearity: drop each column that is strongly correlated
# (|r| > 0.9) with at least one column that precedes it in the frame.
corr_matrix = health_data.corr().abs()

# Mask everything except the strict upper triangle so the diagonal is ignored
# and each pair of columns is inspected exactly once.
strict_upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_triangle = corr_matrix.where(strict_upper_mask)

# Masked cells are NaN, and NaN > 0.9 is False, so they never trigger a drop.
exceeds_threshold = (upper_triangle > 0.9).any(axis=0)
to_drop = upper_triangle.columns[exceeds_threshold.to_numpy()].tolist()

health_data = health_data.drop(columns=to_drop)
print(f"\n🔹 Dropped {len(to_drop)} highly correlated features.")


🔹 Dropped 63 highly correlated features.


## Data Normalization
Standardizes every column with `StandardScaler` (zero mean, unit variance) so that the regularization penalty treats all features on a comparable scale.

In [15]:
# Standardize every column ahead of the regularized models.
# NOTE(review): the scaler is fit on the whole frame, which still contains the
# target column and the rows that later end up in the test split. Confirm this
# is acceptable, or fit the scaler on the training features only.
scaler = StandardScaler().fit(health_data)
health_data_scaled = scaler.transform(health_data)

# transform() returns a bare array, so rebuild the frame with the original
# column labels (the rebuilt frame gets a fresh default index).
column_labels = health_data.columns
health_data = pd.DataFrame(health_data_scaled, columns=column_labels)

# Summary of the processed dataset
print("\n🔹 Final Dataset Shape:", health_data.shape)
print("\n🔹 Sample Processed Data:\n", health_data.head())
print("\n🔹 Column names:\n", health_data.columns)


🔹 Final Dataset Shape: (3140, 45)

🔹 Sample Processed Data:
        fips   TOT_POP  0-9 y/o % of total pop  10-19 y/o % of total pop  \
0 -1.940874 -0.145679                0.158006                  0.573496   
1 -1.940742  0.341296               -0.242861                 -0.193107   
2 -1.940610 -0.237785               -0.419441                 -0.439718   
3 -1.940478 -0.245223               -0.426966                 -0.609076   
4 -1.940346 -0.138966                0.186249                  0.216679   

   20-29 y/o % of total pop  30-39 y/o % of total pop  \
0                  0.027610                  0.588469   
1                 -0.469965                 -0.110300   
2                  0.272104                  0.656538   
3                  0.396168                  1.264959   
4                 -0.200808                  0.088582   

   40-49 y/o % of total pop  50-59 y/o % of total pop  \
0                  1.515069                  0.263445   
1                  0.715673   

## Train-Test Split
Divides the dataset into training and testing sets to evaluate model performance.

In [16]:
# Hold out 20% of the rows so the models can be scored on data they were not fit on.
from sklearn.model_selection import train_test_split

# The target is the obesity prevalence column. The feature matrix excludes the
# target itself plus three columns that are presumably identifiers (FIPS codes
# and county name) - verify against the data dictionary.
non_feature_columns = ['fips', 'CNTY_FIPS', 'COUNTY_NAME', 'Obesity_prevalence']
y = health_data['Obesity_prevalence']
X = health_data.drop(columns=non_feature_columns)

# Fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=117
)

# Imported here for the linear regression cell that follows.
from sklearn.linear_model import LinearRegression


## Linear Regression Model
Fits a basic Linear Regression model and evaluates its performance.

### Fit Linear Regression Model
Trains a basic linear regression model and evaluates its performance using MSE and R² score.

In [17]:
from sklearn.metrics import mean_squared_error, r2_score

# Ordinary least squares baseline; fit() returns the estimator itself.
model = LinearRegression().fit(X_train, y_train)

print(f'Model intercept: {model.intercept_}')
print(f'Model coefficients: {model.coef_}')

# Score the baseline on the held-out split.
predictions = model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f'Model mean squared error: {mse}')
print(f'Model r2 score: {r2}')

Model intercept: 7.736401244703463e-05
Model coefficients: [-1.13585036e-02  1.99482175e+06  1.70459095e+06  2.93604793e+06
  1.59335323e+06  1.28938551e+06  1.39152443e+06  2.37838409e+06
  2.08012784e+06  1.44161498e+06  3.04120110e+07  1.56339160e-02
  2.69321900e+07 -1.64287148e-02  1.43642682e+07 -6.11199541e-03
  5.31468898e+06  1.13117249e-02  1.80055127e+06  2.87297448e+06
  6.73692216e-03 -4.96255976e-02  4.22269680e-02  4.74303985e-02
  1.14765698e-02 -1.34418068e-02  3.73761579e-02  1.27718314e-01
  7.62983433e-02  9.63972481e-02 -1.49968964e-02  2.95391287e-02
  2.54505102e-04 -2.22292999e-02 -1.89619791e-02  3.55547433e-03
  1.07988165e+00  5.58973543e-02 -3.03194341e-01  3.20326656e-02
  1.00094801e-02]
Model mean squared error: 0.10964813436065668
Model r2 score: 0.8871106154713545


## Lasso Regression (L1 Regularization)
- Applies Lasso regression to improve generalization.
- Tunes hyperparameters using GridSearchCV.
- Evaluates optimized Lasso model performance.

### Apply Lasso Regression
Implements Lasso regression (L1 regularization) to improve generalization and feature selection.

### Hyperparameter Tuning for Lasso Regression
Uses GridSearchCV to optimize the alpha parameter for Lasso regression.

### Evaluate Optimized Lasso Model
Re-trains the Lasso model with the best alpha and evaluates its performance.

In [18]:
# Apply Lasso Regression (L1 Regularization) to Improve Generalization
import warnings

from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV


def _evaluate_lasso(fitted_model, features, target):
    """Predict with a fitted Lasso model and print its MSE and R² score.

    Parameters:
        fitted_model: estimator that has already been fit.
        features: feature matrix to predict on.
        target: true target values matching ``features``.

    Returns:
        Tuple ``(predictions, mse, r2)`` for the given data.
    """
    predicted = fitted_model.predict(features)
    error = mean_squared_error(target, predicted)
    score = r2_score(target, predicted)
    print(f'L1 model mean squared error: {error}')
    print(f'L1 model r2 score: {score}')
    return predicted, error, score


# Baseline Lasso with a hand-picked penalty strength
l1_model = Lasso(alpha=0.1,max_iter=200)
l1_model.fit(X_train, y_train)
l1_predictions, l1_mse, l1_r2 = _evaluate_lasso(l1_model, X_test, y_test)

# Hyperparameter Tuning for Lasso Regression using GridSearchCV
hyperparams = {
    'alpha': np.arange(0.0001, 1.0, 0.01)
}

grid = GridSearchCV(l1_model, hyperparams, scoring='r2', cv=5)

# Silence warnings (e.g. convergence or compatibility notices) only while the
# search and the final refit run. The context manager restores the previous
# warning filters on exit, so later cells still surface warnings normally and
# the standard-library `warnings.warn` function is never replaced.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')

    grid.fit(X_train, y_train)
    best_alpha = grid.best_params_['alpha']
    print(f'Best alpha: {best_alpha}')

    # Re-run optimized Lasso model
    l1_model = Lasso(alpha=best_alpha,max_iter=200)
    l1_model.fit(X_train, y_train)

l1_predictions, l1_mse, l1_r2 = _evaluate_lasso(l1_model, X_test, y_test)


L1 model mean squared error: 0.17380580276166482
L1 model r2 score: 0.8210564163660614
Best alpha: 0.0101
L1 model mean squared error: 0.1125432522060905
L1 model r2 score: 0.8841299165874691
