In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [10]:
# Location of the CSV file (served from raw.githubusercontent.com).
df_url = "https://raw.githubusercontent.com/EZags/Milliman-Project-2025/main/merged_df1.csv"
# Read the CSV and keep only the numeric columns in a single chained step.
df = pd.read_csv(df_url).select_dtypes(include=np.number)

In [11]:
# Report how many missing values each column of df contains.
print('Number of nulls in each column')
null_values = df.isna().sum()  # isna() is the same check as isnull()
print(null_values)

Number of nulls in each column
fips                                                                             0
population                                                                       0
dep_prevalence                                                                   0
Households Total                                                              2396
Households Mean income (dollars)                                              2396
Households Median income (dollars)                                            2396
Premature Death Deaths                                                         341
Premature Death Years of Potential Life Lost Rate                              341
Poor Physical Health Days Average Number of Physically Unhealthy Days          123
Low Birth Weight % Low Birth Weight                                            207
Poor Mental Health Days Average Number of Mentally Unhealthy Days              123
Poor or Fair Health % Fair or Poor Health               

In [12]:
# Keep only the rows that have a value in every column
# (axis=0 / how='any' are the defaults, written out here for clarity).
df = df.dropna(axis=0, how='any')

In [13]:
# Name of the column the model will predict.
TARGET = 'dep_prevalence'

# Features: every column of df except the target and 'fips'.
# drop() returns a new frame, so df itself is left unchanged.
X = df.drop(columns=[TARGET, 'fips'])
# Target: selected with double brackets so Y is a one-column DataFrame, not a Series.
Y = df[[TARGET]]

In [14]:
# Split features and target together, reserving 30% of the rows for testing.
# The fixed random_state makes the split repeatable across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=707,
)

In [15]:
# Create the linear regression estimator and fit it on the training split.
# The fit call is left as the last expression so the cell still displays the estimator.
lr = LinearRegression(copy_X=True)
lr.fit(X=X_train, y=Y_train)

In [16]:
# Show the fitted intercept. intercept_ is indexed with [0] to pull out its first
# entry (presumably the only one, since Y is a single-column DataFrame — confirm).
lr.intercept_[0]

np.float64(16.212869335182916)

In [None]:

# Tabulate the fitted coefficients next to the training column names.
# coef_[0] takes the first row of the coefficient array.
lr_coefficients = pd.DataFrame(
    data={'Feature': X_train.columns, 'Coefficient': lr.coef_[0]},
)

# Preview the first five rows of the table.
lr_coefficients.head(n=5)

Unnamed: 0,Feature,Coefficient
0,population,7.356712e-07
1,Households Total,4.275106e-05
2,Households Mean income (dollars),4.93688e-07
3,Households Median income (dollars),-3.272582e-05
4,Premature Death Deaths,-0.0004388066


In [18]:
# Predictions of the fitted model on the training features (used for the training metrics).
Y_pred_train = lr.predict(X=X_train)

In [19]:
# Score the fitted model on the training split and print the result,
# which this notebook labels as R-squared.
lr_train_r2 = lr.score(X=X_train, y=Y_train)
print("Linear Regression Train R-squared:", lr_train_r2)

Linear Regression Train R-squared: 0.4775464977075984


In [20]:
# Training RMSE: the square root of the mean squared error between the
# actual training targets and the model's training predictions.
lr_train_rmse = np.sqrt(
    mean_squared_error(y_true=Y_train, y_pred=Y_pred_train)
)
print("Linear Regression Train RMSE:", lr_train_rmse)

Linear Regression Train RMSE: 2.09894439611473
