In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm

# Load the insurance data from the remote CSV.
df = pd.read_csv('https://www.ishelp.info/data/insurance.csv')

# Label: age. Features: every other numeric column, plus a constant
# column of ones that serves as the intercept term.
y = df['age']
X = (
    df.select_dtypes(include=np.number)
    .drop(columns=['age'])
    .assign(const=1)
)

# Fit the multiple linear regression by ordinary least squares
# and print the summary table.
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    age   R-squared:                       0.092
Model:                            OLS   Adj. R-squared:                  0.090
Method:                 Least Squares   F-statistic:                     45.32
Date:                Thu, 23 Jan 2025   Prob (F-statistic):           6.84e-28
Time:                        18:30:38   Log-Likelihood:                -5368.9
No. Observations:                1338   AIC:                         1.075e+04
Df Residuals:                    1334   BIC:                         1.077e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
bmi            0.1199      0.061      1.955      0.0

In [9]:
# Re-read the insurance data so this cell does not depend on whatever
# `df` currently holds in the kernel.
df = pd.read_csv('https://www.ishelp.info/data/insurance.csv')

# Label: bmi. Features: the remaining numeric columns and a constant
# column of ones for the intercept.
y = df['bmi']
X = (
    df.select_dtypes(include=np.number)
    .drop(columns=['bmi'])
    .assign(const=1)
)

# Ordinary least squares fit; print the full summary table.
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    bmi   R-squared:                       0.042
Model:                            OLS   Adj. R-squared:                  0.040
Method:                 Least Squares   F-statistic:                     19.54
Date:                Thu, 23 Jan 2025   Prob (F-statistic):           2.13e-12
Time:                        18:31:30   Log-Likelihood:                -4288.4
No. Observations:                1338   AIC:                             8585.
Df Residuals:                    1334   BIC:                             8606.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
age            0.0238      0.012      1.955      0.0

When predicting age, the R-squared value is 0.092, which means the model explains only about 9.2% of the variance in age. The p-value for charges is significant (below 0.05), but BMI and children aren’t statistically significant. BMI is only barely over the threshold, so it may still be useful. Similarly, when predicting BMI, the R-squared value is also low (0.042), showing that the model doesn't fit well. Charges is still statistically significant, while children and age have higher p-values, meaning they might not be contributing much to the model. Based on these results, neither model seems very strong or useful because they don’t explain much of the variance in the dependent variables.

Age=0.1199⋅BMI+0.2596⋅Children+0.0003⋅Charges+30.8256

BMI=0.0238⋅Age−0.0099⋅Children+0.00009168⋅Charges+28.5231

In [4]:
# Load the insurance data from the remote CSV.
df = pd.read_csv('https://www.ishelp.info/data/insurance.csv')

# Label: age. Features: numeric columns other than the label and
# `children`, plus a constant column of ones for the intercept.
y = df['age']
X = (
    df.select_dtypes(include=np.number)
    .drop(columns=['age', 'children'])
    .assign(const=1)
)

# Fit the reduced regression and print its summary table.
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    age   R-squared:                       0.092
Model:                            OLS   Adj. R-squared:                  0.091
Method:                 Least Squares   F-statistic:                     67.64
Date:                Thu, 23 Jan 2025   Prob (F-statistic):           1.05e-28
Time:                        18:30:38   Log-Likelihood:                -5369.3
No. Observations:                1338   AIC:                         1.074e+04
Df Residuals:                    1335   BIC:                         1.076e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
bmi            0.1198      0.061      1.955      0.0

In [5]:
# Load the insurance data from the remote CSV.
df = pd.read_csv('https://www.ishelp.info/data/insurance.csv')

# Label: bmi. Features: numeric columns other than the label and
# `children`, plus a constant column of ones for the intercept.
y = df['bmi']
X = (
    df.select_dtypes(include=np.number)
    .drop(columns=['bmi', 'children'])
    .assign(const=1)
)

# Fit the reduced regression and print its summary table.
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                    bmi   R-squared:                       0.042
Model:                            OLS   Adj. R-squared:                  0.041
Method:                 Least Squares   F-statistic:                     29.32
Date:                Thu, 23 Jan 2025   Prob (F-statistic):           3.44e-13
Time:                        18:30:39   Log-Likelihood:                -4288.4
No. Observations:                1338   AIC:                             8583.
Df Residuals:                    1335   BIC:                             8598.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
age            0.0238      0.012      1.955      0.0

Based on the regression results, children is the better variable to remove. Its p-value is much higher than 0.05 in both models (0.394 and 0.942), showing it's not a significant predictor for age or BMI. Removing it simplifies the models without losing meaningful explanatory power.

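After removing children, the R-squared and Adjusted R-squared values stay almost the same, but the F-statistic increases: from 45.32 to 67.64 for age and from 19.54 to 29.32 for BMI. Because R-squared is unchanged, this increase does not mean the fit itself improved; the F-statistic rises because the same explained variance is now attributed to two predictors instead of three. The reduced models are therefore more efficient: they explain just as much with fewer terms.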

While the R-squared values are still low, the models without children are more parsimonious. The higher F-statistic and the slight boost in Adjusted R-squared (0.090 to 0.091 for age, 0.040 to 0.041 for BMI) suggest the reduced models do at least as well with fewer terms, capturing the relationships between age, bmi, and charges just as effectively. So, removing children makes the models simpler and more efficient without giving up explanatory power.

In [6]:
# Read the counties data from the local CSV and preview the first five rows.
df = pd.read_csv('counties.csv')
df.head(n=5)

Unnamed: 0,RN,State,County,landarea,totpop,physician,enroll,percpub,civlabor,unemp,farmpop,numfarm,farmacre,fedgrant,fedciv,milit,veterans,percviet
0,27,AL,Escambia,948,36023,24,6931,95.4,15247,1339,531,414,90646,122.3,85,370,3723,27.1
1,48,AL,Marshall,567,73524,44,11928,98.6,38803,3189,1592,1582,136599,235.7,316,748,8510,29.1
2,85,AK,Prince of Wales,7325,6408,7,1317,98.6,2787,383,71,2,214,32.2,126,63,809,44.6
3,126,AR,Cross,616,19261,11,4066,99.1,8336,704,762,492,339830,81.4,87,107,1505,23.9
4,158,AR,Newton,823,7649,3,1579,99.2,3280,270,600,562,98106,31.7,71,44,807,25.5


In [7]:
# Label: fedgrant.
y = df['fedgrant']

# Features: numeric columns minus the label and the excluded fields,
# plus a constant column of ones for the intercept.
X = (
    df.select_dtypes(include=np.number)
    .drop(columns=[
        'fedgrant', 'unemp', 'enroll', 'RN', 'landarea', 'totpop',
        'farmpop', 'milit', 'numfarm', 'farmacre', 'percviet',
    ])
    .assign(const=1)
)

# Fit the multiple linear regression and print its summary table.
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               fedgrant   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.998
Method:                 Least Squares   F-statistic:                 1.283e+04
Date:                Thu, 23 Jan 2025   Prob (F-statistic):          1.45e-131
Time:                        18:30:39   Log-Likelihood:                -581.84
No. Observations:                 100   AIC:                             1176.
Df Residuals:                      94   BIC:                             1191.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
physician      1.0853      0.037     29.151      0.0

In [8]:
# Label: fedgrant.
y = df['fedgrant']

# Features: same exclusions as the previous fedgrant model, with
# `percpub` dropped as well; a constant column of ones is the intercept.
X = (
    df.select_dtypes(include=np.number)
    .drop(columns=[
        'fedgrant', 'unemp', 'enroll', 'RN', 'landarea', 'totpop',
        'farmpop', 'percpub', 'milit', 'numfarm', 'farmacre', 'percviet',
    ])
    .assign(const=1)
)

# Fit the multiple linear regression and print its summary table.
model = sm.OLS(endog=y, exog=X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:               fedgrant   R-squared:                       0.998
Model:                            OLS   Adj. R-squared:                  0.998
Method:                 Least Squares   F-statistic:                 1.561e+04
Date:                Thu, 23 Jan 2025   Prob (F-statistic):          6.51e-133
Time:                        18:30:39   Log-Likelihood:                -583.74
No. Observations:                 100   AIC:                             1177.
Df Residuals:                      95   BIC:                             1191.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
physician      1.0842      0.038     28.730      0.0

The model without "percpub" seems slightly preferable because it is simpler: both models have the same Adjusted R-squared (0.998), and R-squared only drops from 0.999 to 0.998. Its higher F-statistic (1.561e+04 vs. 1.283e+04) mainly reflects having one fewer predictor rather than a better fit. The model without "percpub" also has slightly stronger relationships between predictors and federal grants. The final model without "percpub" is:

fedgrant = 1.0842 * physician + (-0.0041) * civlabor + 0.0486 * fedciv + 0.0259 * veterans + 37.0678

Holding the other predictors constant, this means that more physicians, federal employees, and veterans are linked to higher federal grants, while a larger civilian labor force is associated with fewer grants.