In [37]:
import pandas as pd
import matplotlib as plt
from scipy import stats
import plotly as py
import plotly.graph_objects as go
import numpy as np
from sklearn.linear_model import LinearRegression
from statsmodels.formula.api import ols

**Importing csv file and first 5 rows**

In [38]:
df = pd.read_csv("csv-1.csv")
df.head()

Unnamed: 0,Indicator,Group,State,Subgroup,Week,Week Label,Value,Low CI,High CI,Confidence Interval,Quartile range
0,Symptoms of Depressive Disorder,National Estimate,United States,United States,1,Apr 23 - May 5,23.5,22.7,24.3,22.7 - 24.3,
1,Symptoms of Depressive Disorder,By Age,United States,18 - 29 years,1,Apr 23 - May 5,32.7,30.2,35.2,30.2 - 35.2,
2,Symptoms of Depressive Disorder,By Age,United States,30 - 39 years,1,Apr 23 - May 5,25.7,24.1,27.3,24.1 - 27.3,
3,Symptoms of Depressive Disorder,By Age,United States,40 - 49 years,1,Apr 23 - May 5,24.8,23.3,26.2,23.3 - 26.2,
4,Symptoms of Depressive Disorder,By Age,United States,50 - 59 years,1,Apr 23 - May 5,23.2,21.5,25.0,21.5 - 25.0,


**ANXIETY**

In [39]:
mask = df["Indicator"] == "Symptoms of Anxiety Disorder"
anx_df = df[mask]

anx_df.head()

Unnamed: 0,Indicator,Group,State,Subgroup,Week,Week Label,Value,Low CI,High CI,Confidence Interval,Quartile range
70,Symptoms of Anxiety Disorder,National Estimate,United States,United States,1,Apr 23 - May 5,30.8,30.0,31.7,30.0 - 31.7,
71,Symptoms of Anxiety Disorder,By Age,United States,18 - 29 years,1,Apr 23 - May 5,40.2,38.1,42.4,38.1 - 42.4,
72,Symptoms of Anxiety Disorder,By Age,United States,30 - 39 years,1,Apr 23 - May 5,34.4,32.6,36.1,32.6 - 36.1,
73,Symptoms of Anxiety Disorder,By Age,United States,40 - 49 years,1,Apr 23 - May 5,34.1,32.1,36.2,32.1 - 36.2,
74,Symptoms of Anxiety Disorder,By Age,United States,50 - 59 years,1,Apr 23 - May 5,31.0,29.0,33.1,29.0 - 33.1,


**Fitting Multiple Linear Regression Model for Anxiety**

We'll build our model based on the following predictors:

- Group

- State

- Subgroup

- Week (Same as Week Label)

$$y = \beta_0 + \beta_1*x_1 + \beta_2*x_2 + \beta_3*x_3 + \beta_4*x_4$$

where $y$ is the **Value**, $\beta_1$ is the **Group**, $\beta_2$ is the **State**, $\beta_3$ is the **Subgroup**, and $\beta_4$ is the **Week**.

**Fitting Model and Model Summary**

In [40]:
# fit the model using the formula above
# C() indicates categorical
model = ols('Value ~ C(Group) + C(State) + C(Subgroup) + C(Week)', data = anx_df).fit()

model.summary()

0,1,2,3
Dep. Variable:,Value,R-squared:,0.773
Model:,OLS,Adj. R-squared:,0.732
Method:,Least Squares,F-statistic:,18.77
Date:,"Sun, 23 Jul 2023",Prob (F-statistic):,1.7999999999999998e-94
Time:,09:49:04,Log-Likelihood:,-1119.1
No. Observations:,490,AIC:,2390.0
Df Residuals:,414,BIC:,2709.0
Df Model:,75,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,23.6326,0.383,61.745,0.000,22.880,24.385
C(Group)[T.By Education],-7.4686,0.873,-8.551,0.000,-9.185,-5.752
C(Group)[T.By Gender],-7.2000,0.797,-9.031,0.000,-8.767,-5.633
C(Group)[T.By Race/Hispanic ethnicity],-7.4000,0.891,-8.302,0.000,-9.152,-5.648
C(Group)[T.By State],5.7088,0.343,16.662,0.000,5.035,6.382
C(Group)[T.National Estimate],-5.3500,0.690,-7.748,0.000,-6.707,-3.993
C(State)[T.Alaska],-0.1925,0.484,-0.398,0.691,-1.143,0.758
C(State)[T.Arizona],0.5646,0.484,1.167,0.244,-0.386,1.515
C(State)[T.Arkansas],1.3932,0.484,2.880,0.004,0.442,2.344

0,1,2,3
Omnibus:,15.595,Durbin-Watson:,1.933
Prob(Omnibus):,0.0,Jarque-Bera (JB):,31.228
Skew:,0.119,Prob(JB):,1.66e-07
Kurtosis:,4.213,Cond. No.,1.34e+17


**Investigating significant variables**

Use a significance level of 0.05

In [41]:
mask = model.pvalues < 0.05

p_df = pd.DataFrame(model.pvalues[mask])
p_df = p_df.reset_index()
p_df = p_df.rename(columns = {"index": "variable",
                              0:"p-value"})
p_df

Unnamed: 0,variable,p-value
0,Intercept,5.714514e-211
1,C(Group)[T.By Education],2.381375e-16
2,C(Group)[T.By Gender],6.538261e-18
3,C(Group)[T.By Race/Hispanic ethnicity],1.470478e-15
4,C(Group)[T.By State],4.524857e-48
...,...,...
68,C(Subgroup)[T.West Virginia],1.190064e-03
69,C(Subgroup)[T.Wisconsin],2.460758e-05
70,C(Subgroup)[T.Wyoming],3.982763e-07
71,C(Week)[T.3],3.635844e-08


**Significant Groups**

In [42]:
mask = p_df['variable'].str.contains("Group")

p_df[mask]

Unnamed: 0,variable,p-value
1,C(Group)[T.By Education],2.381375e-16
2,C(Group)[T.By Gender],6.538261e-18
3,C(Group)[T.By Race/Hispanic ethnicity],1.470478e-15
4,C(Group)[T.By State],4.524857e-48
5,C(Group)[T.National Estimate],7.260608e-14


**Significant States**

In [43]:
mask = p_df['variable'].str.contains("State")

p_df[mask]

Unnamed: 0,variable,p-value
4,C(Group)[T.By State],4.524857e-48
6,C(State)[T.Arkansas],0.004183268
7,C(State)[T.California],0.0002667117
8,C(State)[T.District of Columbia],0.01404621
9,C(State)[T.Florida],6.591013e-05
10,C(State)[T.Georgia],0.007808482
11,C(State)[T.Hawaii],3.889398e-06
12,C(State)[T.Idaho],0.002891962
13,C(State)[T.Illinois],0.002159793
14,C(State)[T.Indiana],0.004796782


**Significant Subgroups**

In [44]:
mask = p_df['variable'].str.contains("Subgroup")

p_df[mask]

Unnamed: 0,variable,p-value
32,C(Subgroup)[T.30 - 39 years],1.603821e-05
33,C(Subgroup)[T.40 - 49 years],8.654909e-09
34,C(Subgroup)[T.50 - 59 years],1.47367e-13
35,C(Subgroup)[T.60 - 69 years],6.006271e-33
36,C(Subgroup)[T.70 - 79 years],7.030799e-54
37,C(Subgroup)[T.80 years and above],1.650853e-61
38,C(Subgroup)[T.Arkansas],0.004183268
39,C(Subgroup)[T.Bachelor's degree or higher],6.6650979999999995e-19
40,C(Subgroup)[T.California],0.0002667117
41,C(Subgroup)[T.District of Columbia],0.01404621


**Significant Weeks**

In [45]:
mask = p_df['variable'].str.contains("Week")

p_df[mask]

Unnamed: 0,variable,p-value
71,C(Week)[T.3],3.635844e-08
72,C(Week)[T.4],0.001108777
