<a href="https://colab.research.google.com/github/EugeneHsiung/datasci_5_statistics/blob/main/datasci_5_statistics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import packages

In [1]:
import pandas as pd
from scipy.stats import ttest_ind
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
from statsmodels.formula.api import ols

# Chi Sqaure

Is there a relationship between HHA and PMD?

In [2]:
# Read in the data
df = pd.read_csv('Order and Referring.csv')
df

Unnamed: 0,NPI,LAST_NAME,FIRST_NAME,PARTB,DME,HHA,PMD
0,1558467555,.MCINDOE,THOMAS,Y,Y,Y,Y
1,1417051921,A BELLE,N,Y,Y,Y,Y
2,1972040137,A NOVOTNY,ELIZABETH,Y,Y,Y,Y
3,1760465553,A SATTAR,MUHAMMAD,Y,Y,Y,Y
4,1295400745,A'NEAL,BROGAN,Y,Y,N,N
...,...,...,...,...,...,...,...
1798884,1336502301,ZYZO,JOHN,Y,Y,Y,N
1798885,1225502768,ZZIWA,JACKIE,N,Y,N,Y
1798886,1124277249,ZZIWA-KABENGE,IRYNE,Y,Y,Y,Y
1798887,1033160296,ZZIWAMBAZZA,NATHAN,Y,Y,Y,Y


In [7]:
#value counts of HHA and PMD
df['HHA'].value_counts()      #HHA: Home Health Agency
df['PMD'].value_counts()      #PMD: Power Mobility Devices

Y    1487883
N     311006
Name: PMD, dtype: int64

In [4]:
contingency_table = pd.crosstab(df['PMD'], df['HHA'])
print(contingency_table)

HHA       N        Y
PMD                 
N    167986   143020
Y     35335  1452548


In [6]:
# Chi square test
chi2, p, expected, degreesoffreedom = chi2_contingency(contingency_table)
print(f"Chi2 value: {chi2}")
print(f"P-value: {p}")
print(f"expected: {expected}")
print(f"degreesoffreedom: {degreesoffreedom}")

Chi2 value: 684218.8366913624
P-value: 0.0
expected: 1
degreesoffreedom: [[  35151.72471787  275854.27528213]
 [ 168169.27528213 1319713.72471787]]


# Is there a relationship between HHA and PMD?
####Null Hypothesis (H0): There is no relationship between HHA and PMD.
####Hypothesis1 (H1): There is a relationship between HHA and PMD.

#### The P value is 0 which is smaller than 0.05 meaning there is a relationship between HHA and PMD. H0 would be rejected.

# T- Test

In [29]:
# Read in the data
df = pd.read_csv('Medicare_Telehealth_Trends_Q1_2023.csv')
df.head(30)

Unnamed: 0,Year,quarter,Bene_Geo_Desc,Bene_Mdcd_Mdcr_Enrl_Stus,Bene_Race_Desc,Bene_Sex_Desc,Bene_Mdcr_Entlmt_Stus,Bene_Age_Desc,Bene_RUCA_Desc,Total_Bene_TH_Elig,Total_PartB_Enrl,Total_Bene_Telehealth,Pct_Telehealth
0,2023,1,Alabama,All,All,All,Aged,All,All,272828.0,338844.0,17353.0,0.0636
1,2023,1,Alabama,All,All,All,All,0-64,All,40304.0,57509.0,4916.0,0.122
2,2023,1,Alabama,All,All,All,All,65-74,All,132911.0,173380.0,8814.0,0.0663
3,2023,1,Alabama,All,All,All,All,75-84,All,101540.0,119882.0,6290.0,0.0619
4,2023,1,Alabama,All,All,All,All,85 and over,All,40435.0,47623.0,2438.0,0.0603
5,2023,1,Alabama,All,All,All,All,All,All,315190.0,398394.0,22458.0,0.0713
6,2023,1,Alabama,All,All,All,All,All,Rural,89729.0,113891.0,5928.0,0.0661
7,2023,1,Alabama,All,All,All,All,All,Urban,224778.0,283581.0,16454.0,0.0732
8,2023,1,Alabama,All,All,All,Disabled,All,All,38287.0,55395.0,4722.0,0.1233
9,2023,1,Alabama,All,All,All,ESRD,All,All,4075.0,4155.0,383.0,0.094


In [32]:
# Separate the Gender into Male and Female

Alabama_data = df[df['Bene_Geo_Desc'] == 'Alabama']['Total_Bene_TH_Elig']
Alaska_data = df[df['Bene_Geo_Desc'] == 'Alaska']['Total_Bene_TH_Elig']

print(Alabama_data)
print(Alaska_data)

0     272828.0
1      40304.0
2     132911.0
3     101540.0
4      40435.0
5     315190.0
6      89729.0
7     224778.0
8      38287.0
9       4075.0
10    179810.0
11    135380.0
12       266.0
13      1818.0
14     44165.0
15      2645.0
16    261439.0
17     48785.0
18    266405.0
Name: Total_Bene_TH_Elig, dtype: float64
19    55909.0
20     6802.0
21    33302.0
22    17892.0
23     5016.0
24    63012.0
25    29776.0
26    30994.0
27     6472.0
28      631.0
29    34289.0
30    28723.0
31     7300.0
32     2390.0
33     1530.0
34     1600.0
35    47439.0
36    14432.0
37    48580.0
Name: Total_Bene_TH_Elig, dtype: float64


In [33]:
t_stat, p_val = ttest_ind(Alabama_data, Alaska_data, equal_var=False)

In [34]:
print(f"T-statistic: {t_stat}")
print(f"P-value: {p_val}")

T-statistic: 3.731896988535638
P-value: 0.001382987445231886


In [35]:
Alabama_mean = Alabama_data.mean()
Alaska_mean = Alaska_data.mean()

print(f"Mean prevalence for Alabama: {Alabama_mean}")
print(f"Mean prevalence for Alaska: {Alaska_mean}")

Mean prevalence for Alabama: 115831.05263157895
Mean prevalence for Alaska: 22952.052631578947
