In [1]:
import pandas as pd
import scipy.stats as stats

In [2]:
# Mount Google Drive at /content/drive so the CSV path used by the data-loading cell resolves.
# NOTE(review): google.colab is presumably only importable inside a Colab runtime — confirm before running locally.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Read the AFib extract from the mounted Drive folder and preview the first rows
afib_csv_path = '/content/drive/MyDrive/OSU/Programming for Data Science I/Python/Data/afib.csv'
afib_df = pd.read_csv(afib_csv_path)

afib_df.head()

Unnamed: 0,Eid,Race,Gender,Discharge,AdTm,DscTm,Drug,Age
0,15000111,African American,Female,Expired,8/19/2015 17:56,8/23/2015 19:55,diltiazem,82
1,15000111,African American,Female,Expired,8/19/2015 17:56,8/23/2015 19:55,digoxin,82
2,15000112,African American,Female,Expired,6/6/2015 22:34,6/13/2015 10:30,diltiazem,86
3,15000112,African American,Female,Expired,6/6/2015 22:34,6/13/2015 10:30,digoxin,86
4,15000113,African American,Female,Expired,2/23/2015 13:33,2/27/2015 17:34,diltiazem,69


In [4]:
### Drop Duplicates
print(afib_df.shape)
# 'Unnamed: 0' looks like the row counter saved with the CSV (0, 1, 2, ... in the
# preview). Because it differs on every row, comparing whole rows can never
# find a duplicate, so compare on the remaining columns instead.
dedup_columns = [col for col in afib_df.columns if col != 'Unnamed: 0']
afib_df = afib_df.drop_duplicates(subset=dedup_columns)
print(afib_df.shape)

(3383, 8)
(3383, 8)


In [7]:
### Remove Multi-Drug Patients
# Number of distinct drugs recorded for each Eid
drugs_per_patient = afib_df.groupby('Eid')['Drug'].nunique()
# Eids that appear with more than one drug
multi_drug_ids = drugs_per_patient.index[drugs_per_patient > 1]
# Keep only the rows whose Eid is not in that set
keep_row = ~afib_df['Eid'].isin(multi_drug_ids)
afib_df = afib_df.loc[keep_row]
print(afib_df.shape)
afib_df.head()

(3369, 9)


Unnamed: 0,Eid,Race,Gender,Discharge,AdTm,DscTm,Drug,Age,LOS
8,15000115,African American,Female,Expired,3/13/2015 21:35,4/7/2015 12:02,digoxin,74,24.602083
9,15000116,African American,Female,Expired,3/5/2015 9:52,3/5/2015 15:15,digoxin,67,0.224306
10,15000117,African American,Female,Expired,4/14/2015 19:43,4/15/2015 13:53,digoxin,64,0.756944
11,15000118,African American,Female,Expired,6/28/2015 15:14,6/29/2015 19:14,digoxin,83,1.166667
12,15000119,African American,Female,Expired,9/1/2015 20:28,9/2/2015 11:45,digoxin,78,0.636806


In [8]:
# Length of stay (LOS) in fractional days: discharge time minus admission time
admitted_at = pd.to_datetime(afib_df['AdTm'])
discharged_at = pd.to_datetime(afib_df['DscTm'])
seconds_per_day = 24 * 60 * 60

# afib_df is the result of a boolean filter at this point; assign() builds a new
# frame instead of adding a column to that slice, which avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
afib_df = afib_df.assign(
    LOS=(discharged_at - admitted_at).dt.total_seconds() / seconds_per_day
)

afib_df.head()

Unnamed: 0,Eid,Race,Gender,Discharge,AdTm,DscTm,Drug,Age,LOS
8,15000115,African American,Female,Expired,3/13/2015 21:35,4/7/2015 12:02,digoxin,74,24.602083
9,15000116,African American,Female,Expired,3/5/2015 9:52,3/5/2015 15:15,digoxin,67,0.224306
10,15000117,African American,Female,Expired,4/14/2015 19:43,4/15/2015 13:53,digoxin,64,0.756944
11,15000118,African American,Female,Expired,6/28/2015 15:14,6/29/2015 19:14,digoxin,83,1.166667
12,15000119,African American,Female,Expired,9/1/2015 20:28,9/2/2015 11:45,digoxin,78,0.636806


In [9]:
### Remove Outliers
# Upper cut-off: 99th percentile of LOS. Lower cut-off: 1/6 of a day (4 hours).
los_upper = afib_df['LOS'].quantile(0.99)
los_lower = 1 / 6
# Both bounds are exclusive
condition = afib_df['LOS'].between(los_lower, los_upper, inclusive='neither')
afib_df = afib_df.loc[condition]
print(afib_df.shape)
afib_df.head(10)

(3057, 9)


Unnamed: 0,Eid,Race,Gender,Discharge,AdTm,DscTm,Drug,Age,LOS
8,15000115,African American,Female,Expired,3/13/2015 21:35,4/7/2015 12:02,digoxin,74,24.602083
9,15000116,African American,Female,Expired,3/5/2015 9:52,3/5/2015 15:15,digoxin,67,0.224306
10,15000117,African American,Female,Expired,4/14/2015 19:43,4/15/2015 13:53,digoxin,64,0.756944
11,15000118,African American,Female,Expired,6/28/2015 15:14,6/29/2015 19:14,digoxin,83,1.166667
12,15000119,African American,Female,Expired,9/1/2015 20:28,9/2/2015 11:45,digoxin,78,0.636806
13,15000120,African American,Female,Expired,7/14/2015 0:05,7/14/2015 4:15,digoxin,70,0.173611
14,15000121,African American,Female,Expired,3/29/2015 17:48,4/5/2015 18:58,digoxin,55,7.048611
15,15000122,African American,Female,Expired,2/22/2015 10:46,2/24/2015 11:30,digoxin,57,2.030556
16,15000123,African American,Female,Expired,2/14/2015 15:09,2/17/2015 22:40,digoxin,78,3.313194
17,15000124,African American,Female,Expired,1/1/2015 8:35,1/5/2015 15:45,digoxin,82,4.298611


In [10]:
def check_field(df, field):
  """
  Print a quick profile of one column to help spot anomalies.

  Writes four sections to stdout: the number of missing values, the number
  of distinct values, the value counts, and the describe() summary, followed
  by a blank separator.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame that contains the column.
  field : hashable
      Column label. Any label type is accepted (for example an integer
      label), not only strings.

  Returns
  -------
  None

  Raises
  ------
  KeyError
      If `field` is not a column of `df`.
  """
  column = df[field]
  # f-strings format the label themselves, so non-string labels no longer
  # fail the way `field + "..."` concatenation did.
  print(f"{field} Missing values: {column.isna().sum()}")
  print(f"{field} Unique values: {column.nunique()}")
  print(f"{field} Value Counts: \n{column.value_counts()}")
  print(f"{field} Stats \n{column.describe()}")
  print('\n')



In [11]:
# Keep only rows whose Gender and Race are among the recognised categories
valid_genders = ['Male', 'Female']
valid_races = ['Caucasian', 'African American', 'Other', 'Asian',
               'Hispanic', 'Native American', 'Biracial', 'Pacific Islander']

gender_condition = afib_df['Gender'].isin(valid_genders)
race_condition = afib_df['Race'].isin(valid_races)

# A row must pass both filters to stay
afib_df = afib_df.loc[gender_condition & race_condition]

afib_df.head()

Unnamed: 0,Eid,Race,Gender,Discharge,AdTm,DscTm,Drug,Age,LOS
8,15000115,African American,Female,Expired,3/13/2015 21:35,4/7/2015 12:02,digoxin,74,24.602083
9,15000116,African American,Female,Expired,3/5/2015 9:52,3/5/2015 15:15,digoxin,67,0.224306
10,15000117,African American,Female,Expired,4/14/2015 19:43,4/15/2015 13:53,digoxin,64,0.756944
11,15000118,African American,Female,Expired,6/28/2015 15:14,6/29/2015 19:14,digoxin,83,1.166667
12,15000119,African American,Female,Expired,9/1/2015 20:28,9/2/2015 11:45,digoxin,78,0.636806


In [12]:
# Merge smaller race instances with similar types
# The recoding is scoped to the Race column: a frame-wide replace would also
# rewrite the text 'Asian' wherever it appears in any other column, and
# in-place edits on a filtered frame can trigger SettingWithCopyWarning.
race_recode = {
    'Biracial': 'Other',
    'Pacific Islander': 'Asian/PI',
    'Asian': 'Asian/PI',
}
afib_df = afib_df.assign(Race=afib_df['Race'].replace(race_recode))


In [13]:
afib_df.head()

Unnamed: 0,Eid,Race,Gender,Discharge,AdTm,DscTm,Drug,Age,LOS
8,15000115,African American,Female,Expired,3/13/2015 21:35,4/7/2015 12:02,digoxin,74,24.602083
9,15000116,African American,Female,Expired,3/5/2015 9:52,3/5/2015 15:15,digoxin,67,0.224306
10,15000117,African American,Female,Expired,4/14/2015 19:43,4/15/2015 13:53,digoxin,64,0.756944
11,15000118,African American,Female,Expired,6/28/2015 15:14,6/29/2015 19:14,digoxin,83,1.166667
12,15000119,African American,Female,Expired,9/1/2015 20:28,9/2/2015 11:45,digoxin,78,0.636806


In [None]:
check_field(afib_df, 'Race')

Race Missing values: 0
Race Unique values: 6
Race Value Counts: 
Race
Caucasian           2563
African American     271
Other                 84
Asian/PI              29
Hispanic              20
Native American       17
Name: count, dtype: int64
Race Stats 
count          2984
unique            6
top       Caucasian
freq           2563
Name: Race, dtype: object




In [None]:
check_field(afib_df, 'Gender')

Gender Missing values: 0
Gender Unique values: 2
Gender Value Counts: 
Gender
Female    1701
Male      1283
Name: count, dtype: int64
Gender Stats 
count       2984
unique         2
top       Female
freq        1701
Name: Gender, dtype: object




In [None]:
check_field(afib_df, 'Drug')

Drug Missing values: 0
Drug Unique values: 2
Drug Value Counts: 
Drug
diltiazem    2241
digoxin       743
Name: count, dtype: int64
Drug Stats 
count          2984
unique            2
top       diltiazem
freq           2241
Name: Drug, dtype: object




In [None]:
check_field(afib_df, 'Discharge')

Discharge Missing values: 0
Discharge Unique values: 18
Discharge Value Counts: 
Discharge
Discharged to home                                                                                           2685
Expired                                                                                                       119
Discharged/transferred to SNF                                                                                  54
Discharged/transferred to home with home health service                                                        42
Expired in a medical facility. Medicaid only, hospice.                                                         27
Expired, place unknown. Medicaid only, hospice.                                                                11
Discharged/transferred to another type of inpatient care institution                                           10
Discharged/transferred to another rehab fac including rehab units of a hospital .                              

In [None]:
# filter out all expired discharges out of the  afib_df dataframe
#living_df = afib_df[~afib_df['Discharge'].str.contains('expire', case=False)]

In [None]:
#check_field(living_df, 'Discharge')

In [None]:
# Combining lower discharge entry counts into to Other
#living_df.loc[living_df['Discharge'] == 'Hospice / medical facility', 'Discharge'] = 'Other'
#living_df.loc[living_df['Discharge'] == 'Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital', 'Discharge'] = 'Other'
#living_df.loc[living_df['Discharge'] == 'Discharged/transferred to a long term care hospital.', 'Discharge'] = 'Other'
#living_df.loc[living_df['Discharge'] == 'Hospice / home', 'Discharge'] = 'Other'
#living_df.loc[living_df['Discharge'] == 'Discharged/transferred to another Type of Health Care Institution not Defined Elsewhere', 'Discharge'] = 'Other'
#living_df.loc[living_df['Discharge'] == 'Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.', 'Discharge'] = 'Other'
#living_df.loc[living_df['Discharge'] == 'Discharged to Court/ Law Enforcement/Jail', 'Discharge'] = 'Other'
#living_df.loc[living_df['Discharge'] == 'Discharged/transferred to a Critical Access Hospital (CAH).', 'Discharge'] = 'Other'
#living_df.loc[living_df['Discharge'] == 'Discharged/transferred to home under care of Home IV provider', 'Discharge'] = 'Other'

#check_field(living_df, 'Discharge')

In [14]:
# Combining Discharge and Expired Types
# Any disposition whose text contains 'Expired' becomes 'Expired'; every other
# recorded disposition becomes 'Discharged'.
condition = afib_df['Discharge'].str.contains('Expired', regex=False)
# Mapping the mask (instead of two .loc writes) leaves a missing disposition
# missing rather than failing on a mask that contains NaN, and assign() avoids
# writing into a filtered slice.
afib_df = afib_df.assign(
    Discharge=condition.map({True: 'Expired', False: 'Discharged'})
)

check_field(afib_df, 'Discharge')

Discharge Missing values: 0
Discharge Unique values: 2
Discharge Value Counts: 
Discharge
Discharged    2822
Expired        162
Name: count, dtype: int64
Discharge Stats 
count           2984
unique             2
top       Discharged
freq            2822
Name: Discharge, dtype: object




In [15]:
check_field(afib_df, 'Race')

Race Missing values: 0
Race Unique values: 6
Race Value Counts: 
Race
Caucasian           2563
African American     271
Other                 84
Asian/PI              29
Hispanic              20
Native American       17
Name: count, dtype: int64
Race Stats 
count          2984
unique            6
top       Caucasian
freq           2563
Name: Race, dtype: object




In [16]:
check_field(afib_df, 'Gender')

Gender Missing values: 0
Gender Unique values: 2
Gender Value Counts: 
Gender
Female    1701
Male      1283
Name: count, dtype: int64
Gender Stats 
count       2984
unique         2
top       Female
freq        1701
Name: Gender, dtype: object




In [17]:
check_field(afib_df, 'Drug')

Drug Missing values: 0
Drug Unique values: 2
Drug Value Counts: 
Drug
diltiazem    2241
digoxin       743
Name: count, dtype: int64
Drug Stats 
count          2984
unique            2
top       diltiazem
freq           2241
Name: Drug, dtype: object




In [18]:
check_field(afib_df, 'Discharge')

Discharge Missing values: 0
Discharge Unique values: 2
Discharge Value Counts: 
Discharge
Discharged    2822
Expired        162
Name: count, dtype: int64
Discharge Stats 
count           2984
unique             2
top       Discharged
freq            2822
Name: Discharge, dtype: object




In [19]:
#Find P-Number by Race
# Chi-square test of independence between Race and Drug.
# NOTE(review): some race groups are small; inspect `expected` for cells below 5
# before relying on the chi-square approximation.
ct = pd.crosstab(afib_df['Race'], afib_df['Drug'])
chi2, p, dof, expected = stats.chi2_contingency(ct)
display(ct)

# Kept numeric (rounded to 3 places) because the race table cell reuses it
p_number_by_race = round(p, 3)

# A p-value is never exactly zero, so anything below the 3-decimal display
# resolution is reported as "<0.001" instead of "0.000".
p_label = "<0.001" if p < 0.001 else f"{p_number_by_race:.3f}"
print(f"ALL: p-value: {p_label}")

Drug,digoxin,diltiazem
Race,Unnamed: 1_level_1,Unnamed: 2_level_1
African American,134,137
Asian/PI,11,18
Caucasian,550,2013
Hispanic,7,13
Native American,3,14
Other,38,46


ALL: p-value: 0.000


In [20]:
#Find P-Number by Gender
# Gender x Drug contingency table, then a chi-square test of independence
ct = pd.crosstab(afib_df['Gender'], afib_df['Drug'])
chi2, p, dof, expected = stats.chi2_contingency(ct)
display(ct)

# Round once; the gender table cell reuses this value
p_number_by_gender = round(p, 3)
print("ALL: p-value: " + format(p_number_by_gender, ".3f"))

Drug,digoxin,diltiazem
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,456,1245
Male,287,996


ALL: p-value: 0.006


In [23]:
#Find P-Number by Age
# Age of each treatment arm, selected straight from the Drug column
digoxin_age = afib_df.loc[afib_df['Drug'] == 'digoxin', 'Age']
diltiazem_age = afib_df.loc[afib_df['Drug'] == 'diltiazem', 'Age']

# equal_var=False: the two arms are not assumed to share a variance
t_stat, p_value = stats.ttest_ind(digoxin_age, diltiazem_age, equal_var=False)

p_number_by_age = round(p_value, 3)
print("p-value: " + format(p_number_by_age, ".3f"))

p-value: 0.015


In [24]:
# Find P-Number by Length of Stay
# Split the cohort by drug (a = digoxin rows, b = diltiazem rows)
a, b = (afib_df.loc[afib_df['Drug'] == drug] for drug in ('digoxin', 'diltiazem'))

# equal_var=False: the two arms are not assumed to share a variance
t_stat, p_value = stats.ttest_ind(a['LOS'], b['LOS'], equal_var=False)

p_number_by_los = round(p_value, 3)
print("p-value: %.3f" % p_number_by_los)

p-value: 0.322


In [25]:
#Get Overall Counts
# Number of non-null Eid values in the cleaned frame, echoed as a sanity check
all_counts = afib_df['Eid'].count()
display(all_counts)

# Rows per treatment arm
digoxin_counts = len(afib_df.loc[afib_df['Drug'] == 'digoxin'])
diltiazem_counts = len(afib_df.loc[afib_df['Drug'] == 'diltiazem'])

# The "n" row carries no missing-value count, so that cell stays blank
missing_overall_cnt = ""

# Seven-column layout used by every section table in this notebook
columns = [' ', '', 'Missing', 'Overall', 'digoxin', 'diltiazem', 'P-Value']
data = [['n', '', missing_overall_cnt, all_counts, digoxin_counts, diltiazem_counts, '']]
combined_cnts_df = pd.DataFrame(data, columns=columns)

# Normalise the row index and clear any name on the column axis
combined_cnts_df = combined_cnts_df.reset_index(drop=True)
combined_cnts_df.columns.name = None

# Display without index
display(combined_cnts_df.style.hide(axis="index"))

np.int64(2984)

Unnamed: 0,Unnamed: 1,Missing,Overall,digoxin,diltiazem,P-Value
n,,,2984,743,2241,


In [26]:

# Compile Stats by Race
from IPython.display import display, HTML

# Overall count and share of each race, formatted as "n (pct%)"
race_vc = afib_df['Race'].value_counts().sort_index()
race_pct = afib_df['Race'].value_counts(normalize=True).sort_index() * 100
race_pct = race_pct.round(1)
overall_race_value = race_vc.astype(str) + ' (' + race_pct.astype(str) + '%)'

# Get Missing Counts by Race
missing_race_value = afib_df['Race'].isna().sum()

# Count and within-drug share for every (Drug, Race) pair
race_vc_by_drug = afib_df.groupby(['Drug', 'Race']).size()
race_pct_by_drug = afib_df.groupby('Drug')['Race'].value_counts(normalize=True).sort_index() * 100
race_pct_by_drug = race_pct_by_drug.round(1)
race_by_drug_values = race_vc_by_drug.astype(str) + ' (' + race_pct_by_drug.astype(str) + '%)'

# Race on rows, Drug on columns. A race that never occurs under one drug has no
# (Drug, Race) entry and would surface as NaN, so show it as an explicit zero.
combined_race_df = race_by_drug_values.unstack(level=0).fillna('0 (0.0%)')

# Prepend the overall and missing-value columns (both align on the Race index)
combined_race_df.insert(0, 'Overall', overall_race_value)
combined_race_df.insert(0, 'Missing', missing_race_value)

# Reset index so Race becomes a column
combined_race_df = combined_race_df.reset_index()

# Layout columns: ' ' holds the section label (first row only) and '' holds
# the race names.
combined_race_df.insert(0, ' ', '')
combined_race_df.loc[0, ' '] = 'Race, n(%)'
combined_race_df = combined_race_df.rename(columns={'Race': ''})

# Append the P-Value column last instead of at a hard-coded position, so the
# cell does not break if the number of drug columns changes.
combined_race_df['P-Value'] = ''
# p_number_by_race is already rounded to 3 places; a value that rounds to 0 is
# reported as "<0.001" because a p-value is never exactly zero.
combined_race_df.loc[0, 'P-Value'] = (
    '<0.001' if p_number_by_race < 0.001 else p_number_by_race
)

combined_race_df = combined_race_df.reset_index(drop=True)
combined_race_df.columns.name = None

# Display without index
display(HTML(combined_race_df.to_html(index=False)))

Unnamed: 0,Unnamed: 1,Missing,Overall,digoxin,diltiazem,P-Value
"Race, n(%)",African American,0,271 (9.1%),134 (18.0%),137 (6.1%),0.0
,Asian/PI,0,29 (1.0%),11 (1.5%),18 (0.8%),
,Caucasian,0,2563 (85.9%),550 (74.0%),2013 (89.8%),
,Hispanic,0,20 (0.7%),7 (0.9%),13 (0.6%),
,Native American,0,17 (0.6%),3 (0.4%),14 (0.6%),
,Other,0,84 (2.8%),38 (5.1%),46 (2.1%),


In [27]:
# Get Stats by Gender
from IPython.display import display, HTML

# Overall count and share of each gender, formatted as "n (pct%)"
gender_vc = afib_df['Gender'].value_counts().sort_index()
gender_pct = (afib_df['Gender'].value_counts(normalize=True).sort_index() * 100).round(1)
overall_gender_value = gender_vc.astype(str) + ' (' + gender_pct.astype(str) + '%)'

# Number of rows with no recorded gender
missing_gender_value = afib_df['Gender'].isna().sum()

# Count and within-drug share for every (Drug, Gender) pair
gender_vc_by_drug = afib_df.groupby(['Drug', 'Gender']).size()
gender_pct_by_drug = (
    afib_df.groupby('Drug')['Gender'].value_counts(normalize=True).sort_index() * 100
).round(1)
gender_by_drug_values = gender_vc_by_drug.astype(str) + ' (' + gender_pct_by_drug.astype(str) + '%)'

# Gender on rows, Drug on columns; then prepend Overall and Missing
combined_gender_df = gender_by_drug_values.unstack(level=0)
combined_gender_df.insert(0, 'Overall', overall_gender_value)
combined_gender_df.insert(0, 'Missing', missing_gender_value)
combined_gender_df = combined_gender_df.reset_index()

# Layout columns: the gender names move into the unnamed '' column, and ' '
# holds the section label on the first row only.
combined_gender_df = combined_gender_df.rename(columns={'Gender': ''})
combined_gender_df.insert(0, ' ', '')
combined_gender_df.loc[0, ' '] = 'Gender, n(%)'

# P-value is shown once, on the first row of the section
combined_gender_df.insert(6, 'P-Value', '')
combined_gender_df.loc[0, 'P-Value'] = round(p_number_by_gender, 3)

combined_gender_df = combined_gender_df.reset_index(drop=True)
combined_gender_df.columns.name = None

# Display without index
display(HTML(combined_gender_df.to_html(index=False)))


Unnamed: 0,Unnamed: 1,Missing,Overall,digoxin,diltiazem,P-Value
"Gender, n(%)",Female,0,1701 (57.0%),456 (61.4%),1245 (55.6%),0.006
,Male,0,1283 (43.0%),287 (38.6%),996 (44.4%),


In [29]:
# Get Stats by Age
# Overall mean and standard deviation, one decimal place, shown as "mean (SD)"
age_mean = afib_df['Age'].mean().round(1)
age_std_value = afib_df['Age'].std()
age_std = round(age_std_value, 1)
overall_age_value = f"{age_mean} ({age_std})"

# Number of rows with no recorded age
missing_age_value = afib_df['Age'].isna().sum()

# Same statistics per drug, computed from a single grouping
age_grouped = afib_df.groupby('Drug')['Age']
mean_age_by_drug = age_grouped.mean().round(1)
std_age_by_drug = age_grouped.std().round(1)
age_by_drug_values = mean_age_by_drug.astype(str) + ' (' + std_age_by_drug.astype(str) + ')'

digoxin_age_value = age_by_drug_values['digoxin']
diltiazem_age_value = age_by_drug_values['diltiazem']

# One-row frame in the shared seven-column layout
columns = [' ', '', 'Missing', 'Overall', 'digoxin', 'diltiazem', 'P-Value']
data = [[
    'Age, mean (SD)', '', missing_age_value, overall_age_value,
    digoxin_age_value, diltiazem_age_value, p_number_by_age,
]]
combined_age_df = pd.DataFrame(data, columns=columns)

combined_age_df = combined_age_df.reset_index(drop=True)
combined_age_df.columns.name = None

# Display without index
display(combined_age_df.style.hide(axis="index"))

Unnamed: 0,Unnamed: 1,Missing,Overall,digoxin,diltiazem,P-Value
"Age, mean (SD)",,0,70.8 (13.5),71.8 (12.6),70.5 (13.8),0.015


In [30]:
#Get Stats by Length of Stay
# Overall mean and standard deviation, one decimal place, shown as "mean (SD)"
los_mean = afib_df['LOS'].mean().round(1)
los_std_value = afib_df['LOS'].std()
los_std = round(los_std_value, 1)
overall_los_value = f"{los_mean} ({los_std})"

# Number of rows with no computed length of stay
missing_los_value = afib_df['LOS'].isna().sum()

# Same statistics per drug, computed from a single grouping
los_grouped = afib_df.groupby('Drug')['LOS']
mean_los_by_drug = los_grouped.mean().round(1)
std_los_by_drug = los_grouped.std().round(1)
los_by_drug_values = mean_los_by_drug.astype(str) + ' (' + std_los_by_drug.astype(str) + ')'

digoxin_los_value = los_by_drug_values['digoxin']
diltiazem_los_value = los_by_drug_values['diltiazem']

# One-row frame in the shared seven-column layout; the p-value comes from the
# LOS t-test cell
columns = [' ', '', 'Missing', 'Overall', 'digoxin', 'diltiazem', 'P-Value']
data = [[
    'LOS, mean (SD)', '', missing_los_value, overall_los_value,
    digoxin_los_value, diltiazem_los_value, p_number_by_los,
]]
combined_los_df = pd.DataFrame(data, columns=columns)

combined_los_df = combined_los_df.reset_index(drop=True)
combined_los_df.columns.name = None

# Display without index
display(combined_los_df.style.hide(axis="index"))

Unnamed: 0,Unnamed: 1,Missing,Overall,digoxin,diltiazem,P-Value
"LOS, mean (SD)",,0,6.7 (11.3),7.0 (9.7),6.6 (11.8),0.322


In [None]:
# Q1: Create the following table (namely TableOne) to describe the demographic distribution of patients taking digoxin vs diltiazem
#
# Stack the section tables top to bottom: Overall, Race, Gender, Age, LOS
table_one_sections = [combined_cnts_df, combined_race_df, combined_gender_df, combined_age_df, combined_los_df]
df_vertical = pd.concat(table_one_sections, ignore_index=True)

pd.set_option('display.precision', 3)
# Styler takes its float precision from its own setting, so state the precision
# on the Styler too rather than relying only on the DataFrame display option.
display(df_vertical.style.hide(axis="index").format(precision=3))

Unnamed: 0,Unnamed: 1,Missing,Overall,digoxin,diltiazem,P-Value
n,,,2984,743,2241,
"Race, n(%)",African American,0.0,271 (9.1%),134 (18.0%),137 (6.1%),0.0
,Asian/PI,0.0,29 (1.0%),11 (1.5%),18 (0.8%),
,Caucasian,0.0,2563 (85.9%),550 (74.0%),2013 (89.8%),
,Hispanic,0.0,20 (0.7%),7 (0.9%),13 (0.6%),
,Native American,0.0,17 (0.6%),3 (0.4%),14 (0.6%),
,Other,0.0,84 (2.8%),38 (5.1%),46 (2.1%),
"Gender, n(%)",Female,0.0,1701 (57.0%),456 (61.4%),1245 (55.6%),0.006
,Male,0.0,1283 (43.0%),287 (38.6%),996 (44.4%),
"Age, mean (SD)",,0.0,70.8 (13.5),71.8 (12.6),70.5 (13.8),0.015


# Q2:
# Question: What is group matching?

Answers: "Group matching" is a research methodology used to create comparable groups of patients to reduce bias when comparing treatments.

Group matching is a process where researchers deliberately pair or select patients in different treatment groups (e.g., digoxin vs. diltiazem) so that they have similar baseline characteristics such as age, sex, race, and other relevant health factors.
The primary goal is to ensure that any differences in outcomes later observed are due to the drug treatment itself, not pre-existing differences between the patients in each group. When groups are successfully matched, the P-values for baseline characteristics in the TableOne would typically be non-significant (e.g., P > 0.05), indicating balance.
The TableOne shows significant differences between the digoxin and diltiazem groups for several key demographic variables, as indicated by the low P-values:
Race: P-value < 0.001 (groups are significantly different in racial distribution)
Gender: P-value = 0.006 (groups are significantly different in gender distribution)
Age: P-value = 0.015 (mean ages are statistically different)
The presence of these significant P-values indicates that the patients were simply "grouped" by the drug they received (likely in an observational study or an unmatched analysis), not "matched" to ensure comparable characteristics.

# Are the two groups well matched between digoxin and diltiazem ?

Answers: No, the two groups of patients (those taking digoxin versus those taking diltiazem) are not well-matched in demographics.
From the P-values shown in the TableOne, which show statistically significant differences between the two groups for key demographic variables:
Race (P < 0.001): The racial distribution is significantly different between the groups. The digoxin group has a much higher percentage of African American patients (18.0% vs. 6.1%), while the diltiazem group has a higher percentage of Caucasian patients (89.8% vs. 74.0%).
Gender (P = 0.006): The gender distribution is significantly different. The digoxin group has a higher proportion of females (61.4% vs. 55.6%).
Age (P = 0.015): The mean age is statistically different between the groups (digoxin: 71.8 years vs. diltiazem: 70.5 years).

A P-value less than 0.05 generally indicates that the observed difference is unlikely to be due to chance alone. Any difference in health outcomes between the digoxin and diltiazem groups might be due to these demographic differences (e.g., age, race, gender) rather than the effect of the medication itself.


# Q3:
# Question: Why isn't our initial analysis solid?
Answers: Because the dataset wasn't well matched in demographics.

# Question: How to address it?
Ideas: Randomize the data within each demographic category?  Utilize a model for
estimating the P-Value by accounting for demographic differences?  Grouping?  Matching?  Weighting?

In [33]:
# Q3 (CONTINUED)
#
# LOGIT Function Suggested by ChatGPT = "Logistic Regression MODEL" for a binary outcome.
# Here the outcome is discharged (1) vs. expired (0), modelled on Drug while
# adjusting for Age, Gender and Race.
import statsmodels.formula.api as smf

# Encode 'Discharge' column numerically. A vectorised comparison replaces the
# row-wise lambda, and assign() returns a new frame instead of adding a column
# to a filtered slice (which can trigger SettingWithCopyWarning).
afib_df = afib_df.assign(
    Discharge_encoded=(afib_df['Discharge'] == 'Discharged').astype(int)
)

model = smf.logit('Discharge_encoded ~ Drug + Age + Gender + Race', data=afib_df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.127600
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:      Discharge_encoded   No. Observations:                 2984
Model:                          Logit   Df Residuals:                     2975
Method:                           MLE   Df Model:                            8
Date:                Sun, 02 Nov 2025   Pseudo R-squ.:                  0.3951
Time:                        17:58:16   Log-Likelihood:                -380.76
converged:                       True   LL-Null:                       -629.50
Covariance Type:            nonrobust   LLR p-value:                2.449e-102
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                   0.7976      0.620      1.286      0.198      -0.418   