In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
from utils import (
    process_features,
    get_significant_variables,
    plot_feature_weights_horizontal_sm,
)
import statsmodels.api as sm

In [70]:
# NOTE(review): absolute, user-specific path -- this cell only runs on a
# machine where this directory exists. Consider a repo-relative path.
DATA_PATH = Path("/Users/jessbreda/Desktop/github/ca-sdoh/data")

# Columns loaded from obt_slimmer.csv: the condition label, the two counts used
# below to derive the readmission rate, and the beneficiary-level covariates.
# Each column is listed exactly once.
# NOTE(review): "pct_beneficiaries_arthritis" used to appear twice in this
# list -- confirm that no other column was intended in the second slot.
cols_of_interest = [
    "condition_name",
    "number_of_readmissions",
    "number_of_discharges",
    "beneficiary_avg_age",
    "pct_beneficiaries_female",
    "pct_beneficiaries_arthritis",
    "pct_beneficiaries_alzheimers",
    "pct_beneficiaries_congestive_heart_failure",
    "pct_beneficiaries_ischemic_heart_disease",
    "pct_beneficiaries_copd",
    "pct_beneficiaries_depression",
    "pct_beneficiaries_diabetes",
    "pct_beneficiaries_psychotic_disorders",
    "pct_beneficiaries_stroke",
    "pct_beneficiaries_white",
    "pct_beneficiaries_black",
    "pct_beneficiaries_hispanic",
    "pct_beneficiaries_medicare_medicaid",
    "pct_beneficiaries_osteoporosis",
    "pct_beneficiaries_chronic_kidney_disease",
    "pct_beneficiaries_cancer",
]

# Read only the columns of interest from the table.
df = pd.read_csv(
    DATA_PATH / "obt_slimmer.csv",
    usecols=cols_of_interest,
)

### Drop rows that have a missing value in any loaded column
df = df.dropna()

### Remove 'beneficiaries_' from col names (literal substring, not a regex)
df.columns = df.columns.str.replace("beneficiaries_", "", regex=False)

### Calculate new columns
# A row with zero discharges would produce an inf (or NaN for 0/0) rate, and
# because the NaN drop above has already run, that value would be written
# straight into the design matrices. Keep only rows with a positive count.
df = df[df["number_of_discharges"] > 0].copy()

# Readmissions per 100 discharges.
df["readmission_rate"] = (
    df["number_of_readmissions"] / df["number_of_discharges"]
) * 100

# NOTE(review): this assumes pct_white is stored as a fraction in [0, 1].
# If the source column is on a 0-100 scale this should be 100 - pct_white.
# TODO confirm against the data dictionary.
df["pct_non_white"] = 1 - df["pct_white"]


### Condition Query
# Short key -> exact value of `condition_name` in the data.
CONDITION_NAMES = {
    "hf": "Heart Failure",
    "copd": "Chronic Obstructive Pulmonary Disease",
    "pn": "Pneumonia",
}

condition = "pn"  ###### ENTER NAME HERE #####

# Fail loudly on an unknown key: otherwise no filter is applied and the pooled
# rows for every condition are written out under the `{condition}_` prefix.
if condition not in CONDITION_NAMES:
    raise ValueError(
        f"Unknown condition {condition!r}; expected one of {sorted(CONDITION_NAMES)}"
    )
df = df[df["condition_name"] == CONDITION_NAMES[condition]]

### Drop extra columns
# These were only needed to build readmission_rate / pct_non_white and to
# select the condition. A non-inplace drop avoids mutating a filtered frame.
df = df.drop(
    columns=[
        "number_of_readmissions",
        "number_of_discharges",
        "pct_white",
        "condition_name",
    ]
)

### Transform Data
# Names of columns to log10-transform. The list is currently empty, so no
# transform is applied. Candidates that were tried and are switched off:
#   pct_psychotic_disorders, pct_black, pct_hispanic,
#   pct_non_white, pct_medicare_medicaid
log_transform = []

# Disabled: apply the transform to every listed column present in df.
# for col in df.columns:
#     if col in log_transform:
#         df[col] = np.log10(df[col])

### SAVE OUT DMS
# NOTE(review): absolute, user-specific path -- consider a repo-relative one.
DM_PATH = Path("/Users/jessbreda/Desktop/github/ca-sdoh/code/jess/design_matrices")
# Make sure the output directory exists before the first write; to_csv does
# not create missing parent directories.
DM_PATH.mkdir(parents=True, exist_ok=True)

# y: the response, re-indexed from 0 so it lines up row-for-row with X
y = df["readmission_rate"].reset_index(drop=True)
y.to_csv(DM_PATH / f"{condition}_y.csv", index=False)

# X: every remaining column, re-indexed the same way
X = df.drop(columns=["readmission_rate"]).reset_index(drop=True)

# Three predictor sets that differ only in which race columns they keep:
#   race_1 -> pct_non_white only
#   race_2 -> pct_black and pct_hispanic
#   base   -> no race columns
X_race_1 = X.drop(columns=["pct_black", "pct_hispanic"])
X_race_2 = X.drop(columns=["pct_non_white"])
X_base = X.drop(columns=["pct_non_white", "pct_black", "pct_hispanic"])

X_race_1.to_csv(DM_PATH / f"{condition}_X_race1.csv", index=False)
X_race_2.to_csv(DM_PATH / f"{condition}_X_race2.csv", index=False)
X_base.to_csv(DM_PATH / f"{condition}_X_base.csv", index=False)


### 90th percentile
# Subset to rows whose readmission rate is strictly above the 90th percentile
# and export that subset's y / X. The same steps are repeated in the
# "90th percentile" cells further down the notebook.

df = df.reset_index(drop=True)
percentile_90 = df["readmission_rate"].quantile(0.9)
df_90 = df.loc[df["readmission_rate"] > percentile_90].reset_index(drop=True)

# y
y = df_90["readmission_rate"].reset_index(drop=True)
y.to_csv(DM_PATH / f"{condition}_y_90.csv", index=False)

# X (pct_non_white is the only race column kept)
X = df_90.drop(columns=["readmission_rate"]).reset_index(drop=True)
X_race_90 = X.drop(columns=["pct_black", "pct_hispanic"])
X_race_90.to_csv(DM_PATH / f"{condition}_X_race_90.csv", index=False)

In [25]:
### READ IN
# Reload the design matrices written to DM_PATH for the selected condition.

# y is a one-column CSV. Squeeze along the column axis only, so the result is
# always a Series; a bare .squeeze() would collapse a one-row file to a scalar.
y = pd.read_csv(DM_PATH / f"{condition}_y.csv", index_col=False)
y = y.squeeze("columns")

X_race_1 = pd.read_csv(DM_PATH / f"{condition}_X_race1.csv")
X_race_2 = pd.read_csv(DM_PATH / f"{condition}_X_race2.csv")
# NOTE(review): this matrix is written out from a variable called `X_base`,
# but is read back as `X_base_2`. The name is kept in case later cells rely on
# it -- confirm which name is intended.
X_base_2 = pd.read_csv(DM_PATH / f"{condition}_X_base.csv")

## 90th percentile

In [52]:
# Rows whose readmission rate is strictly above the 90th percentile of df.
df = df.reset_index(drop=True)
percentile_90 = df["readmission_rate"].quantile(0.9)
df_90 = df.loc[df["readmission_rate"] > percentile_90].reset_index(drop=True)

In [62]:
# y: response for the above-90th-percentile subset
y = df_90["readmission_rate"].reset_index(drop=True)
y.to_csv(DM_PATH / f"{condition}_y_90.csv", index=False)

# X: predictors for the same subset; pct_non_white is the only race column kept
X = df_90.drop(columns=["readmission_rate"]).reset_index(drop=True)
X_race_90 = X.drop(columns=["pct_black", "pct_hispanic"])

X_race_90.to_csv(DM_PATH / f"{condition}_X_race_90.csv", index=False)