In [436]:
import pandas as pd

def load_csv(url: str) -> pd.DataFrame:
    """
    Read a comma-separated file into a DataFrame.

    Parameters:
    - url (str): Location of the CSV; handed straight to `pd.read_csv`.

    Returns:
    - pd.DataFrame: The parsed contents of the file.
    """
    frame = pd.read_csv(filepath_or_buffer=url, sep=",")
    return frame

# Name of the raw CSV and its location relative to this notebook.
FILENAME = "Drug_Use_Data_from_Selected_Hospitals.csv"
filepath = "../data/" + FILENAME

In [437]:
# Load the raw hospital drug-use CSV; every later cell derives from this frame.
drug_use_df = load_csv(url=filepath)

The first ~80 rows contain a patient count that we could use to extrapolate counts for each condition. However, that involves a bit of math, and I think we are better off analyzing the ratios (percentages) directly.

Consider these two rows:
| sex   | Start Time | End Time   | setting | all drugs |
| ------| -----------| ---------- | ------- | ----------|
| female| 01/01/2020 | 01/31/2020 | ip      | 23.498389 |
| male  | 01/01/2020 | 01/31/2020 | ip      | 30.455556 |


Notice that the percentages don't add up to 100%. This is because each row reads: given all males admitted to the hospital as inpatients during this time period, 30.4556% were admitted for the condition `all drugs`.

In the end, I created 2 dataframes.
- 1st with data grouped by sex categories: drug_sex_df
- 2nd with data grouped by age categories: drug_age_df

We can do EDA on each of them.

### Create and cleanup base data

In [423]:
# Lowercase the column names so every later cell can use a single spelling.
drug_use_df.columns = drug_use_df.columns.str.lower()

# Lowercase the text columns too; later lookups compare against lowercased names.
COLUMNS_TO_CONVERT_TO_LOWERCASE = ['setting', 'indicator', 'group', 'subgroup', 'measure']
for col in COLUMNS_TO_CONVERT_TO_LOWERCASE:
    drug_use_df[col] = drug_use_df[col].str.lower()

# Number of leading rows that make up the base data.
BASE_ROW_COUNT = 82

# Kept only so any other cell that references this name keeps working.
# NOTE(review): 'values' does not match the 'value' column shown in the outputs
# below, so passing this list to drop() would raise a KeyError -- confirm whether
# this constant is still needed.
COLUMNS_TO_DROP_NO_VALUES = ['figure', 'indicator', 'group', 'subgroup', 'measure', 'values']

# Descriptive columns that the base rows do not need. 'value' is deliberately
# kept: the reshaped frames displayed further down still carry it.
# This name was previously used without being defined in this notebook, which
# raises a NameError on a fresh kernel.
COLUMNS_TO_DROP = ['figure', 'indicator', 'group', 'subgroup', 'measure']

# Grab the base data and remove the extra columns in one step, producing a new frame.
base_drug_df = drug_use_df.iloc[:BASE_ROW_COUNT].drop(columns=COLUMNS_TO_DROP)

### Helper functions and constants

In [424]:
# Condition names (as they appear in the rows) that get turned into columns.
BASE_DRUGS = [
    'All Drugs',
    'All Opioids',
    'Stimulants',
    'Cannabis',
    'Benzodiazepine',
]
BASE_DRUGS_CO_OCCURING = [
    'All Drugs and co-occurring disorders',
    'All Opioids and co-occurring disorders',
    'Stimulants and co-occurring disorders',
    'Cannabis and co-occurring disorders',
    'Benzodiazepine and co-occurring disorders',
]
BASE_DRUGS_COVID = [
    'All Drugs and COVID-19',
    'All Opioids and COVID-19',
    'Stimulants and COVID-19',
    'Cannabis and COVID-19',
    'Benzodiazepine and COVID-19',
]
# Fentanyl overdose only exists for some rows, so it is left out of this list.
BASE_DRUGS_OVERDOSE = [
    'All Opioids overdose',
    'Stimulants overdose',
    'Cannabis overdose',
    'Benzodiazepine overdose',
    'Heroin overdose',
]

# Every group of new columns, in the order the columns are added.
LIST_OF_NEW_DRUG_COLUMN_LISTS = [
    BASE_DRUGS,
    BASE_DRUGS_CO_OCCURING,
    BASE_DRUGS_COVID,
    BASE_DRUGS_OVERDOSE,
]

In [412]:
def reshape_df_with_core_columns(df: pd.DataFrame, values: list, new_column: str) -> pd.DataFrame:
    """
    Repeat every row of a DataFrame once per value and label the copies in a new column.

    The input frame is not modified. The result has `new_column` as its first
    column and 'setting' as its last, is sorted by 'time' and then `new_column`,
    and carries a fresh 0..n-1 index.

    Parameters:
    - df (pd.DataFrame): The input DataFrame. It must contain 'time' and 'setting' columns.
    - values (list): The values to populate the new column; each input row is repeated once per value.
    - new_column (str): The name of the new column to be added.

    Returns:
    - pd.DataFrame: The reshaped DataFrame with repeated rows and the new column.

    Raises:
    - ValueError: If `df` has no 'setting' column.
    - KeyError: If `df` has no 'time' column.

    Example:
    Given a DataFrame:
       time setting
    0     1      ip
    1     2      ip

    Calling reshape_df_with_core_columns(df, ['male', 'female'], 'sex') will result in:
          sex  time setting
    0  female     1      ip
    1    male     1      ip
    2  female     2      ip
    3    male     2      ip
    """
    # Accept any sequence; multiplying a plain list below tiles it rather than scaling it.
    values = list(values)

    # Repeat rows by position, not by label: a label-based lookup returns every row
    # sharing that label, so a frame with a non-unique index would come back with
    # too many rows and the assignment below would fail on a length mismatch.
    row_positions = [position for position in range(len(df)) for _ in values]
    df_repeated = df.iloc[row_positions].reset_index(drop=True)
    df_repeated[new_column] = values * len(df)

    # Reorder columns for clarity
    columns = df_repeated.columns.tolist()
    # Moving Setting to end
    columns.remove('setting')
    columns.append('setting')

    # Moving newly added column to front
    columns.remove(new_column)
    columns.insert(0, new_column)
    df_repeated = df_repeated[columns]

    # Sort data and drop previous index
    df_repeated = df_repeated.sort_values(by=['time', new_column]).reset_index(drop=True)
    return df_repeated

In [425]:
# Row-wise helper that looks up the corresponding value in the reference rows
def transform_with_lookup_value(row, reference_df: pd.DataFrame, new_column: str, core_column: str):
    """
    Find the reference value that belongs to one row of the reshaped frame.

    Parameters:
    - row (pd.Series): The row needing a value; its 'time', 'setting' and `core_column` entries are read.
    - reference_df (pd.DataFrame): Frame searched for a match; its 'time', 'subgroup', 'setting',
      'indicator' and 'value' columns are used.
    - new_column (str): The indicator name that the reference row's 'indicator' must equal.
    - core_column (str): The column of `row` whose entry must equal the reference row's 'subgroup'.

    Returns:
    - value: The 'value' entry of the first matching reference row, or None when nothing matches.

    Notes:
    A reference row matches only when all four of 'time', 'subgroup', 'setting' and 'indicator' agree.
    """
    same_time = reference_df['time'] == row['time']
    same_subgroup = reference_df['subgroup'] == row[core_column]
    same_setting = reference_df['setting'] == row['setting']
    same_indicator = reference_df['indicator'] == new_column

    candidates = reference_df[same_time & same_subgroup & same_setting & same_indicator]
    if candidates.empty:
        return None
    return candidates['value'].iloc[0]

In [426]:
def enrich_with_additional_columns(df: pd.DataFrame, core_column: str) -> pd.DataFrame:
    """
    Add one column per drug indicator to `df`, filled from the global `drug_use_df`.

    The reference rows are those of `drug_use_df` whose 'group' equals `core_column`.
    For every name in every list of `LIST_OF_NEW_DRUG_COLUMN_LISTS` (lowercased), a
    column of that name is written to `df`, with each row's entry obtained from
    `transform_with_lookup_value`.

    Parameters:
    - df (pd.DataFrame): The frame to enrich. It is modified in place.
    - core_column (str): Selects the reference rows by 'group' and names the column of
      `df` that is matched against the reference 'subgroup'.

    Returns:
    - pd.DataFrame: The same `df` object, now carrying the additional columns.

    Notes:
    - Depends on the module-level names `drug_use_df`, `LIST_OF_NEW_DRUG_COLUMN_LISTS`
      and `transform_with_lookup_value`.
    """
    # Reference rows belonging to the requested group
    group_rows = drug_use_df[drug_use_df['group'] == core_column]

    for column_list in LIST_OF_NEW_DRUG_COLUMN_LISTS:
        # Indicator names are matched in lowercase
        lowered_names = [name.lower() for name in column_list]
        for indicator in lowered_names:
            df[indicator] = df.apply(
                transform_with_lookup_value,
                axis=1,
                args=(group_rows, indicator, core_column),
            )
    return df

## Drug Data by Sex

Organize and group the data by sex, leaving out age.

In [427]:
# Subgroup labels for the sex breakdown and the name of the column that holds them.
SEX_VALUES = ['male', 'female']
CORE_COLUMN_NAME = 'sex'

# One copy of every base row per sex value.
drug_sex_df = reshape_df_with_core_columns(
    df=base_drug_df,
    values=SEX_VALUES,
    new_column=CORE_COLUMN_NAME,
)

In [428]:
# Name the group explicitly instead of reading the shared CORE_COLUMN_NAME: the age
# section reassigns that variable to 'age', so re-running this cell afterwards would
# look up the wrong group and fill every new column with None.
drug_sex_df = enrich_with_additional_columns(drug_sex_df, 'sex')

In [429]:
# Preview the first rows of the sex-grouped frame.
drug_sex_df.head()

Unnamed: 0,sex,time,start_time,end_time,value,setting,all drugs,all opioids,stimulants,cannabis,...,all drugs and covid-19,all opioids and covid-19,stimulants and covid-19,cannabis and covid-19,benzodiazepine and covid-19,all opioids overdose,stimulants overdose,cannabis overdose,benzodiazepine overdose,heroin overdose
0,female,1,01/01/2020,01/31/2020,38478.0,ip,23.498389,2.846958,1.123157,1.479637,...,0.083126,0.0,0.434783,0.0,0.0,15.053763,8.602151,0.0,10.215054,5.913978
1,female,1,01/01/2020,01/31/2020,124275.0,ed,28.173838,1.147053,0.867393,1.668985,...,0.010616,0.0,0.0,0.089606,0.0,35.730337,21.797753,14.382022,10.337079,6.966292
2,male,1,01/01/2020,01/31/2020,38478.0,ip,30.455556,4.322222,2.983333,2.477778,...,0.145932,0.257069,0.18622,0.224215,0.0,20.212766,25.531915,3.191489,13.297872,7.446809
3,male,1,01/01/2020,01/31/2020,124275.0,ed,31.993799,2.27146,2.057205,2.858487,...,0.05989,0.076687,0.0,0.304692,0.0,46.199702,26.378539,16.244411,12.66766,8.494784
4,female,2,02/01/2020,02/29/2020,35754.0,ip,24.153663,3.26611,1.223495,1.451605,...,0.042928,0.0,0.0,0.0,0.0,15.116279,8.139535,1.162791,11.046512,4.069767


## Drug Data by Age

Organize and group the data by age, leaving out sex.

In [430]:
# Subgroup labels for the age breakdown and the name of the column that holds them.
AGE_VALUES = ['0-15 years', '16-34 years', '35-54 years', '55+ years']
CORE_COLUMN_NAME = 'age'

# One copy of every base row per age band.
drug_age_df = reshape_df_with_core_columns(
    df=base_drug_df,
    values=AGE_VALUES,
    new_column=CORE_COLUMN_NAME,
)

In [431]:
# Name the group explicitly instead of reading the shared CORE_COLUMN_NAME, which the
# sex section also assigns; the result then does not depend on which section ran last.
drug_age_df = enrich_with_additional_columns(drug_age_df, 'age')

In [432]:
# Some categories have no 0-15 years data (e.g. stimulants and covid).
# Every missing entry in the frame is replaced with 0.
drug_age_df = drug_age_df.mask(drug_age_df.isna(), 0)

In [433]:
# Preview the first ten rows of the age-grouped frame.
drug_age_df.head(10)

Unnamed: 0,age,time,start_time,end_time,value,setting,all drugs,all opioids,stimulants,cannabis,...,all drugs and covid-19,all opioids and covid-19,stimulants and covid-19,cannabis and covid-19,benzodiazepine and covid-19,all opioids overdose,stimulants overdose,cannabis overdose,benzodiazepine overdose,heroin overdose
0,0-15 years,1,01/01/2020,01/31/2020,38478.0,ip,7.700695,0.845032,0.190814,1.035846,...,0.530973,0.0,0.0,0.0,0.0,1.785714,8.928571,0.0,0.0,0.0
1,0-15 years,1,01/01/2020,01/31/2020,124275.0,ed,4.061278,0.023937,0.023937,0.111705,...,0.0,0.0,0.0,0.0,0.0,1.904762,6.666667,13.333333,1.904762,0.0
2,16-34 years,1,01/01/2020,01/31/2020,38478.0,ip,28.129252,4.931973,3.078231,5.884354,...,0.060459,0.0,0.0,0.289017,0.0,20.512821,11.111111,4.273504,11.111111,10.25641
3,16-34 years,1,01/01/2020,01/31/2020,124275.0,ed,31.21017,2.212661,1.843884,4.548248,...,0.041459,0.0,0.0,0.284495,0.0,41.685649,23.462415,23.917995,11.161731,7.744875
4,35-54 years,1,01/01/2020,01/31/2020,38478.0,ip,39.777424,7.329094,5.659777,3.513514,...,0.119904,0.0,0.561798,0.0,0.0,19.148936,21.276596,1.06383,15.957447,8.510638
5,35-54 years,1,01/01/2020,01/31/2020,124275.0,ed,46.468344,3.71084,3.375592,3.498902,...,0.03317,0.0,0.0,0.220264,0.0,53.70844,32.225064,11.253197,17.13555,9.71867
6,55+ years,1,01/01/2020,01/31/2020,38478.0,ip,29.376417,2.888619,1.13858,0.558748,...,0.089718,0.364964,0.0,0.0,0.0,21.495327,24.299065,0.0,14.953271,4.672897
7,55+ years,1,01/01/2020,01/31/2020,124275.0,ed,46.867033,1.392057,1.026432,1.345096,...,0.035786,0.240964,0.0,0.0,0.0,40.883978,20.994475,5.524862,7.18232,8.839779
8,0-15 years,2,02/01/2020,02/29/2020,35754.0,ip,7.14924,1.042598,0.134048,0.863867,...,1.041667,0.0,0.0,0.0,0.0,7.843137,7.843137,1.960784,3.921569,1.960784
9,0-15 years,2,02/01/2020,02/29/2020,113888.0,ed,3.958125,0.023525,0.032347,0.099982,...,0.0,0.0,0.0,0.0,0.0,1.941748,3.883495,8.737864,2.912621,0.0
