In [28]:
import pandas as pd
from sympy import symbols, Eq, solve
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

def load_csv(url: str) -> pd.DataFrame:
    """
    Read a comma-separated file into a DataFrame.

    Parameters:
    - url (str): Location of the CSV, passed straight through to `pd.read_csv`.

    Returns:
    - pd.DataFrame: The parsed file contents.
    """
    frame = pd.read_csv(url, sep=",")
    return frame

# Name of the raw CSV and the relative path (under ../data/) it is read from.
FILENAME = "Drug_Use_Data_from_Selected_Hospitals.csv"
filepath = f"../data/{FILENAME}"

In [29]:
# Load the raw CSV; the following cells clean this frame and use it as the lookup source.
drug_use_df = load_csv(filepath)

The first ~80 rows have a patient count that we could use to extrapolate counts for each condition, but that involves a bit of math, and I think we are better off doing the analysis on the ratios (percentages).

Consider these two rows:
| sex   | Start Time | End Time   | setting | all drugs |
| ------| -----------| ---------- | ------- | ----------|
| female| 01/01/2020 | 01/31/2020 | ip      | 23.498389 |
| male  | 01/01/2020 | 01/31/2020 | ip      | 30.455556 |


You can see that the percentages don't add up to 100%. This is because each row has its own denominator. The male row reads: of all males admitted to the hospital as inpatients during this time period, 30.4556% were admitted for the condition "all drugs".

In the end, I created 2 dataframes.
- 1st with data grouped by sex categories: drug_sex_df
- 2nd with data grouped by age categories: drug_age_df

We can do EDA on each.

### Create and cleanup base data

In [30]:
# Lowercase the header names so later cells can refer to columns in lowercase
drug_use_df.columns = drug_use_df.columns.str.lower()

# Lowercase the text values in the descriptive columns as well
COLUMNS_TO_CONVERT_TO_LOWERCASE = ['setting', 'indicator', 'group', 'subgroup', 'measure']
for col in COLUMNS_TO_CONVERT_TO_LOWERCASE:
    drug_use_df[col] = drug_use_df[col].str.lower()

# Keep 4 decimal places on the reported values
drug_use_df['value'] = drug_use_df['value'].round(4)

# Descriptive columns that are not needed in the base data.
# The first list keeps 'value' (for the counts frame); the second drops it too.
COLUMNS_TO_DROP_WITH_VALUE = ['figure', 'indicator', 'group', 'subgroup', 'measure']
COLUMNS_TO_DROP= ['figure', 'indicator', 'group', 'subgroup', 'measure', 'value']

# The base data is the first 82 rows of the file
leading_rows = drug_use_df.iloc[:82]

# Base rows with 'value' retained
base_drug_df_counts = leading_rows.drop(columns=COLUMNS_TO_DROP_WITH_VALUE)

# Base rows without 'value'; this is the frame that gets reshaped per group
base_drug_df = leading_rows.drop(columns=COLUMNS_TO_DROP)



### Helper functions and constants

In [31]:
# Condition names that appear in the rows; each one is turned into its own column
BASE_DRUGS = [
    'All Drugs',
    'All Opioids',
    'Stimulants',
    'Cannabis',
    'Benzodiazepine',
]
BASE_DRUGS_CO_OCCURING = [
    'All Drugs and co-occurring disorders',
    'All Opioids and co-occurring disorders',
    'Stimulants and co-occurring disorders',
    'Cannabis and co-occurring disorders',
    'Benzodiazepine and co-occurring disorders',
]
BASE_DRUGS_COVID = [
    'All Drugs and COVID-19',
    'All Opioids and COVID-19',
    'Stimulants and COVID-19',
    'Cannabis and COVID-19',
    'Benzodiazepine and COVID-19',
]
# Fentanyl overdose only exists for some rows, so it is deliberately not listed here
BASE_DRUGS_OVERDOSE = [
    'All Opioids overdose',
    'Stimulants overdose',
    'Cannabis overdose',
    'Benzodiazepine overdose',
    'Heroin overdose',
]

# Only the base drug list is used for the analysis now; the full set was:
# LIST_OF_NEW_DRUG_COLUMN_LISTS = [BASE_DRUGS, BASE_DRUGS_CO_OCCURING, BASE_DRUGS_COVID, BASE_DRUGS_OVERDOSE]
LIST_OF_NEW_DRUG_COLUMN_LISTS = [BASE_DRUGS]

In [32]:
def reshape_df_with_core_columns(df: pd.DataFrame, values: list, new_column: str) -> pd.DataFrame:
    """
    Repeat every row of a DataFrame once per entry in `values` and label each copy in a new column.

    Parameters:
    - df (pd.DataFrame): The input DataFrame to be reshaped. It is not modified.
    - values (list): Values to populate the new column; the full list is cycled once per source row.
    - new_column (str): The name of the new column to be added.

    Returns:
    - pd.DataFrame: A new DataFrame with len(df) * len(values) rows and a fresh 0..n-1 index.
      `new_column` is moved to the first position and, when present, 'setting' is moved to the
      last position. When `df` has a 'time' column the rows are sorted by ['time', new_column];
      otherwise they keep the source row order with `values` in the order given.

    Example:
    Given a DataFrame:
       date
    0  2021
    1  2022

    Calling reshape_df_with_core_columns(df, ['male', 'female'], 'sex') will result in:
          sex  date
    0    male  2021
    1  female  2021
    2    male  2022
    3  female  2022
    """
    values = list(values)

    # Repeat rows on a fresh RangeIndex. Repeating the caller's own labels through .loc
    # would return every row sharing a label when the incoming index has duplicates,
    # and the column assignment below would then fail with a length mismatch.
    base = df.reset_index(drop=True)
    df_repeated = base.loc[base.index.repeat(len(values))].reset_index(drop=True)
    df_repeated[new_column] = values * len(df)

    # Reorder columns for clarity
    columns = df_repeated.columns.tolist()
    # Moving Setting to end (only when the frame actually has that column)
    if 'setting' in columns:
        columns.remove('setting')
        columns.append('setting')

    # Moving newly added column to front
    columns.remove(new_column)
    columns.insert(0, new_column)
    df_repeated = df_repeated[columns]

    # Sort data and drop previous index; without a 'time' column the repeat order is kept
    if 'time' in columns:
        df_repeated = df_repeated.sort_values(by=['time', new_column]).reset_index(drop=True)
    return df_repeated

In [33]:
# Row-wise helper (used with DataFrame.apply) that looks up the matching value for one row
def transform_with_lookup_value(row, reference_df: pd.DataFrame, new_column: str, core_column: str):
    """
    Find the 'value' in a reference DataFrame that corresponds to a single row.

    Parameters:
    - row (pd.Series): The row needing a value; it must provide 'time', 'setting' and `core_column`.
    - reference_df (pd.DataFrame): Frame searched for the match; it must provide 'time', 'subgroup',
      'setting' and 'indicator', plus 'value' for the result.
    - new_column (str): The indicator name that the reference row's 'indicator' must equal.
    - core_column (str): The key in `row` whose value must equal the reference row's 'subgroup'.

    Returns:
    - value (float or int or None): The 'value' of the first reference row satisfying all four
      conditions, or None when no reference row satisfies them.
    """
    same_time = reference_df['time'] == row['time']
    same_subgroup = reference_df['subgroup'] == row[core_column]
    same_setting = reference_df['setting'] == row['setting']
    same_indicator = reference_df['indicator'] == new_column

    candidates = reference_df[same_time & same_subgroup & same_setting & same_indicator]
    if candidates.empty:
        return None
    # Several rows may match; the first one wins
    return candidates['value'].iloc[0]

In [34]:
def enrich_with_additional_columns(df: pd.DataFrame, core_column: str) -> pd.DataFrame:
    """
    Add one looked-up column per drug name to `df`.

    The lookup source is the global `drug_use_df`, restricted to rows whose 'group' equals
    `core_column`. For every name in every list of the global `LIST_OF_NEW_DRUG_COLUMN_LISTS`,
    the name is lowercased and a column of that name is filled row by row using
    `transform_with_lookup_value`.

    Parameters:
    - df (pd.DataFrame): The frame to enrich. The new columns are assigned on this same object.
    - core_column (str): Both the 'group' value used to filter `drug_use_df` and the column of
      `df` that is matched against 'subgroup' during the lookup.

    Returns:
    - pd.DataFrame: The same `df` object, now carrying the additional columns.

    Notes:
    - Relies on the globals `drug_use_df` and `LIST_OF_NEW_DRUG_COLUMN_LISTS`.
    - Rows with no match in the reference data get None from the lookup.
    """
    # Keep only the reference rows that belong to the requested group
    reference_df = drug_use_df[drug_use_df.group == core_column]

    for column_list in LIST_OF_NEW_DRUG_COLUMN_LISTS:
        # The reference data was lowercased, so the names are lowercased before matching
        for new_column in [name.lower() for name in column_list]:
            df[new_column] = df.apply(
                transform_with_lookup_value,
                axis=1,
                args=(reference_df, new_column, core_column),
            )
    return df

## Drug Data by Age

Organize and group the data by age, leaving out sex.

In [35]:
# Age-group labels; every base row is repeated once per label
AGE_VALUES = ['0-15 years', '16-34 years', '35-54 years', '55+ years']
# Name of the new column that holds those labels
CORE_COLUMN_NAME = 'age'
drug_age_df = reshape_df_with_core_columns(base_drug_df, AGE_VALUES, CORE_COLUMN_NAME)

In [36]:
# Add one column per drug, looked up from the drug_use_df rows whose group is 'age'
drug_age_df = enrich_with_additional_columns(drug_age_df, CORE_COLUMN_NAME)

In [37]:
# Sanity check: overall shape and the number of rows per age group
print(f"shape: {drug_age_df.shape}")
print(f"value_counts: {drug_age_df['age'].value_counts()}")

shape: (328, 10)
value_counts: 0-15 years     82
16-34 years    82
35-54 years    82
55+ years      82
Name: age, dtype: int64


In [38]:
# Count missing values per column; a lookup that found no match would show up here
drug_age_df.isnull().sum()

age               0
time              0
start_time        0
end_time          0
setting           0
all drugs         0
all opioids       0
stimulants        0
cannabis          0
benzodiazepine    0
dtype: int64

In [39]:
# Preview the first 10 rows of the age-grouped frame
drug_age_df.head(10)

Unnamed: 0,age,time,start_time,end_time,setting,all drugs,all opioids,stimulants,cannabis,benzodiazepine
0,0-15 years,1,01/01/2020,01/31/2020,ip,7.7007,0.845,0.1908,1.0358,0.2044
1,0-15 years,1,01/01/2020,01/31/2020,ed,4.0613,0.0239,0.0239,0.1117,0.0106
2,16-34 years,1,01/01/2020,01/31/2020,ip,28.1293,4.932,3.0782,5.8844,1.1224
3,16-34 years,1,01/01/2020,01/31/2020,ed,31.2102,2.2127,1.8439,4.5482,0.4594
4,35-54 years,1,01/01/2020,01/31/2020,ip,39.7774,7.3291,5.6598,3.5135,1.4467
5,35-54 years,1,01/01/2020,01/31/2020,ed,46.4683,3.7108,3.3756,3.4989,0.8786
6,55+ years,1,01/01/2020,01/31/2020,ip,29.3764,2.8886,1.1386,0.5587,0.3848
7,55+ years,1,01/01/2020,01/31/2020,ed,46.867,1.3921,1.0264,1.3451,0.2281
8,0-15 years,2,02/01/2020,02/29/2020,ip,7.1492,1.0426,0.134,0.8639,0.134
9,0-15 years,2,02/01/2020,02/29/2020,ed,3.9581,0.0235,0.0323,0.1,0.0147


In [40]:
# Write the age-grouped frame to ../data/drug_age.csv without the index column
drug_age_df.to_csv('../data/drug_age.csv', index=False)

## Drug Data by Age with Values

Organize and group the data by age, leaving out sex, and convert the percentages to patient counts.

## Not possible