### Importing packages and setting FILE_NAMES

In [None]:
import numpy as np
import pandas as pd

# Starting year of every survey wave handled by this notebook
# (2020 is not in the list).
years = [2019, 2021, 2022, 2023, 2024]

# Raw CSV path of each wave, index-aligned with `years`; a wave is named
# after its starting year and the year that follows it.
FILE_NAMES = [
    f'../../data/raw/HMS_{start}-{start + 1}_PUBLIC_instchars.csv'
    for start in years
]

In [None]:
# Race indicator columns, in the order used to derive the single `race` code.
race_cols = [
    'race_black',
    'race_ainaan',
    'race_asian',
    'race_his',
    'race_pi',
    'race_mides',
    'race_white',
]

# audit_1 .. audit_10 without audit_9, whose data is corrupted.
alcohol_in_substance_module = [f'audit_{n}' for n in (*range(1, 9), 10)]

# Every column that is cast to the nullable integer dtype before export.
categories_with_int_values = [
    'international',
    'alc_any',
    'alc_binge',
    *alcohol_in_substance_module,
    'smok_freq',
    'smok_vape',
    'exerc',
]

# Bring every survey wave onto the same set of columns and write one interim
# CSV per wave. `years` and `FILE_NAMES` are walked in lockstep.
for year, raw_path in zip(years, FILE_NAMES):
    df = pd.read_csv(raw_path)

    # alc_binge
    # Every wave except 2021-2022 has an aggregate binge_fr feature.
    # For 2021-2022 it is rebuilt as the row-wise maximum (missing values
    # skipped) of q3_22 (which should be binge_fr_f), binge_fr_m and
    # q3_22_0 (which should be binge_fr_o).
    if year != 2021:
        df['alc_binge'] = df['binge_fr']
    else:
        binge_parts = df[['q3_22', 'binge_fr_m', 'q3_22_0']].astype('Int64')
        df['alc_binge'] = binge_parts.max(axis=1, skipna=True)

    # smok_freq / smok_vape
    if year != 2024:
        # Encoding changed from 1=yes 2=no to 1=yes 0=no
        df['smok_vape'] = df['smok_vape'].mod(2)
    else:
        # Both columns are missing in the last wave; create them empty so the
        # exported columns are the same for every wave.
        df = df.assign(smok_freq=np.nan, smok_vape=np.nan)

    # exerc
    # Wave-specific recoding; a wave matching none of the branches (2019)
    # keeps its exerc column untouched.
    if year in (2023, 2024):
        df['exerc'] = df['exerc'].replace([3, 4, 5], [2, 3, 4])
    elif year == 2021:
        df['exerc'] = df['exerc'].replace(6, 2)
    elif year == 2022:
        df['exerc'] = df['exerc_range4']

    # race
    # Collapse the race indicator columns into one column: each indicator is
    # weighted by its 1-based position in race_cols and the weights are summed.
    # Rows whose indicators do not sum to exactly one (no race or several
    # races marked) are flagged through will_remove.
    race_flags = df[race_cols].fillna(0)
    df['race'] = race_flags.dot(np.arange(1, 8)).astype('Int64')
    df['will_remove'] = race_flags.sum(axis=1).ne(1).astype('Int64')

    # Keep only the exported columns and store the categorical ones with the
    # nullable integer dtype.
    df = df[['responseid', 'will_remove', 'race'] + categories_with_int_values]
    df = df.astype(dict.fromkeys(categories_with_int_values, 'Int64'))

    new_filename = f'../../data/interim/{year}-{year+1}_jose.csv'
    df.to_csv(new_filename, index=False)