In [5]:
import sys
import pandas as pd
from collections import defaultdict

# NOTE(review): machine-specific absolute path. AHS_Parser can only be imported
# where the repository is checked out at exactly this location; consider
# deriving the root from the notebook's working directory instead.
PROJECT_ROOT = '/Users/diol5851/Documents/rental-housing-cost/'

# Expose the repository root on sys.path only while the project import runs.
# The finally clause removes the entry even when the import fails, so that
# re-running this cell does not leave stale or duplicated entries behind.
sys.path.insert(0, PROJECT_ROOT)
try:
    from notebooks.data_cleaning_EDA.AHS_Parser import AHS_Parser
finally:
    sys.path.remove(PROJECT_ROOT)

In [8]:
def get_dataset(year, codebook_year=2023, min_non_null: float = 20e3) -> pd.DataFrame:
    """Load one AHS feather file and return a cleaned table with no nulls.

    Reads ``../../data/AHS_{year}.feather`` (relative to the current working
    directory) and applies these steps in order:

    1. drop the ``YEAR`` column and move ``ADEQUACY`` to the first position;
    2. keep only rows with ``BATHROOMS < 4``;
    3. recode ``ADEQUACY`` to 1 for the label ``'Adequate'`` and 0 for
       everything else;
    4. drop a fixed list of columns judged redundant for prediction;
    5. convert every remaining object-dtype column to a categorical, relabel
       its categories via ``AHS_Parser.parse_categorical`` and turn the
       ``'Not reported'`` category into missing values;
    6. drop columns with fewer than ``min_non_null`` non-null values;
    7. drop every row that still contains a missing value.

    Args:
        year: Survey year used to build the feather file name.
        codebook_year: Year handed to ``AHS_Parser.parse_categorical`` when
            relabelling categories. Defaults to 2023, the value that was
            previously hard-coded independently of ``year``.
            NOTE(review): confirm whether this should follow ``year``.
        min_non_null: Minimum number of non-null values a column needs in
            order to be kept (step 6). Defaults to 20e3.

    Returns:
        The cleaned DataFrame, with ``ADEQUACY`` as the first column.

    Raises:
        KeyError: If one of the columns referenced by name (``YEAR``,
            ``ADEQUACY``, ``BATHROOMS`` or any column in the drop list) is
            absent from the file.
    """
    raw_dat = pd.read_feather(f'../../data/AHS_{year}.feather')
    # Drop the year column, since that does not add information here.
    dat = raw_dat.drop(columns=['YEAR'])
    # Move ADEQUACY to the first column.
    adequacy = dat.pop('ADEQUACY')
    dat.insert(0, 'ADEQUACY', adequacy)
    # Drop rows with four or more bathrooms. The explicit copy makes the
    # filtered frame independent of its parent, so the column assignment
    # below does not raise SettingWithCopyWarning.
    dat = dat[dat['BATHROOMS'] < 4].copy()
    # Remap the ADEQUACY column to a binary column, where 1 is adequate.
    # A defaultdict is used because Series.map honours __missing__, so any
    # label other than 'Adequate' becomes 0 instead of NaN.
    ad_default = defaultdict(int, {'Adequate': 1})
    dat['ADEQUACY'] = dat['ADEQUACY'].map(ad_default)
    # Dropping columns that seem redundant for prediction.
    dat = dat.drop(columns=[
        'UTILAMT', 'HOAAMT', 'INSURAMT', 'FINCP', 'KITCHENS', 'DINING', 'LAUNDY', 'GARAGE', 'PORCH',
        'NUMNONREL', 'HHADLTKIDS', 'SUBDIV'
        ])
    # Parse the remaining categorical columns.
    parser = AHS_Parser()
    for col in dat.select_dtypes(include=['object']).columns:
        values = dat[col].astype('category')
        values = values.cat.rename_categories(parser.parse_categorical(col, codebook_year))
        # Removing a category sets the rows that carried it to NaN, so
        # 'Not reported' is treated as missing by the steps below.
        if 'Not reported' in values.cat.categories:
            values = values.cat.remove_categories(['Not reported'])
        dat[col] = values
    # Drop columns with too few non-null values (20k by default).
    sparse_columns = dat.columns[dat.count() < min_non_null]
    dat = dat.drop(columns=sparse_columns)
    # Drop all remaining null values.
    dat = dat.dropna()
    return dat