In [10]:
import helpers
import numpy as np
import matplotlib.pyplot as plt

In [116]:
import os
import numpy as np

def load_csv_data(data_path, sub_sample=False, keep_cols=None):
    """
    Load the train/test CSV files and also return the feature column names.

    Reads ``x_train.csv``, ``y_train.csv`` and ``x_test.csv`` from
    ``data_path``. The first column of each ``x_*`` file is split off as the
    sample id and the remaining columns form the feature matrix. Labels are
    taken from the second column of ``y_train.csv``. Feature names come from
    the header line of ``x_train.csv`` (without its first entry).

    Args:
        data_path (str): directory containing the three CSV files.
        sub_sample (bool): if True, keep only every 50th training sample
            (features, labels and ids alike). The test set is left untouched.
        keep_cols: optional selector applied to the feature columns, i.e.
            after the id column has been removed. Anything NumPy accepts as
            an index along one axis works: a sequence of integer positions,
            a boolean mask with one entry per feature, or a slice.
            ``None`` keeps every column.

    Returns:
        tuple: ``(x_train, x_test, y_train, train_ids, test_ids,
        feature_names)`` where ``feature_names`` is a list of str aligned
        with the columns of ``x_train`` / ``x_test``.
    """
    # --- Read header (for feature names) ---
    with open(os.path.join(data_path, "x_train.csv"), "r") as f:
        header = f.readline().strip().split(",")
    feature_names = header[1:]  # drop the first column ("Id")

    # --- Load arrays ---
    y_train = np.genfromtxt(
        os.path.join(data_path, "y_train.csv"),
        delimiter=",",
        skip_header=1,
        dtype=int,
        usecols=1,
    )
    x_train = np.genfromtxt(
        os.path.join(data_path, "x_train.csv"), delimiter=",", skip_header=1
    )
    x_test = np.genfromtxt(
        os.path.join(data_path, "x_test.csv"), delimiter=",", skip_header=1
    )

    # First column holds the ids; everything after it is a feature.
    train_ids = x_train[:, 0].astype(int)
    test_ids = x_test[:, 0].astype(int)
    x_train = x_train[:, 1:]
    x_test = x_test[:, 1:]

    # --- Keep only selected columns (if specified) ---
    if keep_cols is not None:
        x_train = x_train[:, keep_cols]
        x_test = x_test[:, keep_cols]
        # Select the names with the very same NumPy indexing rules as the
        # data, so boolean masks and slices stay aligned with the columns
        # (a plain Python loop would read True/False as positions 1/0).
        # The trailing comma makes a tuple selector behave as a list of
        # positions, exactly as it does in ``x[:, keep_cols]``.
        name_array = np.array(feature_names, dtype=object)
        feature_names = np.atleast_1d(name_array[keep_cols,]).tolist()

    # --- Sub-sample ---
    if sub_sample:
        y_train = y_train[::50]
        x_train = x_train[::50]
        train_ids = train_ids[::50]

    return x_train, x_test, y_train, train_ids, test_ids, feature_names


In [112]:
# Column names that are removed from the feature matrix by hand.
# The order of the entries is kept exactly as originally written.
fields_to_drop = [
    "_STATE", "FMONTH", "IDATE", "IMONTH", "IDAY", "IYEAR", "DISPCODE", "SEQNO",
    "_PSU", "CTELENUM", "PVTRESD1", "STATERES", "CELLFON3", "NUMADULT", "NUMMEN",
    "NUMWOMEN", "CTELNUM1", "CELLFON2", "CADULT", "PVTRESD2", "CSTATE", "LANDLINE",
    "HHADULT", "HLTHPLN1", "PERSDOC2", "MARITAL", "EDUCA", "RENTHOM1", "NUMHHOL2",
    "CPDEMO1", "VETERAN3", "EMPLOY1", "CHILDREN", "INCOME2", "INTERNET", "MEDCOST",
    "USEEQUIP", "BLIND", "DECIDE", "DIFFWALK", "DIFFDRES", "DIFFALON",
    # TODO(review): the next eight entries were each tagged "probably to keep?"
    # by the author - decide whether they should really be dropped.
    "EXERANY2", "EXRACT11", "EXEROFT1", "EXERHMM1",
    "EXRACT21", "EXEROFT2", "EXERHMM2", "STRENGTH",
    "SEATBELT", "IMFVPLAC", "SXORIENT", "TRNSGNDR", "QSTVER", "QSTLANG", "MSCODE",
    "_STSTR", "_STRWT", "_RAWRAKE", "_WT2RAKE", "_CHISPNC", "_DUALUSE", "_DUALCOR", "_LLCPWT",
    "_HCVU651", "_DRDXAR1", "_PRACE1", "_MRACE1", "_HISPANC", "_RACE", "_RACEG21", "_RACEGR3",
    "_RACE_G1", "_CHLDCNT", "_EDUCAG", "_INCOMG", "_RFSEAT2", "_RFSEAT3",
]



In [118]:
# Load the full (not sub-sampled) dataset together with its feature names.
(
    x_train,
    x_test,
    y_train,
    train_ids,
    test_ids,
    feature_names,
) = load_csv_data(data_path="data/dataset")

In [119]:
# (n_samples, n_features) of the training matrix
x_train.shape

(328135, 321)

# Drop features with more than 70% NaNs

In [120]:
# Share of missing values per feature column, expressed in percent.
nan_counts = np.isnan(x_train).sum(axis=0)
nan_percent = nan_counts / x_train.shape[0] * 100

# Columns whose NaN share is above this percentage are discarded.
threshold = 70.0

# Positions (and matching names) of the columns that pass the NaN filter.
keep_indices = np.flatnonzero(nan_percent <= threshold).tolist()
keep_names = [feature_names[col] for col in keep_indices]

print("Keeping", len(keep_indices), "features")

Keeping 200 features


# Drop the manually selected features listed in `fields_to_drop`

In [121]:
# Column positions of the hand-picked fields. list.index raises ValueError
# if a name in fields_to_drop does not occur in feature_names.
drop_indices = [feature_names.index(name) for name in fields_to_drop]

# Remove those positions from the NaN-filtered selection. A set difference
# replaces the repeated list scans; sorted() restores ascending column order,
# so the result is the same ascending, duplicate-free list as before.
keep_indices = sorted(set(keep_indices) - set(drop_indices))
keep_names = [feature_names[i] for i in keep_indices]

In [122]:
# Apply the final column selection to both splits; the originals are left intact.
x_train_clean, x_test_clean = (
    split[:, keep_indices] for split in (x_train, x_test)
)

In [123]:
# Number of feature names left in keep_names
len(keep_names)

121

In [124]:
# Display the names of the retained features
keep_names

['GENHLTH',
 'PHYSHLTH',
 'MENTHLTH',
 'POORHLTH',
 'CHECKUP1',
 'BPHIGH4',
 'BPMEDS',
 'BLOODCHO',
 'CHOLCHK',
 'TOLDHI2',
 'CVDSTRK3',
 'ASTHMA3',
 'CHCSCNCR',
 'CHCOCNCR',
 'CHCCOPD1',
 'HAVARTH3',
 'ADDEPEV2',
 'CHCKIDNY',
 'DIABETE3',
 'SEX',
 'WEIGHT2',
 'HEIGHT3',
 'QLACTLM2',
 'SMOKE100',
 'SMOKDAY2',
 'USENOW3',
 'ALCDAY5',
 'AVEDRNK2',
 'DRNK3GE5',
 'MAXDRNKS',
 'FRUITJU1',
 'FRUIT1',
 'FVBEANS',
 'FVGREEN',
 'FVORANG',
 'VEGETAB1',
 'LMTJOIN3',
 'ARTHDIS2',
 'ARTHSOCL',
 'JOINPAIN',
 'FLUSHOT6',
 'FLSHTMY2',
 'PNEUVAC3',
 'HIVTST6',
 '_RFHLTH',
 '_RFHYPE5',
 '_CHOLCHK',
 '_RFCHOL',
 '_LTASTH1',
 '_CASTHM1',
 '_ASTHMS1',
 '_AGEG5YR',
 '_AGE65YR',
 '_AGE80',
 '_AGE_G',
 'HTIN4',
 'HTM4',
 'WTKG3',
 '_BMI5',
 '_BMI5CAT',
 '_RFBMI5',
 '_SMOKER3',
 '_RFSMOK3',
 'DRNKANY5',
 'DROCDY3_',
 '_RFBING5',
 '_DRNKWEK',
 '_RFDRHV5',
 'FTJUDA1_',
 'FRUTDA1_',
 'BEANDAY_',
 'GRENDAY_',
 'ORNGDAY_',
 'VEGEDA1_',
 '_MISFRTN',
 '_MISVEGN',
 '_FRTRESP',
 '_VEGRESP',
 '_FRUTSUM',
 '_VEGESUM',
 '