A notebook to combine the train and test data


In [1]:
# Prefixes added to the household / education column names before merging,
# so each merged column can be traced back to its source table.
HOUSE_PREFIX = "house_"
EDU_PREFIX = "edu_"

# Directory the combined CSV files are written to.
PROCESSED_DIR = "processed"

In [2]:
import os

import numpy as np
import pandas as pd

from data import *

# --- Combine the train tables into one frame keyed on "psu_hh_idcode" ---
#
# Build the string merge key "<psu>_<hh>_<idcode>" on the household and
# education frames. NOTE: this adds the column in place to the frames that
# came from `data`; presumably `pov_train` already carries the same key
# column (it is merged on it below) -- verify against `data`.
house_train["psu_hh_idcode"] = (
    house_train["psu"].astype(str)
    + "_"
    + house_train["hh"].astype(str)
    + "_"
    + house_train["idcode"].astype(str)
)
edu_train["psu_hh_idcode"] = (
    edu_train["psu"].astype(str)
    + "_"
    + edu_train["hh"].astype(str)
    + "_"
    + edu_train["idcode"].astype(str)
)

# Drop the raw id columns (now folded into the key) and prefix every remaining
# column with its source table. The merge key must keep its bare name, so the
# prefix is stripped from it again; the prefixed name is derived from the
# prefix constants so the rename cannot silently go stale if they change.
house_train_prefixed = (
    house_train.drop(columns=["psu", "hh", "idcode", "hhid"])
    .add_prefix(HOUSE_PREFIX)
    .rename(columns={f"{HOUSE_PREFIX}psu_hh_idcode": "psu_hh_idcode"})
)
edu_train_prefixed = (
    edu_train.drop(columns=["psu", "hh", "idcode"])
    .add_prefix(EDU_PREFIX)
    .rename(columns={f"{EDU_PREFIX}psu_hh_idcode": "psu_hh_idcode"})
)

# Left joins keep every row of `pov_train`. On a column-name collision the
# left-hand column keeps its name and the right-hand one gets the suffix.
pov_house_train = pd.merge(
    pov_train,
    house_train_prefixed,
    on="psu_hh_idcode",
    how="left",
    suffixes=[None, "_house"],
)
combined_train = pd.merge(
    pov_house_train,
    edu_train_prefixed,
    on="psu_hh_idcode",
    how="left",
    suffixes=[None, "_edu"],
)

combined_train.columns = combined_train.columns.str.lower()

# A left join adds rows when a key is duplicated in the right-hand table;
# make sure the merges did not change the number of rows.
assert combined_train.shape[0] == pov_train.shape[0], (
    "merge changed the number of rows: "
    f"{combined_train.shape[0]} != {pov_train.shape[0]}"
)

# save to csv (create the output directory first so a fresh checkout works)
os.makedirs(PROCESSED_DIR, exist_ok=True)
combined_train.to_csv(os.path.join(PROCESSED_DIR, "combined_train.csv"), index=False)
combined_train.shape

(5337, 99)

In [3]:
# --- Combine the test tables into one frame keyed on "psu_hh_idcode" ---
#
# Build the string merge key "<psu>_<hh>_<idcode>" on the household and
# education frames. NOTE: this adds the column in place to the frames that
# came from `data`; presumably `sample_submission` already carries the same
# key column (it is merged on it below) -- verify against `data`.
house_test["psu_hh_idcode"] = (
    house_test["psu"].astype(str)
    + "_"
    + house_test["hh"].astype(str)
    + "_"
    + house_test["idcode"].astype(str)
)
edu_test["psu_hh_idcode"] = (
    edu_test["psu"].astype(str)
    + "_"
    + edu_test["hh"].astype(str)
    + "_"
    + edu_test["idcode"].astype(str)
)

# Drop the raw id columns (now folded into the key) and prefix every remaining
# column with its source table, using the shared prefix constants rather than
# repeated string literals. The merge key must keep its bare name, so the
# prefix is stripped from it again.
house_test_prefixed = (
    house_test.drop(columns=["psu", "hh", "idcode", "hhid"])
    .add_prefix(HOUSE_PREFIX)
    .rename(columns={f"{HOUSE_PREFIX}psu_hh_idcode": "psu_hh_idcode"})
)
edu_test_prefixed = (
    edu_test.drop(columns=["psu", "hh", "idcode"])
    .add_prefix(EDU_PREFIX)
    .rename(columns={f"{EDU_PREFIX}psu_hh_idcode": "psu_hh_idcode"})
)

# Left joins keep every row of `sample_submission`. On a column-name collision
# the left-hand column keeps its name and the right-hand one gets the suffix.
submission_house_test = pd.merge(
    sample_submission,
    house_test_prefixed,
    on="psu_hh_idcode",
    how="left",
    suffixes=[None, "_house"],
)
combined_test = pd.merge(
    submission_house_test,
    edu_test_prefixed,
    on="psu_hh_idcode",
    how="left",
    suffixes=[None, "_edu"],
)
combined_test.columns = combined_test.columns.str.lower()

# A left join adds rows when a key is duplicated in the right-hand table;
# make sure the merges did not change the number of rows.
assert combined_test.shape[0] == sample_submission.shape[0], (
    "merge changed the number of rows: "
    f"{combined_test.shape[0]} != {sample_submission.shape[0]}"
)

# save to csv (create the output directory first so a fresh checkout works)
os.makedirs(PROCESSED_DIR, exist_ok=True)
combined_test.to_csv(os.path.join(PROCESSED_DIR, "combined_test.csv"), index=False)
combined_test.shape

(1334, 99)