In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
import sklearn.datasets
from sklearn.base import clone

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from folktables import ACSDataSource, ACSIncome, BasicProblem
import folktables

from synthcity.plugins import Plugins
from synthcity.plugins.core.dataloader import GenericDataLoader

In [None]:
def get_X_y(df, target_name):
    """Split a frame into its feature columns and its target column.

    Args:
        df: DataFrame containing both features and the target.
        target_name: Label of the target column.

    Returns:
        A ``(X, y)`` tuple: ``X`` is ``df`` without the target column,
        ``y`` is the target column as a Series.
    """
    features = df.drop(columns=[target_name])
    target = df[target_name]
    return features, target

def move_target_col_to_end(df, target_name):
    """Return a copy of ``df`` whose target column is the last column.

    The relative order of all other columns is kept. Raises ``ValueError``
    when ``target_name`` is not among the columns.
    """
    ordered = list(df.columns)
    # Pull the target out of its current slot and re-attach it at the tail.
    ordered.append(ordered.pop(ordered.index(target_name)))
    return df.reindex(columns=ordered)

def result_record(model_name, score, mse, medse, train_data_origin):
    """Bundle one model's evaluation metrics into a flat dict.

    The dict has the keys ``train_data``, ``model``, ``R2``, ``mse`` and
    ``medse`` (in that order), suitable for ``pd.DataFrame.from_records``.
    """
    return dict(
        train_data=train_data_origin,
        model=model_name,
        R2=score,
        mse=mse,
        medse=medse,
    )

def train_algos(X_train, y_train, X_test, y_test, train_data_origin, models=None):
    """Fit every benchmark model on the training data and score it on the test data.

    Args:
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target used for evaluation.
        train_data_origin: Label stored in each record's ``train_data`` field
            (e.g. ``"real"``).
        models: Optional mapping of display name -> unfitted estimator.
            Defaults to the module-level ``algos`` dictionary.

    Returns:
        A list with one ``result_record`` dict per model, holding R^2, mean
        squared error and median squared error on the test set.
    """
    if models is None:
        models = algos

    records = []
    for name, model in tqdm(models.items()):
        # Clone so the template estimators in the mapping are never fitted.
        fitted = clone(model).fit(X_train, y_train)
        # Predict once and derive every metric from the same predictions;
        # estimator.score() would run a second full prediction pass.
        preds = fitted.predict(X_test)
        squared_errors = (preds - y_test) ** 2
        score = r2_score(y_test, preds)
        mse = np.mean(squared_errors)
        medse = np.median(squared_errors)
        records.append(result_record(name, score, mse, medse, train_data_origin))

    return records

def plot_results(records):
    """Draw side-by-side bar charts of MSE, median squared error and R^2 per model.

    Args:
        records: Iterable of dicts as produced by ``result_record``; must
            provide the keys ``model``, ``mse``, ``medse`` and ``R2``.
    """
    res_df = pd.DataFrame.from_records(records)

    # (column in res_df, panel title) for each subplot, left to right.
    panels = [("mse", "MSE"), ("medse", "Median SE"), ("R2", "R^2")]

    fig, axes = plt.subplots(1, len(panels), figsize=(15, 5))
    for ax, (metric, title) in zip(axes, panels):
        # Pass the frame by keyword: positional data is only accepted by
        # newer seaborn releases.
        sns.barplot(data=res_df, x="model", y=metric, ax=ax)
        ax.set_title(title)
        # Model names are long; rotate them so they do not overlap.
        ax.tick_params(axis="x", labelrotation=90)
    plt.show()

# Regressors to benchmark, keyed by the display name that ends up in the
# result records and on the plot axes. train_algos clones each entry before
# fitting, so the instances stored here stay unfitted.
algos = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(),
    "1-NN": KNeighborsRegressor(n_neighbors=1),
    "5-NN": KNeighborsRegressor(n_neighbors=5),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest": RandomForestRegressor(),
    "Gradient Boosting": GradientBoostingRegressor(),
    "MLP": make_pipeline(StandardScaler(), MLPRegressor(max_iter=1000)), # Default doesn't always converge
    "SVM": make_pipeline(StandardScaler(), SVR()),
}
# Dict keys view (not a list) over the model names above.
algo_names = algos.keys()

In [None]:
# This dictionary is from https://github.com/socialfoundations/folktables/blob/main/examples/encoding_categorical_features.ipynb
# It maps, per categorical ACS feature, each numeric code to a human-readable label.
# NOTE(review): the multi-line labels use implicit string concatenation without a
# separating space, so e.g. COW 1.0 reads "...company orbusiness, or of an individual,...".
# Left untouched to stay identical to the upstream dictionary; editing the labels would
# presumably change the category values in the resulting frames and saved CSVs — confirm
# before fixing.
ACSIncome_categories = {
    "COW": {
        1.0: (
            "Employee of a private for-profit company or"
            "business, or of an individual, for wages,"
            "salary, or commissions"
        ),
        2.0: (
            "Employee of a private not-for-profit, tax-exempt,"
            "or charitable organization"
        ),
        3.0: "Local government employee (city, county, etc.)",
        4.0: "State government employee",
        5.0: "Federal government employee",
        6.0: (
            "Self-employed in own not incorporated business,"
            "professional practice, or farm"
        ),
        7.0: (
            "Self-employed in own incorporated business,"
            "professional practice or farm"
        ),
        8.0: "Working without pay in family business or farm",
        9.0: "Unemployed and last worked 5 years ago or earlier or never worked",
    },
    "SCHL": {
        1.0: "No schooling completed",
        2.0: "Nursery school, preschool",
        3.0: "Kindergarten",
        4.0: "Grade 1",
        5.0: "Grade 2",
        6.0: "Grade 3",
        7.0: "Grade 4",
        8.0: "Grade 5",
        9.0: "Grade 6",
        10.0: "Grade 7",
        11.0: "Grade 8",
        12.0: "Grade 9",
        13.0: "Grade 10",
        14.0: "Grade 11",
        15.0: "12th grade - no diploma",
        16.0: "Regular high school diploma",
        17.0: "GED or alternative credential",
        18.0: "Some college, but less than 1 year",
        19.0: "1 or more years of college credit, no degree",
        20.0: "Associate's degree",
        21.0: "Bachelor's degree",
        22.0: "Master's degree",
        23.0: "Professional degree beyond a bachelor's degree",
        24.0: "Doctorate degree",
    },
    "MAR": {
        1.0: "Married",
        2.0: "Widowed",
        3.0: "Divorced",
        4.0: "Separated",
        5.0: "Never married or under 15 years old",
    },
    "SEX": {1.0: "Male", 2.0: "Female"},
    "RAC1P": {
        1.0: "White alone",
        2.0: "Black or African American alone",
        3.0: "American Indian alone",
        4.0: "Alaska Native alone",
        5.0: (
            "American Indian and Alaska Native tribes specified;"
            "or American Indian or Alaska Native,"
            "not specified and no other"
        ),
        6.0: "Asian alone",
        7.0: "Native Hawaiian and Other Pacific Islander alone",
        8.0: "Some Other Race alone",
        9.0: "Two or More Races",
    },
}

In [None]:
# Load the 2018 1-Year ACS person survey for California through folktables.
# NOTE(review): download=True presumably fetches the raw files when they are not
# available locally — confirm against the folktables documentation.
data_source = ACSDataSource(survey_year="2018", horizon="1-Year", survey="person")
ca_data = data_source.get_data(states=["CA"], download=True)

In [None]:
# Income prediction task set up as a regression problem: PINCP is used as the
# target with no target_transform (the thresholding transform is commented out).
ACSIncomeRegression = BasicProblem(
    features=[
        'AGEP',
        'COW',
        'SCHL',
        'MAR',
        'WKHP',
        'SEX',
        'RAC1P',
    ],
    target='PINCP',
    # target_transform=lambda x: x > 50000,    
    group='RAC1P',
    preprocess=folktables.adult_filter,
    # postprocess=lambda x: np.nan_to_num(x, -1),
)
# Features and target as pandas objects, with the categorical codes replaced via
# ACSIncome_categories. The third return value is not needed here
# (presumably the group column — TODO confirm).
ca_features, ca_labels, _ = ACSIncomeRegression.df_to_pandas(ca_data, categories=ACSIncome_categories)

In [None]:
# Number of real rows kept for the experiments.
n_real = 50000

# Working frame: features plus log-income target, subsampled to n_real rows.
# Built as one chain from ca_features / ca_labels so the cell is idempotent
# and no intermediate frame is mutated in place.
df = (
    pd.concat([ca_features, ca_labels], axis="columns")
    # Age and weekly hours are whole numbers; store them as integers.
    .astype({"AGEP": "int64", "WKHP": "int64"})
    # Model log-income instead of raw income; the raw column is dropped afterwards.
    .assign(log_PINCP=lambda frame: np.log(frame.PINCP))
    .drop(columns=["PINCP"])
    # Fixed seed so the subsample (and the CSVs derived from it) is reproducible.
    .sample(n_real, random_state=42)
)
df

In [None]:
def to_oh_df(df, target_name, orig_cols=None):
    """One-hot encode ``df`` and place the target column last.

    Args:
        df: Frame to encode with ``pd.get_dummies``.
        target_name: Label of the target column.
        orig_cols: Optional reference column list. When given, the encoded
            frame is reindexed to exactly these columns, in this order;
            reference columns missing from the encoding are filled with 0.

    Returns:
        The encoded DataFrame.
    """
    encoded = move_target_col_to_end(pd.get_dummies(df), target_name)
    if orig_cols is None:
        return encoded
    # Align to the reference layout so different frames share identical columns.
    return encoded.reindex(columns=orig_cols, fill_value=0)

# Encode the full sample once so that orig_cols lists every dummy column that
# occurs in it; this list is the reference layout passed to to_oh_df for the splits.
target_name = "log_PINCP"
orig_oh_df = to_oh_df(df, target_name)
orig_cols = list(orig_oh_df.columns)

In [None]:
# Hold out 25% of the sample for evaluation; the fixed seed keeps the split
# (and therefore the saved train/test files) reproducible across runs.
data, test_data = train_test_split(df, test_size=0.25, random_state=42)

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing.
Path("datasets/ACS-2018").mkdir(parents=True, exist_ok=True)
data.to_csv("datasets/ACS-2018/ACS-2018-preprocessed-train.csv", index=False)
test_data.to_csv("datasets/ACS-2018/ACS-2018-preprocessed-test.csv", index=False)
df.to_csv("datasets/ACS-2018.csv", index=False)

In [None]:
# One-hot encode both splits against the shared column list so train and test
# end up with identical columns in identical order.
train_oh = to_oh_df(data, target_name, orig_cols)
test_oh = to_oh_df(test_data, target_name, orig_cols)

# Separate the features from the log-income target.
X_train, y_train = get_X_y(train_oh, target_name)
X_test, y_test = get_X_y(test_oh, target_name)

In [None]:
# Baseline: fit every benchmark model on the real training split and evaluate
# it on the real test split; records are tagged with the origin "real".
real_data_records = train_algos(X_train, y_train, X_test, y_test, "real")
# Tabular copy of the records; plot_results builds its own frame from the raw records.
real_data_result_df = pd.DataFrame.from_records(real_data_records)
plot_results(real_data_records)