# Import

In [1]:
import os
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor
import sklearn

from helper import *

In [3]:
# Columns passed to the helper functions as the numeric (non-categorical) features;
# print_features reports these as "NUM" and the remaining columns as "CAT".
# Every name appears exactly once so that any selection by this list cannot
# yield repeated columns.
# NOTE(review): 'REARNING' is also the target given to the training helpers —
# presumably they exclude it from the model inputs; confirm in helper.
NUM_FEATURES = [
    'AGE',
    'YEARWRK',
    'FERTIL',
    'HOUR89',
    'HOURS',
    'RIDERS',
    'WEEK89',
    'TRAVTIME',
    'YRSSERV',
    'REARNING',
    'INCOME1',
    'PWGT1',
    'DEPART',
]

# Training Data

In [4]:
# Load the metadata from the current working directory.
# load_data (presumably provided by the star import from helper) returns two
# values, unpacked here as df_meta and all_codes.
df_meta, all_codes = load_data(
    r"./",
    verbose=False,
)

WORK89


In [None]:
# Load the cleaned census CSV into `df`; every later cell in this notebook reads `df`,
# so this cell has to run first on a fresh kernel.
# NOTE(review): age_range / wage_range presumably restrict the rows by age and by
# the target value, and the meaning of `retain` is not visible here — confirm in helper.
df = load_data_hl(
    fpath='./us_census_data_cleaned_6.csv',
    target='REARNING',
    age_range=[18, 90],
    wage_range=[5000, 140000],
    retain=50,
)

In [5]:
# Print one summary line per column: its name, whether it is numeric (NUM) or
# categorical (CAT), and a description — see the recorded output below.
# NOTE(review): the number shown for CAT rows is presumably the category count — confirm.
print_features(df, NUM_FEATURES, df_meta)

AGE                  - NUM -      - Age
ANCSTRY1             - CAT - 255  - Ancestry First Entry See Appendix I Ance
ANCSTRY2             - CAT - 192  - Ancestry Second Entry See Appendix I Anc
CITIZEN              - CAT - 2    - Citizenship
CLASS                - CAT - 8    - Class of Worker
DEPART               - NUM -      - Time of Departure for Work Hour and Minu
DISABL1              - CAT - 2    - Work Limitation Stat.
ENGLISH              - CAT - 5    - Ability to Speak English
FEB55                - CAT - 2    - Served February 1955 July 1964
FERTIL               - NUM -      - No. of Chld. Ever Born
HISPANIC             - CAT - 56   - Detailed Hispanic Origin Code See Append
HOUR89               - NUM -      - Usual Hrs. Worked Per Week Last Yr. 1989
HOURS                - NUM -      - Hrs. Worked Last Week
IMMIGR               - CAT - 11   - Yr. of Entry
INDUSTRY             - CAT - 243  - Ind. See Appendix I Ind..lst
KOREAN               - CAT - 2    - Served Korean Conflict

## Training: Linear Models

In [6]:
# Run the linear-regression training helper on the full frame with 'REARNING' as target.
# NOTE(review): the recorded output of this cell ends with
# "NameError: name 'plt' is not defined". This notebook imports matplotlib.pyplot as plt,
# so the error presumably comes from inside helper — confirm that helper imports it.
model = train_linear_regression(
    df,
    target_feature='REARNING',
    keep_features=None,
    drop_features=['MIGSTATE', 'POWSTATE', 'MULTILINGUAL', 'LANG1'],
    dummy_features=['MARITAL'],
    reduce_cardinality=False,
    n_repeats=10,
    test_split=0.05,
    encode_cats=True,
    scale=True,
    verbose=True,
    NUM_FEATURES=NUM_FEATURES,
    df_meta=df_meta,
)

AGE                  - NUM -      - Age
ANCSTRY1             - CAT - 255  - Ancestry First Entry See Appendix I Ance
ANCSTRY2             - CAT - 192  - Ancestry Second Entry See Appendix I Anc
CITIZEN              - CAT - 2    - Citizenship
CLASS                - CAT - 8    - Class of Worker
DEPART               - NUM -      - Time of Departure for Work Hour and Minu
DISABL1              - CAT - 2    - Work Limitation Stat.
ENGLISH              - CAT - 5    - Ability to Speak English
FEB55                - CAT - 2    - Served February 1955 July 1964
FERTIL               - NUM -      - No. of Chld. Ever Born
HISPANIC             - CAT - 56   - Detailed Hispanic Origin Code See Append
HOUR89               - NUM -      - Usual Hrs. Worked Per Week Last Yr. 1989
HOURS                - NUM -      - Hrs. Worked Last Week
IMMIGR               - CAT - 11   - Yr. of Entry
INDUSTRY             - CAT - 243  - Ind. See Appendix I Ind..lst
KOREAN               - CAT - 2    - Served Korean Conflict

NameError: name 'plt' is not defined

## Training: Decision Tree Models

In [7]:
# Run the decision-tree training helper on the first 100000 rows of `df` only,
# with 'REARNING' as target. This rebinds `model`, replacing the linear model above.
# NOTE(review): model_type='grad' presumably selects a gradient-boosting estimator
# configured by model_params — confirm in helper.
model = train_decision_trees(
    df.iloc[:100000, :],
    target_feature='REARNING',
    keep_features=None,
    drop_features=[
        'MIGSTATE',
        'POWSTATE',
        'OCCUP_JOB',
        'LANG2',
        'MIGPUMA',
        'POWPUMA',
        'INDUSTRY_CAT',
    ],
    n_repeats=1,
    model_type='grad',
    model_params={
        "n_estimators": 200,
        "max_depth": 9,
        "learning_rate": 0.25,
    },
    reduce_card=True,
    test_split=0.05,
    encode_cats=True,
    scale=False,
    verbose=True,
    NUM_FEATURES=NUM_FEATURES,
    df_meta=df_meta,
)

CITIZEN              - CAT - 2    - Citizenship
FERTIL               - NUM -      - No. of Chld. Ever Born
OCCUP                - CAT - 492  - Occupation See Appendix I Occup.lst
HOURS                - NUM -      - Hrs. Worked Last Week
POB_COUNTRY          - CAT - 134  - 
POB                  - CAT - 207  - Place of Birth Appendix I Birth.lst Unit
YEARSCH              - CAT - 17   - Ed. Attainment
LANG1                - CAT - 2    - Language Other Than English At Home
FEB55                - CAT - 2    - Served February 1955 July 1964
RVETSERV             - CAT - 12   - Veteran Per. of Srvc.
IMMIGR               - CAT - 11   - Yr. of Entry
KOREAN               - CAT - 2    - Served Korean Conflict June 1950 January
MAY75880             - CAT - 2    - Served May 1975 to August 1980
TRAVTIME             - NUM -      - Travel Time to Work
POB_USA              - CAT - 2    - 
RACE                 - CAT - 3    - Recoded Detailed Race Code Appendix C Ra
VIETNAM              - CAT - 2    - Se

## Training: Neural Network

In [None]:
# Run the neural-network training helper on the full frame.
# The target is given as the literal 'REARNING' — the same target the other training
# cells use — because this notebook never defines a `target_feature` variable, so a
# bare name in this position raises NameError on a fresh kernel. It is kept as the
# second positional argument so it binds to the same parameter as before.
# NOTE(review): unlike the other training calls, NUM_FEATURES and df_meta are not
# passed here, and keep_features is [] rather than None — confirm against the
# helper's signature that this is intended.
model = train_neural_network(
    df,
    'REARNING',
    keep_features=[],
    drop_features=['MIGSTATE', 'POWSTATE'],
    model_params={"random_state": 1, "max_iter": 300, "learning_rate_init": 0.01},
    n_repeats=10,
    test_split=0.05,
    encode_cats=True,
    scale=True,
    verbose=True,
)

## Testing

In [9]:
def test_model_global():
    """Placeholder for global model testing; not implemented yet.

    Takes no arguments, performs no work, and returns None.
    """
    return None

In [10]:
def test_model_local():
    """Placeholder for local model testing; not implemented yet.

    Takes no arguments, performs no work, and returns None.
    """
    return None