In [1]:
from folktables import ACSDataSource, ACSEmployment, BasicProblem, adult_filter
import numpy as np

STATE = "MN"

data_source = ACSDataSource(survey_year='2018', 
                            horizon='1-Year', 
                            survey='person')

acs_data = data_source.get_data(states=[STATE], download=True)

acs_data.head()

Downloading data for 2018 1-Year person survey for MN...


Unnamed: 0,RT,SERIALNO,DIVISION,SPORDER,PUMA,REGION,ST,ADJINC,PWGTP,AGEP,...,PWGTP71,PWGTP72,PWGTP73,PWGTP74,PWGTP75,PWGTP76,PWGTP77,PWGTP78,PWGTP79,PWGTP80
0,P,2018GQ0000054,4,1,1900,2,27,1013097,23,36,...,6,25,24,40,22,21,24,25,41,21
1,P,2018GQ0000077,4,1,2300,2,27,1013097,74,56,...,72,91,144,76,19,17,19,83,68,125
2,P,2018GQ0000094,4,1,1410,2,27,1013097,11,21,...,18,9,3,2,2,11,17,2,11,9
3,P,2018GQ0000141,4,1,1304,2,27,1013097,13,20,...,0,30,17,22,24,13,12,12,19,0
4,P,2018GQ0000370,4,1,900,2,27,1013097,28,46,...,27,2,2,2,30,25,26,26,27,2


In [40]:
possible_features=['AGEP', 'SCHL', 'MAR', 'RELP', 'DIS', 'ESP', 'CIT', 'MIG', 'MIL', 'ANC', 'NATIVITY', 'DEAR', 'DEYE', 'DREM', 'SEX', 'RAC1P', 'ESR']
acs_data[possible_features].head()

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RAC1P,ESR
0,36,17.0,5,16,2,,1,3.0,4.0,1,1,2,2,2.0,1,1,6.0
1,56,17.0,5,16,2,,1,1.0,4.0,1,1,2,2,2.0,1,5,6.0
2,21,19.0,5,17,1,,1,3.0,4.0,4,1,2,1,2.0,2,1,6.0
3,20,15.0,5,16,1,,1,1.0,4.0,4,1,2,2,1.0,2,1,6.0
4,46,24.0,1,17,2,,1,1.0,4.0,1,1,2,2,2.0,2,1,1.0


In [63]:
features_to_use = [f for f in possible_features if f not in ["ESR", "RAC1P"]]

In [64]:
EmploymentProblem = BasicProblem(
    features=features_to_use,
    target='ESR',
    target_transform=lambda x: x == 1,
    group='RAC1P',
    preprocess=lambda x: x,
    postprocess=lambda x: np.nan_to_num(x, -1),
)

features, label, group = EmploymentProblem.df_to_numpy(acs_data)

In [65]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
    features, label, group, test_size=0.2, random_state=0)

In [87]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn import tree

model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

model_tree = tree.DecisionTreeClassifier(max_depth = 2)
model_tree.fit(X_train, y_train)


In [88]:
y_hat = model_tree.predict(X_test)
(y_hat == y_test).mean()

0.8228018284485077

In [89]:
(y_hat == y_test)[group_test == 1].mean()

0.8273755882647442

In [90]:
(y_hat == y_test)[group_test == 2].mean()

0.764179104477612

In [92]:
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(model_tree, X_test, y_test, cv=5)
cv_scores

array([0.82526882, 0.8328853 , 0.81667414, 0.8283281 , 0.80905424])

In [106]:
import pandas as pd
df = pd.DataFrame(X_train, columns = features_to_use)
df["RACE"] = group_train
df["ESR_Label"] = y_train
df

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RACE,ESR_Label
0,32.0,20.0,5.0,0.0,2.0,0.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,2.0,1.0,1,True
1,61.0,21.0,1.0,0.0,2.0,0.0,1.0,1.0,4.0,2.0,1.0,2.0,2.0,2.0,1.0,1,True
2,61.0,20.0,1.0,0.0,1.0,0.0,1.0,1.0,4.0,1.0,1.0,1.0,2.0,2.0,2.0,1,True
3,76.0,17.0,1.0,0.0,2.0,0.0,1.0,1.0,3.0,2.0,1.0,2.0,2.0,2.0,1.0,1,False
4,12.0,8.0,5.0,2.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,2.0,2.0,2.0,2.0,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44621,46.0,19.0,3.0,0.0,2.0,0.0,4.0,1.0,4.0,1.0,2.0,2.0,2.0,2.0,2.0,2,True
44622,57.0,16.0,1.0,1.0,2.0,0.0,1.0,1.0,3.0,1.0,1.0,2.0,2.0,2.0,1.0,1,True
44623,31.0,19.0,1.0,0.0,2.0,0.0,1.0,1.0,4.0,4.0,1.0,2.0,2.0,2.0,1.0,1,True
44624,32.0,20.0,5.0,0.0,2.0,0.0,1.0,1.0,4.0,2.0,1.0,2.0,2.0,2.0,2.0,1,True


### Questions about above data table:
1: How many individuals are in the dataframe?  

    44,625  
    
2: What proportion have a target label equal to 1?

    22,230/44,625 = 49.8%
    
3: How many individuals are in each group?   


    40,029 individuals are in group 1
    1,425 individuals are in group 2
    2,661 individuals are in all other groups
    
4. In each group, what proportion of individuals have a target label equal to 1?  

    Group 1: 50.84%  
    Group 2: 39.58%  
    All other groups: 41.39%
5. 

    

In [151]:
proportion = (df["ESR_Label"].mean())
proportion

0.49814009770089185

In [128]:
df.loc[df["RACE"] >= 3, "RACE"] = 3
df

Unnamed: 0,AGEP,SCHL,MAR,RELP,DIS,ESP,CIT,MIG,MIL,ANC,NATIVITY,DEAR,DEYE,DREM,SEX,RACE,ESR_Label
0,32.0,20.0,5.0,0.0,2.0,0.0,1.0,3.0,4.0,1.0,1.0,2.0,2.0,2.0,1.0,1,True
1,61.0,21.0,1.0,0.0,2.0,0.0,1.0,1.0,4.0,2.0,1.0,2.0,2.0,2.0,1.0,1,True
2,61.0,20.0,1.0,0.0,1.0,0.0,1.0,1.0,4.0,1.0,1.0,1.0,2.0,2.0,2.0,1,True
3,76.0,17.0,1.0,0.0,2.0,0.0,1.0,1.0,3.0,2.0,1.0,2.0,2.0,2.0,1.0,1,False
4,12.0,8.0,5.0,2.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,2.0,2.0,2.0,2.0,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44621,46.0,19.0,3.0,0.0,2.0,0.0,4.0,1.0,4.0,1.0,2.0,2.0,2.0,2.0,2.0,2,True
44622,57.0,16.0,1.0,1.0,2.0,0.0,1.0,1.0,3.0,1.0,1.0,2.0,2.0,2.0,1.0,1,True
44623,31.0,19.0,1.0,0.0,2.0,0.0,1.0,1.0,4.0,4.0,1.0,2.0,2.0,2.0,1.0,1,True
44624,32.0,20.0,5.0,0.0,2.0,0.0,1.0,1.0,4.0,2.0,1.0,2.0,2.0,2.0,2.0,1,True


In [163]:
df2 = pd.DataFrame()
df2.loc[:, "Total_num"] = df.groupby('RACE')[['RACE']].aggregate([len]).round(2)
df2.loc[:, "Num_Employed"] = df.groupby('RACE')[['ESR_Label']].sum()
df2.loc[:, "proportion"] = df.groupby('RACE')[['ESR_Label']].mean()
df2

Unnamed: 0_level_0,Total_num,Num_Employed,proportion
RACE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,40029,20353,0.508456
2,1425,564,0.395789
3,3172,1313,0.413934
