In [43]:
# Importing Libraries
import os
from pathlib import Path
import sys
import warnings

import numpy as np
import pandas as pd
from mizani.formatters import percent_format
from plotnine import *
from scipy.stats import logistic
from scipy.stats import norm
from stargazer.stargazer import Stargazer
from patsy import dmatrices
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import log_loss

In [44]:
# Working directory of the running notebook; the helper folder is resolved relative to it
current_path = os.getcwd()

# Add utils folder to sys path.
# The membership check keeps the cell idempotent: re-running it in the same
# kernel no longer appends a duplicate entry to sys.path each time.
utils_path = os.path.join(current_path, "utils")
if utils_path not in sys.path:
    sys.path.append(utils_path)

# Import the prewritten helper functions.
# NOTE(review): the wildcard import is kept because the helper names used by
# later cells are not visible from here; prefer explicit names once known.
from py_helper_functions import *

# Data & Data Cleaning

In [45]:
# Load morg-2014-emp.csv directly from the assignment's GitHub repository
data = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/00Dabide/DA3-Assignment-1/main/morg-2014-emp.csv"
)



In [46]:
# Preview the first five rows of the freshly loaded frame
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,hhid,intmonth,stfips,weight,earnwke,uhours,grade92,race,ethnic,...,ownchild,chldpres,prcitshp,state,ind02,occ2012,class,unionmme,unioncov,lfsr94
0,3,2600310997690,January,AL,3151.6801,1692.0,40,43,1,,...,0,0,"Native, Born In US",63,Employment services (5613),630,"Private, For Profit",No,No,Employed-At Work
1,5,75680310997590,January,AL,3457.1138,450.0,40,41,2,,...,2,6,"Native, Born In US",63,Outpatient care centers (6214),5400,"Private, For Profit",No,No,Employed-Absent
2,6,75680310997590,January,AL,3936.911,1090.0,60,41,2,,...,2,6,"Native, Born In US",63,Motor vehicles and motor vehicle equipment man...,8140,"Private, For Profit",No,No,Employed-At Work
3,10,179140131100930,January,AL,3288.364,769.23,40,40,1,,...,2,4,"Native, Born In US",63,"**Publishing, except newspapers and software (...",8255,"Private, For Profit",Yes,,Employed-At Work
4,11,179140131100930,January,AL,3422.85,826.92,40,43,1,,...,2,4,"Native, Born In US",63,"Banking and related activities (521, 52211,52219)",5940,"Private, For Profit",No,No,Employed-At Work


In [47]:
# Chosen occupation: Personal Care and Service Occupations (occ2012 codes 4300-4650, both ends inclusive)
# .copy() detaches the subset from the full frame, so the column assignments
# below write to an independent DataFrame instead of raising SettingWithCopyWarning.
data = data.loc[data["occ2012"].between(4300, 4650)].copy()

# Create male, wage, lnwage variables
data["male"] = data["sex"] == 1               # boolean flag: True where sex is coded 1
data["w"] = data["earnwke"] / data["uhours"]  # wage: earnwke divided by uhours
data["lnw"] = np.log(data["w"])               # natural log of the wage

# Display the filtered frame with the new columns
data

Unnamed: 0.1,Unnamed: 0,hhid,intmonth,stfips,weight,earnwke,uhours,grade92,race,ethnic,...,state,ind02,occ2012,class,unionmme,unioncov,lfsr94,male,w,lnw
54,128,510365013001006,January,AL,2594.3007,400.00,20,44,1,,...,63,Child day care services (6244),4600,"Private, Nonprofit",No,No,Employed-At Work,False,20.000000,2.995732
141,315,299350627300630,January,AK,415.4125,192.30,40,41,1,,...,94,Child day care services (6244),4600,"Private, For Profit",No,No,Employed-Absent,False,4.807500,1.570177
207,476,299290794630670,January,AK,374.8012,1770.60,60,40,1,,...,94,Administration of human resource programs (923),4610,Government - State,No,No,Employed-At Work,True,29.510000,3.384729
230,518,610923097503049,January,AK,448.3966,36.25,5,37,1,,...,94,Elementary and secondary schools (6111),4600,"Private, Nonprofit",No,No,Employed-At Work,False,7.250000,1.981001
248,553,773019090200142,January,AK,589.5615,680.00,40,40,1,,...,94,Other personal services (8129),4350,Government - Federal,No,No,Employed-At Work,True,17.000000,2.833213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149078,316593,310001705119835,December,WI,3814.6828,345.00,30,43,1,,...,3,Child day care services (6244),4600,"Private, For Profit",No,No,Employed-At Work,False,11.500000,2.442347
149136,316690,615640001130506,December,WI,2977.7663,1923.00,40,39,1,,...,3,Beauty salons (812112),4510,"Private, For Profit",No,No,Employed-At Work,False,48.075000,3.872762
149194,316811,406604685991706,December,WY,266.6387,400.00,36,39,1,,...,8,"Residential care facilities, without nursing (...",4610,"Private, For Profit",No,No,Employed-At Work,False,11.111111,2.407946
149287,316996,210005535615846,December,WY,258.6668,654.00,35,45,1,,...,8,Beauty salons (812112),4510,"Private, For Profit",No,No,Employed-At Work,False,18.685714,2.927759
