In [1]:
import pandas as pd
import numpy as np
import scipy.optimize as opt
from scipy.optimize import minimize
import scipy.stats as stats
import time
import math
from statsmodels.iolib.summary2 import summary_col

In [12]:
#Here we drop all missing values and begin the genesis of variables that will be used in the creation of the lnwage
#variable
data=pd.read_stata('PS3_data.dta')

data = data[data.hlabinc.isnull() !=True] 
data = data[data.hannhrs.isnull() !=True] 
data = data[data.age.isnull() !=True] 
data = data[data.hyrsed.isnull() !=True] 


data['nuhrs'] = data['hannhrs'].where(data['hannhrs'] > 0)
data['wage'] = data['hlabinc']/data['nuhrs']
data['lnwage'] = np.log(data['wage'])   

#This column will allow for the calculation of an intercept (constant)
data['ones'] = 1  


#Having done the above we select only male heads of household in the given age range
#for the given income...
data = data[(data['hsex']==1) & (data['age'] <60) & (data['age'] > 25) & (data['wage'] > 7)] 

#Finally we generate the dummy variables for race...
data['Hispanic'] = (data.hrace == 4)
data['Hispanic'] = data['Hispanic'].replace(False, 0)  
data['Other Races'] = ((data.hrace ==3) & (data.hrace ==5) & (data.hrace==6) & (data.hrace==7))
data['Other Races'].replace(False, 0)
data = pd.get_dummies(data, columns=['hrace'])
data = data.rename(columns={'hrace_1.0': 'White', 'hrace_2.0': 'Black', 'hrace_3.0': 'Native_American'})
data = data.rename(columns={'hyrsed': 'Educ'})
data.head(n=3)

Unnamed: 0,id68,year,intid,relhh,hannhrs,wannhrs,hlabinc,wlabinc,nochild,wrace,...,wyrsed,pce,nuhrs,lnwage,ones,Hispanic,Other Races,White,Black,Native_American
11161,402,1971,1,Head,1523.0,0.0,62928.707031,,0,1.0,...,12.0,0.247121,1523.0,3.72132,1,False,False,1,0,0
11164,461,1971,4,Head,2010.0,0.0,22660.970703,,0,1.0,...,5.0,0.247121,2010.0,2.422509,1,False,False,1,0,0
11173,284,1971,20,Head,2400.0,0.0,76885.4375,,2,1.0,...,12.0,0.247121,2400.0,3.466848,1,False,False,1,0,0


In [19]:
#Now we divide the data into sundry parts based on year such that we can perform year based regressions later

year1 = data[(data['year']==1971)]
year2 = data[(data['year']==1980)]
year3 = data[(data['year']==1990)]
year4 = data[(data['year']==2000)]

In [28]:
#Here we define the function and perform a pooled cross-section regression
def LL(params):
    #Here we define the parameters
    b0 = params[0]
    b1 = params[1]
    b2 = params[2]
    b3 = params[3]
    b4 = params[4]
    sd = params[5]

    # Here we calculate predicted values for lnwage
    yPred = b0 + b1 * data['Educ'] + b2 * data['age'] + b3 * data['Black'] + b4 * data['Other Races'] 

    # This uses our predicted values and such to parameterize the normal pdf and 
    #from thence we make it a log-likelihood function
    loglike = -np.sum( stats.norm.logpdf(data['lnwage'], loc=yPred, scale=sd) )

    # Tell the function to return the NLL for minimization
    return(loglike)

# This list gives initial values for the coefficients and standard deviation    
initParams = [1, -3, 1, 1, 1, 1]

#Here we run the full regression model unseparated by year
results = opt.minimize(LL, initParams, method='Nelder-Mead')

print(results)

 final_simplex: (array([[ 1.39854068,  0.07882528,  0.01427788, -0.16706002,  2.07414773,
         0.49058407],
       [ 1.39852545,  0.07882854,  0.0142773 , -0.16710099,  2.07413204,
         0.49058377],
       [ 1.39851112,  0.07882891,  0.01427768, -0.16709284,  2.07412678,
         0.49058293],
       [ 1.39852133,  0.07882663,  0.01427796, -0.1670648 ,  2.07413594,
         0.49058761],
       [ 1.39852531,  0.0788253 ,  0.01427837, -0.16711253,  2.0741295 ,
         0.49057981],
       [ 1.39863328,  0.07882207,  0.01427685, -0.16708205,  2.07419166,
         0.4905801 ],
       [ 1.39851231,  0.07882581,  0.01427838, -0.16704303,  2.0741395 ,
         0.49057593]]), array([ 38512.86258476,  38512.86258696,  38512.86258794,  38512.86258822,
        38512.86259712,  38512.86259817,  38512.86259961]))
           fun: 38512.862584762355
       message: 'Optimization terminated successfully.'
          nfev: 845
           nit: 526
        status: 0
       success: True
           

Thus, the coefficient on $Educ_{i}$ is about 0.0788 which means that a one year change in education is associated with a $7.88\%$ increase in wages

In [32]:
#We now perform each regression by year... 
#1971
def LL(params):
    #Here we define the parameters
    b0 = params[0]
    b1 = params[1]
    b2 = params[2]
    b3 = params[3]
    b4 = params[4]
    sd = params[5]

    # Here we calculate predicted values for lnwage
    yPred = b0 + b1 * year1['Educ'] + b2 * year1['age'] + b3 * year1['Black'] + b4 * year1['Other Races'] 

    # This uses our predicted values and such to parameterize the normal pdf and 
    #from thence we make it a log-likelihood function
    loglike = -np.sum( stats.norm.logpdf(year1['lnwage'], loc=yPred, scale=sd) )

    # Tell the function to return the NLL for minimization
    return(loglike)

# This list gives initial values for the coefficients and standard deviation    
initParams = [1, -3, 1, 1, 1, 1]

#Here we run the full regression model unseparated by year
results = opt.minimize(LL, initParams, method='Nelder-Mead')

print(results)

 final_simplex: (array([[ 1.60527891,  0.06665266,  0.01338334, -0.17670229,  0.8778291 ,
         0.41220197],
       [ 1.60527482,  0.06665237,  0.01338346, -0.1767079 ,  0.87781499,
         0.41220163],
       [ 1.60526141,  0.06665266,  0.01338383, -0.17670676,  0.87781278,
         0.41219209],
       [ 1.605286  ,  0.06665311,  0.01338304, -0.17669869,  0.87784293,
         0.41220014],
       [ 1.60527146,  0.06665458,  0.01338309, -0.17665702,  0.87791837,
         0.41220086],
       [ 1.60527399,  0.06665319,  0.01338338, -0.17672748,  0.87777145,
         0.41220522],
       [ 1.60523517,  0.0666546 ,  0.01338368, -0.17668976,  0.87782478,
         0.41220254]]), array([ 678.67465269,  678.67465984,  678.6746627 ,  678.67467215,
        678.67467453,  678.67467599,  678.67467606]))
           fun: 678.67465268823685
       message: 'Optimization terminated successfully.'
          nfev: 785
           nit: 493
        status: 0
       success: True
             x: array([ 1

Here, the coefficient on $Educ_{i}$ is about 0.0667 which means that a one year change in education is associated with a $6.67\%$ increase in wages

In [33]:
#The year Back in Black came out...1980
def LL(params):
    #Here we define the parameters
    b0 = params[0]
    b1 = params[1]
    b2 = params[2]
    b3 = params[3]
    b4 = params[4]
    sd = params[5]

    # Here we calculate predicted values for lnwage
    yPred = b0 + b1 * year2['Educ'] + b2 * year2['age'] + b3 * year2['Black'] + b4 * year2['Other Races'] 

    # This uses our predicted values and such to parameterize the normal pdf and 
    #from thence we make it a log-likelihood function
    loglike = -np.sum( stats.norm.logpdf(year2['lnwage'], loc=yPred, scale=sd) )

    # Tell the function to return the NLL for minimization
    return(loglike)

# This list gives initial values for the coefficients and standard deviation    
initParams = [1, -3, 1, 1, 1, 1]

#Here we run the full regression model unseparated by year
results = opt.minimize(LL, initParams, method='Nelder-Mead')

print(results)

 final_simplex: (array([[ 1.61628802,  0.06749945,  0.01282822, -0.09049344,  8.30637729,
         0.45037927],
       [ 1.61631709,  0.06749784,  0.012828  , -0.09047423,  8.30631314,
         0.45037713],
       [ 1.6163427 ,  0.0674966 ,  0.0128276 , -0.09049403,  8.30632791,
         0.45037776],
       [ 1.61633234,  0.0674973 ,  0.01282797, -0.09049584,  8.30634809,
         0.45037506],
       [ 1.61633219,  0.06749637,  0.01282817, -0.09050717,  8.30637919,
         0.45037184],
       [ 1.61631259,  0.0674984 ,  0.01282794, -0.09049839,  8.30636953,
         0.45037683],
       [ 1.61634134,  0.06749688,  0.01282752, -0.09047954,  8.30630576,
         0.45037525]]), array([ 1087.84196114,  1087.84197113,  1087.84198465,  1087.84199155,
        1087.84199188,  1087.84199658,  1087.84199729]))
           fun: 1087.8419611376169
       message: 'Optimization terminated successfully.'
          nfev: 895
           nit: 565
        status: 0
       success: True
             x: ar

Thus, the coefficient on $Educ_{i}$ is about 0.0675 which means that a one year change in education is associated with a $6.75\%$ increase in wages

In [34]:
#1990...some of Chuck D's finest...
def LL(params):
    #Here we define the parameters
    b0 = params[0]
    b1 = params[1]
    b2 = params[2]
    b3 = params[3]
    b4 = params[4]
    sd = params[5]

    # Here we calculate predicted values for lnwage
    yPred = b0 + b1 * year3['Educ'] + b2 * year3['age'] + b3 * year3['Black'] + b4 * year3['Other Races'] 

    # This uses our predicted values and such to parameterize the normal pdf and 
    #from thence we make it a log-likelihood function
    loglike = -np.sum( stats.norm.logpdf(year3['lnwage'], loc=yPred, scale=sd) )

    # Tell the function to return the NLL for minimization
    return(loglike)

# This list gives initial values for the coefficients and standard deviation    
initParams = [1, -3, 1, 1, 1, 1]

#Here we run the full regression model unseparated by year
results = opt.minimize(LL, initParams, method='Nelder-Mead')

print(results)

 final_simplex: (array([[ 1.06561974,  0.09931128,  0.01432541, -0.16615014,  3.16463112,
         0.48075896],
       [ 1.06559356,  0.09930873,  0.01432673, -0.16610666,  3.16463188,
         0.48077056],
       [ 1.06553054,  0.09931416,  0.01432687, -0.16608664,  3.16468851,
         0.48076246],
       [ 1.06565911,  0.09930396,  0.01432739, -0.1661288 ,  3.16458953,
         0.48074487],
       [ 1.06565452,  0.09930941,  0.01432549, -0.16615931,  3.16460187,
         0.48075781],
       [ 1.06556066,  0.0993186 ,  0.01432392, -0.16608491,  3.16466727,
         0.48075398],
       [ 1.06571036,  0.09930816,  0.0143242 , -0.16612993,  3.16454309,
         0.48074923]]), array([ 1340.87389958,  1340.87390473,  1340.87390659,  1340.87390733,
        1340.87390755,  1340.87391052,  1340.87391114]))
           fun: 1340.8738995811332
       message: 'Optimization terminated successfully.'
          nfev: 778
           nit: 489
        status: 0
       success: True
             x: ar

Thus, the coefficient on $Educ_{i}$ is about 0.0993 which means that a one year change in education is associated with a $9.93\%$ increase in wages

In [35]:
#From Y2K to judgement day...
def LL(params):
    #Here we define the parameters
    b0 = params[0]
    b1 = params[1]
    b2 = params[2]
    b3 = params[3]
    b4 = params[4]
    sd = params[5]

    # Here we calculate predicted values for lnwage
    yPred = b0 + b1 * year4['Educ'] + b2 * year4['age'] + b3 * year4['Black'] + b4 * year4['Other Races'] 

    # This uses our predicted values and such to parameterize the normal pdf and 
    #from thence we make it a log-likelihood function
    loglike = -np.sum( stats.norm.logpdf(year4['lnwage'], loc=yPred, scale=sd) )

    # Tell the function to return the NLL for minimization
    return(loglike)

# This list gives initial values for the coefficients and standard deviation    
initParams = [1, -3, 1, 1, 1, 1]

#Here we run the full regression model unseparated by year
results = opt.minimize(LL, initParams, method='Nelder-Mead')

print(results)

 final_simplex: (array([[ 1.16074553,  0.1099366 ,  0.0108106 , -0.25116354, -0.72375724,
         0.54075505],
       [ 1.16073655,  0.10993785,  0.0108104 , -0.25113951, -0.72377021,
         0.5407713 ],
       [ 1.16071883,  0.10994003,  0.01080993, -0.25114931, -0.7237873 ,
         0.54075814],
       [ 1.1606691 ,  0.10993738,  0.01081198, -0.25114054, -0.72383163,
         0.54076127],
       [ 1.16068123,  0.10993748,  0.01081171, -0.25117546, -0.72381212,
         0.54076083],
       [ 1.16080296,  0.10993137,  0.01081077, -0.25112849, -0.72371021,
         0.54076137],
       [ 1.16074872,  0.10993583,  0.01081052, -0.25117094, -0.72375195,
         0.54075878]]), array([ 2012.80654056,  2012.80654492,  2012.80655308,  2012.80655409,
        2012.80656392,  2012.80656451,  2012.80657879]))
           fun: 2012.8065405638831
       message: 'Optimization terminated successfully.'
          nfev: 802
           nit: 498
        status: 0
       success: True
             x: ar

Thus, the coefficient on $Educ_{i}$ is about 0.1099 which means that a one year change in education is associated with a $10.99\%$ increase in wages

The rise in coefficients over time is what one would expect. Growth theorist have shown in their empirical work that the college education premium is rising; thus, we would expect to see an additional year of education increase ones earnings. 