In [471]:
import pandas
import numpy
from scipy import stats

## Hybrid sample

In [472]:
# Read the hybrid sample, a tab-separated file of job postings.
# NOTE(review): absolute local path; the notebook only runs on a machine with this layout.
filepath = 'D:/BG/Data/Processing/4_data_hybrid.txt'
hybrid = pandas.read_csv(filepath, sep='\t')

# One row per posting, so the row count is the sample size.
print(len(hybrid),'job postings in hybrid sample')

# Display the available columns as the cell output.
hybrid.columns

212821 job postings in hybrid sample


Index(['BGTJobId', 'JobDate', 'CleanTitle', 'CanonTitle', 'SOC', 'SOCName',
       'ONET', 'ONETName', 'Employer', 'Sector', 'SectorName', 'NAICS3',
       'NAICS4', 'NAICS5', 'NAICS6', 'City', 'County', 'State', 'Lat', 'Lon',
       'Edu', 'MaxEdu', 'Degree', 'MaxDegree', 'Exp', 'MaxExp', 'MinSalary',
       'MaxSalary', 'MinHrlySalary', 'MaxHrlySalary', 'PayFrequency',
       'SalaryType', 'JobHours', 'SOC2', 'SOC4', 'SOC5', 'SK',
       'Employer_clean', 'n_terms', 'SK_without', 'TECH', 'occupation',
       'firm1', 'firm2', 'firm', 'plant', 'plant_all', 'plant_5',
       'development', 'materials', 'design', 'inventory', 'tooling',
       'automation', 'production', 'maintenance', 'technical',
       'administrative', 'management', 'finance', 'business', 'data',
       'software', 'office', 'ml', 'cognitive', 'creativity', 'social',
       'character', 'customer', 'writing', 'nonroutine analytic',
       'nonroutine manual', 'routine cognitive', 'routine manual',
       'sequential

In [473]:
# Work on a copy so `hybrid` stays untouched and this cell can be re-run safely.
mydf = hybrid.copy()

# Number of comma-separated entries in each posting's SK_without string.
mydf['Count of Terms'] = [len(terms.split(',')) for terms in mydf['SK_without'].tolist()]

# Columns that are rescaled below and compared between AM and TM in later cells.
annotation = ['nonroutine analytic','nonroutine manual','routine cognitive','routine manual','reciprocal','sequential']

# Divide every annotation column, row by row, by the posting's term count.
# presumably the raw columns hold per-posting term counts, making the result a share; verify upstream
mydf[annotation] = mydf[annotation].div(mydf['Count of Terms'], axis=0)

## Statistics for all occupations

In [474]:
# Build `table1`: for every annotation column, the AM and TM group means and
# standard deviations over all postings, their difference, the group sizes and
# the p-value of a Welch t-test, followed by two rows with the number of
# distinct postings and plants per TECH group.
df = mydf[['TECH'] + annotation]
am_rows = df.TECH == 'AM'
tm_rows = df.TECH == 'TM'

# Welch's t-test (equal_var=False) of AM vs. TM for each annotation; keep only the p-value.
t = []
for a in annotation:
  pval = stats.ttest_ind(df.loc[am_rows,a],df.loc[tm_rows,a],equal_var=False)[1]
  t.append([a,pval])
tdf = pandas.DataFrame(t,columns=['annotation','p-val'])
tdf.set_index('annotation',inplace=True)

# Group means, transposed so that annotations are rows and TECH groups are columns.
mymean = df.groupby(by=['TECH']).mean().T
mymean.index.name = 'annotation'
mymean['AM-TM'] = mymean['AM'] - mymean['TM']

# Group sizes (row counts), repeated on every annotation row.
mymean['N-AM'] = int(am_rows.sum())
mymean['N-TM'] = int(tm_rows.sum())

table1 = pandas.merge(left=mymean,right=tdf,how='inner',on=['annotation'])
table1['statistics'] = 'Mean'
table1.set_index('statistics',append=True,inplace=True)

# Population standard deviations (ddof=0), indexed the same way as the means.
mysd = df.groupby(by=['TECH']).std(ddof=0).T
mysd.index.name = 'annotation'
mysd['statistics'] = 'SD'
mysd.set_index('statistics',append=True,inplace=True)
mysd['p-val'] = numpy.nan

# Interleave Mean/SD rows per annotation, then restore the order given in `annotation`.
# 'mergesort' is the documented name of this sort kind.
table1 = (
  pandas.concat([table1,mysd],axis=0)
  .sort_index(kind='mergesort')
  .reindex(annotation,axis=0,level='annotation')
)
table1.columns = pandas.MultiIndex.from_product([['All'],table1.columns])

# Distinct postings / plants per TECH group. The column is selected before
# nunique() so that distinct counts are not computed for every other column.
postings = mydf.groupby(['TECH'])[['BGTJobId']].nunique().T
postings.rename_axis([None],axis=1,inplace=True)
postings.columns = pandas.MultiIndex.from_product([['All'],postings.columns])
postings.index = pandas.MultiIndex.from_product([['Number of job postings'],['']])
plants = mydf.groupby(['TECH'])[['plant']].nunique().T
plants.rename_axis([None],axis=1,inplace=True)
plants.columns = pandas.MultiIndex.from_product([['All'],plants.columns])
plants.index = pandas.MultiIndex.from_product([['Number of plants'],['']])
table1 = pandas.concat([table1,postings,plants],axis=0)
table1

Unnamed: 0_level_0,Unnamed: 1_level_0,All,All,All,All,All,All
Unnamed: 0_level_1,Unnamed: 1_level_1,AM,AM-TM,N-AM,N-TM,TM,p-val
annotation,statistics,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
nonroutine analytic,Mean,0.111699,0.030014,4638.0,208183.0,0.081685,9.561266e-83
nonroutine analytic,SD,0.10288,,,,0.103441,
nonroutine manual,Mean,0.006377,-0.004985,4638.0,208183.0,0.011362,1.6127889999999999e-34
nonroutine manual,SD,0.026801,,,,0.041327,
routine cognitive,Mean,0.002474,-0.001511,4638.0,208183.0,0.003985,1.303517e-13
routine cognitive,SD,0.013493,,,,0.02115,
routine manual,Mean,0.052322,0.007693,4638.0,208183.0,0.04463,6.325611e-11
routine manual,SD,0.079035,,,,0.081659,
reciprocal,Mean,0.00729,0.000666,4638.0,208183.0,0.006624,0.07034271
reciprocal,SD,0.024765,,,,0.025319,


## Statistics by occupation

In [475]:
# Build `table2`: the same AM vs. TM statistics as `table1`, computed
# separately within each occupation listed in `occupations`. Columns end up as
# (occupation, statistic); rows as (annotation, 'Mean' | 'SD') plus two count rows.
occupations = ['Manager','Engineer','Technician','Operator']
df = mydf[['TECH','occupation'] + annotation]
am_rows = df.TECH == 'AM'
tm_rows = df.TECH == 'TM'

# Welch's t-test (equal_var=False) of AM vs. TM per annotation and occupation; keep the p-value.
t = []
for a in annotation:
  for o in occupations:
    occ_rows = df.occupation == o
    pval = stats.ttest_ind(df.loc[am_rows & occ_rows,a],df.loc[tm_rows & occ_rows,a],equal_var=False)[1]
    t.append([a,o,pval])
tdf = pandas.DataFrame(t,columns=['annotation','occupations','p-val'])
tdf.set_index(['annotation','occupations'],inplace=True)

# Means per (occupation, TECH), reshaped to rows=(annotation, occupation), columns=TECH.
mymean = df.groupby(by=['occupation','TECH']).mean().unstack(level=0).T
mymean.index.names = ['annotation','occupations']
mymean['AM-TM'] = mymean['AM'] - mymean['TM']

# Group sizes (row counts) per occupation and TECH, written to every annotation
# row of that occupation. All 'N-AM' cells are filled before 'N-TM'.
for tech in ['AM','TM']:
  for o in occupations:
    n_rows = len(mydf.loc[(mydf.TECH==tech)&(mydf.occupation==o)])
    mymean.loc[pandas.IndexSlice[:,o],'N-'+tech] = n_rows

table2 = pandas.merge(left=mymean,right=tdf,how='inner',on=['annotation','occupations'])
table2['statistics'] = 'Mean'
table2.set_index('statistics',append=True,inplace=True)

# Population standard deviations (ddof=0), indexed the same way as the means.
mysd = df.groupby(by=['occupation','TECH']).std(ddof=0).unstack(level=0).T
mysd.index.names = ['annotation','occupations']
mysd['statistics'] = 'SD'
mysd.set_index('statistics',append=True,inplace=True)
mysd['p-val'] = numpy.nan

# Interleave Mean/SD rows, restore the annotation order, then move the
# occupations into the outer column level in the order given by `occupations`.
# 'mergesort' is the documented name of this sort kind.
table2 = (
  pandas.concat([table2,mysd],axis=0)
  .sort_index(kind='mergesort')
  .reindex(annotation,axis=0,level='annotation')
  .unstack('occupations')
  .swaplevel(0,1,axis=1)
  .reindex(occupations,axis=1,level=0)
)
table2.rename_axis([None,None],axis=1,inplace=True)

# Distinct postings / plants per (occupation, TECH). The column is selected
# before nunique() so that distinct counts are not computed for every other column.
postings = mydf.groupby(['occupation','TECH'])[['BGTJobId']].nunique().reindex(occupations, level='occupation').T
postings.rename_axis([None,None],axis=1,inplace=True)
postings.index = pandas.MultiIndex.from_product([['Number of job postings'],['']])
plants = mydf.groupby(['occupation','TECH'])[['plant']].nunique().reindex(occupations, level='occupation').T
plants.rename_axis([None,None],axis=1,inplace=True)
plants.index = pandas.MultiIndex.from_product([['Number of plants'],['']])
table2 = pandas.concat([table2,postings,plants],axis=0).reindex(occupations,axis=1,level=0)
table2

Unnamed: 0_level_0,Unnamed: 1_level_0,Manager,Manager,Manager,Manager,Manager,Manager,Engineer,Engineer,Engineer,Engineer,...,Technician,Technician,Technician,Technician,Operator,Operator,Operator,Operator,Operator,Operator
Unnamed: 0_level_1,Unnamed: 1_level_1,AM,AM-TM,N-AM,N-TM,TM,p-val,AM,AM-TM,N-AM,N-TM,...,N-AM,N-TM,TM,p-val,AM,AM-TM,N-AM,N-TM,TM,p-val
annotation,statistics,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
nonroutine analytic,Mean,0.0741,0.014442,981.0,88819.0,0.059658,2.306313e-07,0.145064,0.013902,2704.0,78324.0,...,457.0,10791.0,0.062106,0.3672906,0.046336,0.021099,496.0,30249.0,0.025237,1.266931e-09
nonroutine analytic,SD,0.086409,,,,0.077532,,0.101577,,,,...,,,0.100323,,0.075469,,,,0.06111,
nonroutine manual,Mean,0.000658,-0.002921,981.0,88819.0,0.003579,9.580863e-32,0.003828,-0.00234,2704.0,78324.0,...,457.0,10791.0,0.033101,1.149797e-09,0.020158,-0.019749,496.0,30249.0,0.039907,1.162507e-16
nonroutine manual,SD,0.007233,,,,0.021279,,0.016832,,,,...,,,0.060746,,0.050369,,,,0.075904,
routine cognitive,Mean,0.001233,-0.000231,981.0,88819.0,0.001465,0.4281779,0.002354,-0.002145,2704.0,78324.0,...,457.0,10791.0,0.007098,0.00174891,0.003945,-0.004997,496.0,30249.0,0.008942,3.924361e-07
routine cognitive,SD,0.009065,,,,0.011316,,0.011899,,,,...,,,0.03039,,0.02121,,,,0.033687,
routine manual,Mean,0.052769,0.028176,981.0,88819.0,0.024593,4.2207550000000005e-23,0.036584,0.00136,2704.0,78324.0,...,457.0,10791.0,0.083249,0.001350557,0.093133,-0.020909,496.0,30249.0,0.114042,2.512526e-05
routine manual,SD,0.086721,,,,0.050584,,0.053395,,,,...,,,0.102804,,0.108009,,,,0.136372,
reciprocal,Mean,0.014263,0.004018,981.0,88819.0,0.010245,0.0004904876,0.005181,0.000814,2704.0,78324.0,...,457.0,10791.0,0.002766,0.0001852958,0.004783,0.001574,496.0,30249.0,0.003209,0.0697977
reciprocal,SD,0.035819,,,,0.030729,,0.019372,,,,...,,,0.015777,,0.019143,,,,0.017338,


## Final table

In [476]:
# Combine the all-occupation table (`table1`) and the per-occupation table
# (`table2`), split the overall AM-TM gap into a within-occupation and a
# between-occupation part, and format everything as display strings.
table = pandas.merge(left=table1,right=table2,how='inner',left_on=['annotation','statistics'],right_on=['annotation','statistics'])

# Per occupation:
#   Within  = (N-TM of occupation / N-TM overall) * (AM - TM within occupation)
#   Between = (occupation share in AM - occupation share in TM) * AM mean of occupation
for o in occupations:
  table[(o,'Within')] = (table[(o,'N-TM')]*table[(o,'AM-TM')])/table[('All','N-TM')]
  table[(o,'Between')] = (table[(o,'N-AM')]/table[('All','N-AM')]-table[(o,'N-TM')]/table[('All','N-TM')])*table[(o,'AM')]

# Sum the occupation-level parts, starting from the first occupation so the
# additions run left to right in the order of `occupations`.
within_parts = [table[(o,'Within')] for o in occupations]
between_parts = [table[(o,'Between')] for o in occupations]
table[('All','Within-Check')] = sum(within_parts[1:],within_parts[0])
table[('All','Between')] = sum(between_parts[1:],between_parts[0])

# Truncate (not round) every value to three decimals.
table = numpy.trunc(1000*table)/1000
table[('All','Within')] = table[('All','AM')] - table[('All','TM')] - table['All','Between']
check = table[[('All','Within-Check'),('All','Within')]]
print(check) # because of decimal places; 'Within-Check' is calculated from the occupations, 'Within' is calculated from All to deal with decimal places

mylist = [('All','AM'),('All','TM'),('All','Between'),('All','Within'),('Manager','AM'),('Manager','TM'),('Engineer','AM'),('Engineer','TM'),('Technician','AM'),('Technician','TM'),('Operator','AM'),('Operator','TM')]
# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour of DataFrame.map.
table[mylist] = table[mylist].applymap('{:.3f}'.format)

# Append significance stars to the AM mean: * p<0.1, ** p<0.05, *** p<0.01.
# The three p-value bands are disjoint, so each cell receives at most one suffix.
for i in ['All'] + occupations:
  p = table[(i,'p-val')]
  bands = [
    ('*',(p<0.1)&(p>=0.05)),
    ('**',(p<0.05)&(p>=0.01)),
    ('***',p<0.01),
  ]
  for stars, mask in bands:
    table.loc[mask,(i,'AM')] = table.loc[mask,(i,'AM')]+stars

# Copy so the .loc assignments below write to an independent frame.
table = table[mylist].copy()
# Standard deviations are shown in parentheses.
table.loc[pandas.IndexSlice[:,'SD'],:] = table.loc[pandas.IndexSlice[:,'SD'],:].apply(lambda x: '(' + x + ')')
# Formatted missing values become real NaN again; a negative zero is shown as zero.
table = table.mask(table == '(nan)',numpy.nan)
table = table.mask(table == 'nan',numpy.nan)
table = table.mask(table == '-0.000','0.000')
# Counts are integers: strip only the trailing '.000'. The dot is escaped and the
# pattern anchored so that digits such as the '1000' in '10000.000' are left intact.
for count_row in ['Number of job postings','Number of plants']:
  table.loc[pandas.IndexSlice[count_row,:],:] = table.loc[pandas.IndexSlice[count_row,:],:].replace(r'\.000$','',regex=True)

table

                                           All       
                                  Within-Check Within
annotation             statistics                    
nonroutine analytic    Mean              0.014  0.015
                       SD                  NaN    NaN
nonroutine manual      Mean             -0.005 -0.005
                       SD                  NaN    NaN
routine cognitive      Mean             -0.001 -0.001
                       SD                  NaN    NaN
routine manual         Mean              0.010  0.010
                       SD                  NaN    NaN
reciprocal             Mean              0.002  0.002
                       SD                  NaN    NaN
sequential             Mean             -0.015 -0.016
                       SD                  NaN    NaN
Number of job postings                     NaN    NaN
Number of plants                           NaN    NaN


Unnamed: 0_level_0,Unnamed: 1_level_0,All,All,All,All,Manager,Manager,Engineer,Engineer,Technician,Technician,Operator,Operator
Unnamed: 0_level_1,Unnamed: 1_level_1,AM,TM,Between,Within,AM,TM,AM,TM,AM,TM,AM,TM
annotation,statistics,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
nonroutine analytic,Mean,0.111***,0.081,0.015,0.015,0.074***,0.059,0.145***,0.131,0.065,0.062,0.046***,0.025
nonroutine analytic,SD,(0.102),(0.103),,,(0.086),(0.077),(0.101),(0.120),(0.088),(0.100),(0.075),(0.061)
nonroutine manual,Mean,0.006***,0.011,0.0,-0.005,0.000***,0.003,0.003***,0.006,0.018***,0.033,0.020***,0.039
nonroutine manual,SD,(0.026),(0.041),,,(0.007),(0.021),(0.016),(0.028),(0.047),(0.060),(0.050),(0.075)
routine cognitive,Mean,0.002***,0.003,0.0,-0.001,0.001,0.001,0.002***,0.004,0.004***,0.007,0.003***,0.008
routine cognitive,SD,(0.013),(0.021),,,(0.009),(0.011),(0.011),(0.021),(0.018),(0.030),(0.021),(0.033)
routine manual,Mean,0.052***,0.044,-0.002,0.01,0.052***,0.024,0.036,0.035,0.100***,0.083,0.093***,0.114
routine manual,SD,(0.079),(0.081),,,(0.086),(0.050),(0.053),(0.059),(0.110),(0.102),(0.108),(0.136)
reciprocal,Mean,0.007*,0.006,-0.001,0.002,0.014***,0.010,0.005**,0.004,0.007***,0.002,0.004*,0.003
reciprocal,SD,(0.024),(0.025),,,(0.035),(0.030),(0.019),(0.021),(0.026),(0.015),(0.019),(0.017)


In [477]:
# Write the formatted table to an Excel workbook; the relative path resolves
# against the notebook's current working directory.
table.to_excel('Appendix-Table2-share.xlsx')