In [452]:
import pandas
import numpy
from scipy import stats

## Hybrid sample

In [453]:
# Load the hybrid sample from its tab-separated export.
filepath = 'D:/BG/Data/Processing/4_data_hybrid.txt'
hybrid = pandas.read_csv(filepath, sep='\t')
# One row per job posting, so the row count is the sample size.
print('{} job postings in hybrid sample'.format(len(hybrid)))
hybrid.columns

212821 job postings in hybrid sample


Index(['BGTJobId', 'JobDate', 'CleanTitle', 'CanonTitle', 'SOC', 'SOCName',
       'ONET', 'ONETName', 'Employer', 'Sector', 'SectorName', 'NAICS3',
       'NAICS4', 'NAICS5', 'NAICS6', 'City', 'County', 'State', 'Lat', 'Lon',
       'Edu', 'MaxEdu', 'Degree', 'MaxDegree', 'Exp', 'MaxExp', 'MinSalary',
       'MaxSalary', 'MinHrlySalary', 'MaxHrlySalary', 'PayFrequency',
       'SalaryType', 'JobHours', 'SOC2', 'SOC4', 'SOC5', 'SK',
       'Employer_clean', 'n_terms', 'SK_without', 'TECH', 'occupation',
       'firm1', 'firm2', 'firm', 'plant', 'plant_all', 'plant_5',
       'development', 'materials', 'design', 'inventory', 'tooling',
       'automation', 'production', 'maintenance', 'technical',
       'administrative', 'management', 'finance', 'business', 'data',
       'software', 'office', 'ml', 'cognitive', 'creativity', 'social',
       'character', 'customer', 'writing', 'nonroutine analytic',
       'nonroutine manual', 'routine cognitive', 'routine manual',
       'sequential

In [459]:
# Work on a copy so the loaded `hybrid` frame is left untouched.
mydf = hybrid.copy()

# Number of comma-separated entries in each posting's `SK_without` string.
mydf['Count of Terms'] = mydf['SK_without'].map(lambda terms: len(terms.split(',')))

annotation = ['nonroutine analytic','nonroutine manual','routine cognitive','routine manual','reciprocal','sequential']

# Rescale every annotation column to
#   value * sum(Count of Terms) / (Count of Terms * sum(column)),
# i.e. the posting's per-term score relative to the sample-wide per-term score.
total_terms = mydf['Count of Terms'].sum()
for col in annotation:
  # The column total is taken before the column is overwritten.
  col_total = mydf[col].sum()
  mydf[col] = (mydf[col]*total_terms)/(mydf['Count of Terms']*col_total)

## Statistics for all occupations

In [460]:
# Table 1: compare the rescaled annotation scores between TECH == 'AM' and
# TECH == 'TM' over the whole sample (group means, population SDs, Welch
# p-values, group sizes), then append rows counting distinct postings/plants.
df = mydf[['TECH'] + annotation]

# Welch's t-test (equal_var=False) per annotation; element [1] is the p-value.
t = [
  [a, stats.ttest_ind(df.loc[df.TECH=='AM',a], df.loc[df.TECH=='TM',a], equal_var=False)[1]]
  for a in annotation
]
tdf = pandas.DataFrame(t,columns=['annotation','p-val']).set_index('annotation')

# Group means, transposed so annotations are rows and TECH values are columns.
mymean = df.groupby(by=['TECH']).mean().T
mymean.index.name = 'annotation'
mymean['AM-TM'] = mymean['AM'] - mymean['TM']

# Group sizes (row counts), repeated on every annotation row.
mymean['N-AM'] = len(mydf.loc[mydf.TECH=='AM'])
mymean['N-TM'] = len(mydf.loc[mydf.TECH=='TM'])

# Attach the p-values; both frames are keyed by the 'annotation' index level.
table1 = pandas.merge(left=mymean,right=tdf,how='inner',left_on=['annotation'],right_on=['annotation'])
table1['statistics'] = 'Mean'
table1 = table1.set_index('statistics',append=True)

# Population standard deviations (ddof=0), indexed like the mean rows.
mysd = df.groupby(by=['TECH']).std(ddof=0).T
mysd.index.name = 'annotation'
mysd = mysd.reset_index()
mysd['statistics'] = 'SD'
mysd = mysd.set_index(['annotation','statistics'])
mysd['p-val'] = numpy.nan  # no test statistic on SD rows

# Interleave Mean/SD rows and restore the annotation order of `annotation`.
table1 = pandas.concat([table1,mysd],axis=0).sort_index(kind='merge').reindex(annotation,axis=0,level='annotation')
table1.columns = pandas.MultiIndex.from_product([['All'],table1.columns])

# Distinct job postings per TECH group. The column is selected *before*
# nunique() so distinct counts are not computed for every column of mydf.
postings = mydf.groupby(['TECH'])['BGTJobId'].nunique().to_frame().T
postings = postings.rename_axis([None],axis=1)
postings.columns = pandas.MultiIndex.from_product([['All'],postings.columns])
postings.index = pandas.MultiIndex.from_product([['Number of job postings'],['']])

# Distinct values of `plant` per TECH group, built the same way.
plants = mydf.groupby(['TECH'])['plant'].nunique().to_frame().T
plants = plants.rename_axis([None],axis=1)
plants.columns = pandas.MultiIndex.from_product([['All'],plants.columns])
plants.index = pandas.MultiIndex.from_product([['Number of plants'],['']])

table1 = pandas.concat([table1,postings,plants],axis=0)
table1

Unnamed: 0_level_0,Unnamed: 1_level_0,All,All,All,All,All,All
Unnamed: 0_level_1,Unnamed: 1_level_1,AM,AM-TM,N-AM,N-TM,TM,p-val
annotation,statistics,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
nonroutine analytic,Mean,1.323858,0.355728,4638.0,208183.0,0.96813,9.561266e-83
nonroutine analytic,SD,1.219334,,,,1.225981,
nonroutine manual,Mean,0.780737,-0.610377,4638.0,208183.0,1.391114,1.6127889999999999e-34
nonroutine manual,SD,3.281382,,,,5.059892,
routine cognitive,Mean,0.667039,-0.407509,4638.0,208183.0,1.074547,1.303517e-13
routine cognitive,SD,3.638618,,,,5.703446,
routine manual,Mean,1.283783,0.188751,4638.0,208183.0,1.095033,6.325611e-11
routine manual,SD,1.939213,,,,2.003587,
reciprocal,Mean,1.024896,0.093623,4638.0,208183.0,0.931273,0.07034271
reciprocal,SD,3.481768,,,,3.55967,


## Statistics by occupation

In [461]:
# Table 2: the same AM vs TM comparison as the all-occupations table, but
# computed separately within each occupation group.
occupations = ['Manager','Engineer','Technician','Operator']
df = mydf[['TECH','occupation'] + annotation]

# Welch's t-test (equal_var=False) per annotation and occupation;
# element [1] of the result is the p-value.
t = []
for a in annotation:
  for o in occupations:
    in_occ = df.occupation==o
    am_scores = df.loc[(df.TECH=='AM') & in_occ,a]
    tm_scores = df.loc[(df.TECH=='TM') & in_occ,a]
    t.append([a,o,stats.ttest_ind(am_scores,tm_scores,equal_var=False)[1]])
tdf = pandas.DataFrame(t,columns=['annotation','occupations','p-val']).set_index(['annotation','occupations'])

# Means per (occupation, TECH); unstack + transpose gives rows keyed by
# (annotation, occupation) and one column per TECH value.
mymean = df.groupby(by=['occupation','TECH']).mean().unstack(level=0).T
mymean.index.names = ['annotation','occupations']
mymean['AM-TM'] = mymean['AM'] - mymean['TM']

# Group sizes (row counts) per TECH x occupation, written onto every
# annotation row of that occupation. 'N-AM' is created before 'N-TM'.
for tech in ['AM','TM']:
  for o in occupations:
    n_rows = len(mydf.loc[(mydf.TECH==tech)&(mydf.occupation==o)])
    mymean.loc[pandas.IndexSlice[:,o],'N-'+tech] = n_rows

# Attach the p-values; the inner merge keeps only the listed occupations.
table2 = pandas.merge(left=mymean,right=tdf,how='inner',left_on=['annotation','occupations'],right_on=['annotation','occupations'])
table2['statistics'] = 'Mean'
table2 = table2.set_index('statistics',append=True)

# Population standard deviations (ddof=0), shaped like the mean rows.
mysd = df.groupby(by=['occupation','TECH']).std(ddof=0).unstack(level=0).T
mysd.index.names = ['annotation','occupations']
mysd['statistics'] = 'SD'
mysd = mysd.set_index('statistics',append=True)
mysd['p-val'] = numpy.nan  # no test statistic on SD rows

# Interleave Mean/SD rows, restore the annotation order, then move the
# occupation level into the columns as the outer level, ordered as `occupations`.
table2 = (
  pandas.concat([table2,mysd],axis=0)
  .sort_index(kind='merge')
  .reindex(annotation,axis=0,level='annotation')
  .unstack('occupations')
  .swaplevel(0,1,axis=1)
  .reindex(occupations,axis=1,level=0)
)
table2 = table2.rename_axis([None,None],axis=1)

# Distinct job postings per (occupation, TECH). The column is selected
# *before* nunique() so distinct counts are not computed for every column.
postings = mydf.groupby(['occupation','TECH'])['BGTJobId'].nunique().to_frame().reindex(occupations, level='occupation').T
postings = postings.rename_axis([None,None],axis=1)
postings.index = pandas.MultiIndex.from_product([['Number of job postings'],['']])

# Distinct values of `plant` per (occupation, TECH), built the same way.
plants = mydf.groupby(['occupation','TECH'])['plant'].nunique().to_frame().reindex(occupations, level='occupation').T
plants = plants.rename_axis([None,None],axis=1)
plants.index = pandas.MultiIndex.from_product([['Number of plants'],['']])

table2 = pandas.concat([table2,postings,plants],axis=0).reindex(occupations,axis=1,level=0)
table2

Unnamed: 0_level_0,Unnamed: 1_level_0,Manager,Manager,Manager,Manager,Manager,Manager,Engineer,Engineer,Engineer,Engineer,...,Technician,Technician,Technician,Technician,Operator,Operator,Operator,Operator,Operator,Operator
Unnamed: 0_level_1,Unnamed: 1_level_1,AM,AM-TM,N-AM,N-TM,TM,p-val,AM,AM-TM,N-AM,N-TM,...,N-AM,N-TM,TM,p-val,AM,AM-TM,N-AM,N-TM,TM,p-val
annotation,statistics,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
nonroutine analytic,Mean,0.878229,0.171163,981.0,88819.0,0.707067,2.306313e-07,1.719291,0.164767,2704.0,78324.0,...,457.0,10791.0,0.736076,0.3672906,0.549178,0.250068,496.0,30249.0,0.29911,1.266931e-09
nonroutine analytic,SD,1.024113,,,,0.918907,,1.203884,,,,...,,,1.18903,,0.894457,,,,0.724277,
nonroutine manual,Mean,0.08062,-0.357633,981.0,88819.0,0.438253,9.580863e-32,0.468707,-0.286497,2704.0,78324.0,...,457.0,10791.0,4.052718,1.149797e-09,2.468081,-2.41795,496.0,30249.0,4.886031,1.162507e-16
nonroutine manual,SD,0.885632,,,,2.605293,,2.060871,,,,...,,,7.437478,,6.166896,,,,9.293243,
routine cognitive,Mean,0.332608,-0.062424,981.0,88819.0,0.395032,0.4281779,0.634882,-0.578329,2704.0,78324.0,...,457.0,10791.0,1.913995,0.00174891,1.063817,-1.347457,496.0,30249.0,2.411274,3.924361e-07
routine cognitive,SD,2.444517,,,,3.051619,,3.20893,,,,...,,,8.195306,,5.719672,,,,9.084379,
routine manual,Mean,1.294732,0.691326,981.0,88819.0,0.603406,4.2207550000000005e-23,0.897615,0.033378,2704.0,78324.0,...,457.0,10791.0,2.042592,0.001350557,2.285123,-0.513023,496.0,30249.0,2.798146,2.512526e-05
routine manual,SD,2.127794,,,,1.241125,,1.310093,,,,...,,,2.522406,,2.650121,,,,3.346029,
reciprocal,Mean,2.005284,0.564911,981.0,88819.0,1.440373,0.0004904876,0.728477,0.114376,2704.0,78324.0,...,457.0,10791.0,0.388888,0.0001852958,0.672447,0.221276,496.0,30249.0,0.451171,0.0697977
reciprocal,SD,5.03601,,,,4.32033,,2.723635,,,,...,,,2.218117,,2.691378,,,,2.437617,


## Final table

In [462]:
# Final table: join the all-occupations and by-occupation tables, split the
# overall AM-TM gap into within- and between-occupation parts, then format
# everything as strings for export.
table = pandas.merge(left=table1,right=table2,how='inner',left_on=['annotation','statistics'],right_on=['annotation','statistics'])

# Per occupation:
#   Within  = (N-TM of occupation / N-TM overall) * (AM mean - TM mean)
#   Between = (occupation share of AM rows - occupation share of TM rows) * AM mean
occupation_groups = ['Manager','Engineer','Technician','Operator']
for o in occupation_groups:
  share_am = table[(o,'N-AM')]/table[('All','N-AM')]
  share_tm = table[(o,'N-TM')]/table[('All','N-TM')]
  table[(o,'Within')] = (table[(o,'N-TM')]*table[(o,'AM-TM')])/table[('All','N-TM')]
  table[(o,'Between')] = (share_am-share_tm)*table[(o,'AM')]

table[('All','Within-Check')] = table[('Manager','Within')]+table[('Engineer','Within')]+table[('Technician','Within')]+table[('Operator','Within')]
table[('All','Between')] = table[('Manager','Between')]+table[('Engineer','Between')]+table[('Technician','Between')]+table[('Operator','Between')]

# Truncate (not round) every value to three decimals. This also truncates the
# p-value columns that drive the significance stars below.
table = numpy.trunc(1000*table)/1000

# 'Within' is derived from the truncated overall figures so that the displayed
# AM - TM gap equals Between + Within; 'Within-Check' is the sum of the
# per-occupation terms and may differ in the last digit because of truncation.
table[('All','Within')] = table[('All','AM')] - table[('All','TM')] - table['All','Between']
check = table[[('All','Within-Check'),('All','Within')]]
print(check)

# Columns kept in the exported table, rendered as fixed three-decimal strings.
mylist = [('All','AM'),('All','TM'),('All','Between'),('All','Within'),('Manager','AM'),('Manager','TM'),('Engineer','AM'),('Engineer','TM'),('Technician','AM'),('Technician','TM'),('Operator','AM'),('Operator','TM')]
table[mylist] = table[mylist].applymap('{:.3f}'.format)

# Significance stars on the AM figure: * p<0.1, ** p<0.05, *** p<0.01.
# The three masks are mutually exclusive; NaN p-values match none of them.
for i in ['All','Manager','Engineer','Technician','Operator']:
  pval = table[(i,'p-val')]
  star_rules = [
    ((pval<0.1)&(pval>=0.05),'*'),
    ((pval<0.05)&(pval>=0.01),'**'),
    (pval<0.01,'***'),
  ]
  for mask,stars in star_rules:
    table.loc[mask,(i,'AM')] = table.loc[mask,(i,'AM')]+stars

# Take an explicit copy: the .loc assignments below would otherwise write to a
# slice of the wider frame (SettingWithCopy).
table = table[mylist].copy()

# Standard deviations are shown in parentheses.
table.loc[pandas.IndexSlice[:,'SD'],:] = table.loc[pandas.IndexSlice[:,'SD'],:].apply(lambda x: '(' + x + ')')

# Blank out formatted missing values and normalise negative zero.
table = table.mask(table == '(nan)',numpy.nan)
table = table.mask(table == 'nan',numpy.nan)
table = table.mask(table == '-0.000','0.000')

# Count rows are integers: drop only the trailing '.000'. The dot is escaped
# and the pattern anchored, otherwise '.000' as a regex also matches any
# character followed by '000' (e.g. '1000.000' would become '').
for label in ['Number of job postings','Number of plants']:
  count_rows = pandas.IndexSlice[label,:]
  table.loc[count_rows,:] = table.loc[count_rows,:].replace(r'\.000$','',regex=True)

table

                                           All       
                                  Within-Check Within
annotation             statistics                    
nonroutine analytic    Mean              0.173  0.173
                       SD                  NaN    NaN
nonroutine manual      Mean             -0.702 -0.703
                       SD                  NaN    NaN
routine cognitive      Mean             -0.479 -0.479
                       SD                  NaN    NaN
routine manual         Mean              0.254  0.253
                       SD                  NaN    NaN
reciprocal             Mean              0.350  0.350
                       SD                  NaN    NaN
sequential             Mean             -0.207 -0.208
                       SD                  NaN    NaN
Number of job postings                     NaN    NaN
Number of plants                           NaN    NaN


Unnamed: 0_level_0,Unnamed: 1_level_0,All,All,All,All,Manager,Manager,Engineer,Engineer,Technician,Technician,Operator,Operator
Unnamed: 0_level_1,Unnamed: 1_level_1,AM,TM,Between,Within,AM,TM,AM,TM,AM,TM,AM,TM
annotation,statistics,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
nonroutine analytic,Mean,1.323***,0.968,0.182,0.173,0.878***,0.707,1.719***,1.554,0.781,0.736,0.549***,0.299
nonroutine analytic,SD,(1.219),(1.225),,,(1.024),(0.918),(1.203),(1.427),(1.047),(1.189),(0.894),(0.724)
nonroutine manual,Mean,0.780***,1.391,0.092,-0.703,0.080***,0.438,0.468***,0.755,2.298***,4.052,2.468***,4.886
nonroutine manual,SD,(3.281),(5.059),,,(0.885),(2.605),(2.060),(3.500),(5.845),(7.437),(6.166),(9.293)
routine cognitive,Mean,0.667***,1.074,0.072,-0.479,0.332,0.395,0.634***,1.213,1.144***,1.913,1.063***,2.411
routine cognitive,SD,(3.638),(5.703),,,(2.444),(3.051),(3.208),(5.783),(4.945),(8.195),(5.719),(9.084)
routine manual,Mean,1.283***,1.095,-0.065,0.253,1.294***,0.603,0.897,0.864,2.458***,2.042,2.285***,2.798
routine manual,SD,(1.939),(2.003),,,(2.127),(1.241),(1.310),(1.466),(2.705),(2.522),(2.650),(3.346)
reciprocal,Mean,1.024*,0.931,-0.257,0.35,2.005***,1.440,0.728**,0.614,1.056***,0.388,0.672*,0.451
reciprocal,SD,(3.481),(3.559),,,(5.036),(4.320),(2.723),(3.002),(3.757),(2.218),(2.691),(2.437)


In [463]:
# Write the formatted table to an Excel workbook in the working directory.
table.to_excel(excel_writer='Appendix-Table2-terms.xlsx')