In [3]:
import pandas
import numpy
from scipy import stats
%run dd_wilcoxon.ipynb
from sklearn import preprocessing

## Matched sample

In [4]:
# Load the posting-level data set (tab-separated text file).
filepath = r'D:/BG/Data/Processing/4_data.txt'
data = pandas.read_csv(filepath, sep='\t')
print(data.shape[0],'postings in full sample')

occupations = ['Manager','Engineer','Technician','Operator']

# Step 1 - for every occupation, flag the plants that advertise that occupation
# under BOTH technologies (non-null unique-posting counts in the AM and TM columns).
# The flag is written to `data` as a column named after the occupation
# (1 for flagged plants, NaN otherwise), so `data` gains one column per occupation.
for occ in occupations:
  # Only rows with plant_5 == 0 are used to decide which plants qualify.
  eligible = data[(data.plant_5==0) & (data.occupation==occ)]
  print(eligible.shape)
  # Unique posting ids per plant, one column per TECH value.
  counts = eligible.groupby(['plant','TECH'])['BGTJobId'].nunique().unstack(level=1).reset_index()
  has_both = counts.AM.notna() & counts.TM.notna()
  matched_plants = counts[has_both].plant.tolist()
  data[occ] = numpy.where(data.plant.isin(matched_plants), 1.0, numpy.nan)

# Step 2 - stack, occupation by occupation, the postings of that occupation that
# belong to a flagged plant.
# NOTE(review): the plant_5 == 0 filter is applied when selecting plants above but
# not when collecting rows here - confirm this is intended.
matched = pandas.concat(
  [data[(data[occ]==1) & (data.occupation==occ)] for occ in occupations],
  axis=0,
)
print(matched.shape[0],'postings in matched sample')

3091485 postings in full sample
(89801, 76)
(80653, 77)
(11248, 78)
(31120, 79)
160943 postings in matched sample


## Measures

In [3]:
# Composite measures: each one is the element-wise sum of the listed columns of
# the matched sample (presumably per-posting skill indicators/counts - confirm).
skill_components = {
  'Engineering Skills': ['research','design','materials','development'],
  'Operations Skills': ['tools','inventory','production'],
  'Support Skills': ['business','finance','management','analysis','customer','office','software'],
  'General Skills': ['cognitive','social'],
}

mydf = matched.copy()
for label, components in skill_components.items():
  # Add the columns left to right with `+`, so a missing value in any component
  # makes the composite missing as well.
  total = mydf[components[0]]
  for component in components[1:]:
    total = total + mydf[component]
  mydf[label] = total

# Job complexity is carried over unchanged under a presentation-friendly name.
mydf['Job Complexity'] = mydf['complexity']

# Display order of the measures used by the tables below.
measures = [*skill_components, 'Job Complexity']

## Statistics by occupation
### Dutta and Datta (2016) rank-sum test

In [4]:
# Builds `table2`: for every measure and occupation, the AM and TM means (with the
# average within-plant SD in parentheses on the row below) and the p-value of the
# clustered rank-sum test returned by `dd_wilcoxon` (loaded via %run at the top).

# Posting-level frame restricted to identifiers, technology label and measures.
df = mydf[['plant','occupation','TECH'] + measures].copy()
# Numeric technology indicator handed to the test: AM -> 1, TM -> 0.
# Rows with any other TECH value keep NaN.
df.loc[df.TECH=='AM','TECH01']=1
df.loc[df.TECH=='TM','TECH01']=0


def _format_cells(frame, formatter):
  """Apply `formatter` to every cell of `frame` and return the new frame.

  Column-wise Series.map is used instead of DataFrame.applymap, which is
  deprecated since pandas 2.1; this form works on old and new pandas alike.
  """
  return frame.apply(lambda column: column.map(formatter))


two_decimals = '{:.2f}'.format

# --- Rank-sum tests: one per (measure, occupation), plants acting as clusters.
DD = []
for o in occupations:
  ddf = df[df.occupation==o]
  # Re-code plant identifiers as consecutive integers for the test routine.
  le = preprocessing.LabelEncoder()
  plant = le.fit_transform(ddf.plant)
  for m in measures:
    dd = dd_wilcoxon(plant,ddf[m].values.tolist(),ddf['TECH01'].values.tolist())
    # Element 1 of the result is what gets reported in the 'p-value' column.
    DD.append([m,o,dd[1]])
DD_df = pandas.DataFrame(DD,columns=['measures','occupations','p-value']).set_index(['measures','occupations'])
DD_df = _format_cells(DD_df, two_decimals)

# --- Means: average within plant x occupation x technology first, then across
# plants. The aggregation is restricted to `measures` so the helper TECH01 column
# and the plant identifier are never averaged (a non-numeric plant id would make
# the second-level mean raise on pandas >= 2.0); those rows were discarded later
# anyway, so the resulting table is unchanged.
mymean = df.groupby(by=['plant','occupation','TECH'])[measures].mean().reset_index()
mean_mymean = mymean.groupby(by=['occupation','TECH'])[measures].mean().unstack(level=0).T
mean_mymean.index.names = ['measures','occupations']
mean_mymean = _format_cells(mean_mymean, two_decimals)
table2 = pandas.merge(left=mean_mymean,right=DD_df,how='inner',on=['measures','occupations'])
table2['statistics'] = 'Mean'
table2.set_index('statistics',append=True,inplace=True)

# --- Standard deviations: population SD (ddof=0) within each plant x occupation x
# technology cell, then averaged across plants and shown in parentheses.
mysd = df.groupby(by=['plant','occupation','TECH'])[measures].std(ddof=0).reset_index()
mean_mysd = mysd.groupby(by=['occupation','TECH'])[measures].mean().unstack(level=0).T
mean_mysd.index.names = ['measures','occupations']
mean_mysd = _format_cells(mean_mysd, lambda value: '(' + two_decimals(value) + ')')
mean_mysd['statistics'] = 'SD'
mean_mysd.set_index('statistics',append=True,inplace=True)
# SD rows carry no test result.
mean_mysd['p-value'] = numpy.nan

# --- Assemble: stack Mean and SD rows, order rows by `measures`, then pivot the
# occupations into the outer column level in the order given by `occupations`.
table2 = (
  pandas.concat([table2,mean_mysd],axis=0)
  .sort_index(kind='mergesort')  # 'merge' is not a documented sort kind
  .reindex(measures,axis=0,level='measures')
  .unstack('occupations')
  .swaplevel(0,1,axis=1)
  .reindex(occupations,axis=1,level=0)
  .rename_axis([None,None],axis=1)
)

     clusters  x  groups
0         137  1     1.0
1         114  5     1.0
2         114  5     1.0
3         114  5     1.0
4          46  0     1.0
..        ... ..     ...
951       158  0     1.0
952        13  2     1.0
953        96  0     1.0
954       158  0     1.0
955       158  0     1.0

[956 rows x 3 columns]
     clusters  x  groups
0         137  0     1.0
1         114  0     1.0
2         114  0     1.0
3         114  0     1.0
4          46  0     1.0
..        ... ..     ...
951       158  0     1.0
952        13  1     1.0
953        96  0     1.0
954       158  0     1.0
955       158  0     1.0

[956 rows x 3 columns]
     clusters  x  groups
0         137  2     1.0
1         114  8     1.0
2         114  8     1.0
3         114  8     1.0
4          46  7     1.0
..        ... ..     ...
951       158  3     1.0
952        13  2     1.0
953        96  7     1.0
954       158  4     1.0
955       158  3     1.0

[956 rows x 3 columns]
     clusters  x  groups
0  

Unnamed: 0_level_0,Unnamed: 1_level_0,Managers,Managers,Managers,Engineers,Engineers,Engineers,Technicians,Technicians,Technicians,Operators,Operators,Operators
Unnamed: 0_level_1,Unnamed: 1_level_1,AM,TM,p-value,AM,TM,p-value,AM,TM,p-value,AM,TM,p-value
measures,statistics,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Engineering Skills,Mean,1.16,0.49,0.0,2.73,1.61,0.0,1.35,0.85,0.0,0.82,0.41,0.0
Engineering Skills,SD,(0.47),(0.78),,(0.96),(1.33),,(0.38),(0.95),,(0.22),(0.57),
Operations Skills,Mean,0.42,0.26,0.0,0.61,0.31,0.0,0.97,0.78,0.01,0.89,1.06,0.07
Operations Skills,SD,(0.17),(0.51),,(0.38),(0.51),,(0.39),(0.96),,(0.39),(1.08),
Support Skills,Mean,6.16,6.70,0.0,6.02,6.12,0.37,4.42,4.47,0.16,4.56,4.38,0.0
Support Skills,SD,(1.01),(2.69),,(1.50),(2.80),,(0.75),(2.05),,(0.62),(1.91),
General Skills,Mean,2.72,2.53,0.02,2.56,2.34,0.0,2.21,1.63,0.0,1.91,1.47,0.0
General Skills,SD,(0.61),(1.49),,(0.85),(1.32),,(0.44),(1.16),,(0.44),(1.08),
Job Complexity,Mean,2.70,2.40,0.01,3.29,2.72,0.0,1.86,1.45,0.0,1.51,1.09,0.0
Job Complexity,SD,(0.57),(1.35),,(0.91),(1.40),,(0.43),(1.21),,(0.36),(0.92),


In [31]:
# Appends two count rows to `table2`: the number of distinct job postings and the
# number of distinct plants per occupation x technology in the matched sample.
postings_label = 'Number of job postings'
plants_label = 'Number of plants'

# Count only the columns that are needed instead of running nunique() over every
# column of the sample and discarding all but one.
by_occ_tech = mydf.groupby(['occupation','TECH'])

postings = by_occ_tech[['BGTJobId']].nunique().reindex(occupations, level='occupation').T
# Thousands separators; column-wise Series.map replaces the deprecated applymap.
postings = postings.apply(lambda column: column.map('{:,}'.format))
postings = postings.rename_axis([None,None],axis=1)
postings.index = pandas.MultiIndex.from_product([[postings_label],['']])

plants = by_occ_tech[['plant']].nunique().reindex(occupations, level='occupation').T
plants = plants.rename_axis([None,None],axis=1)
plants.index = pandas.MultiIndex.from_product([[plants_label],['']])

# Make the cell safe to re-run: drop count rows appended by a previous execution
# before concatenating, otherwise every run would add another pair of rows.
already_appended = table2.index.get_level_values(0).isin([postings_label, plants_label])
table2 = pandas.concat([table2.loc[~already_appended],postings,plants],axis=0).reindex(occupations,axis=1,level=0)
# Fix the inner column order within each occupation.
table2 = table2.reindex(['AM','TM','p-value'],axis=1,level=1)
table2

Unnamed: 0_level_0,Unnamed: 1_level_0,Managers,Managers,Managers,Engineers,Engineers,Engineers,Technicians,Technicians,Technicians,Operators,Operators,Operators
Unnamed: 0_level_1,Unnamed: 1_level_1,AM,TM,p-value,AM,TM,p-value,AM,TM,p-value,AM,TM,p-value
measures,statistics,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
Engineering Skills,Mean,1.16,0.49,0.0,2.73,1.61,0.0,1.35,0.85,0.0,0.82,0.41,0.0
Engineering Skills,SD,(0.47),(0.78),,(0.96),(1.33),,(0.38),(0.95),,(0.22),(0.57),
Operations Skills,Mean,0.42,0.26,0.0,0.61,0.31,0.0,0.97,0.78,0.01,0.89,1.06,0.07
Operations Skills,SD,(0.17),(0.51),,(0.38),(0.51),,(0.39),(0.96),,(0.39),(1.08),
Support Skills,Mean,6.16,6.70,0.0,6.02,6.12,0.37,4.42,4.47,0.16,4.56,4.38,0.0
Support Skills,SD,(1.01),(2.69),,(1.50),(2.80),,(0.75),(2.05),,(0.62),(1.91),
General Skills,Mean,2.72,2.53,0.02,2.56,2.34,0.0,2.21,1.63,0.0,1.91,1.47,0.0
General Skills,SD,(0.61),(1.49),,(0.85),(1.32),,(0.44),(1.16),,(0.44),(1.08),
Job Complexity,Mean,2.70,2.40,0.01,3.29,2.72,0.0,1.86,1.45,0.0,1.51,1.09,0.0
Job Complexity,SD,(0.57),(1.35),,(0.91),(1.40),,(0.43),(1.21),,(0.36),(0.92),
