* https://academic.oup.com/qje/article/135/4/1965/5858010?searchresult=1
* Supplementary Data: Data appendix Table 1 in Online Appendix
* Data Availability: DN_STEMM_QJE_May2020_Import_Clean.do

# 4-Annotation

In [844]:
import pandas
import numpy
import openpyxl

## Manufacturing sample

In [845]:
# Read the tab-separated skill file and order the postings by JobDate, then
# BGTJobId (both ascending).
data = pandas.read_csv('D:/BG/Data/Downloading/SKILL.txt', sep='\t').sort_values(
    by=['JobDate', 'BGTJobId'], ascending=[True, True]
)

# Report the sample size and how many postings have / lack an Employer value.
n_postings = data.shape[0]
n_missing_employer = data.Employer.isna().sum()
print(f'{n_postings} initial postings')
print(f'{n_missing_employer} postings with missing Employer')
print(f'{n_postings - n_missing_employer} postings with nonmissing Employer')

# Column types and a first look at the comma-separated skill strings.
print(data.dtypes)
print(data.SK[0:5])

4416837 initial postings
1131600 postings with missing Employer
3285237 postings with nonmissing Employer
BGTJobId           int64
JobDate           object
CleanTitle        object
CanonTitle        object
SOC                int64
SOCName           object
ONET              object
ONETName          object
Employer          object
Sector            object
SectorName        object
NAICS3           float64
NAICS4           float64
NAICS5           float64
NAICS6           float64
City              object
County            object
State             object
Lat              float64
Lon              float64
Edu              float64
MaxEdu           float64
Degree            object
MaxDegree         object
Exp              float64
MaxExp           float64
MinSalary        float64
MaxSalary        float64
MinHrlySalary    float64
MaxHrlySalary    float64
PayFrequency      object
SalaryType        object
JobHours          object
SOC2               int64
SOC4               int64
SOC5               

## Skills to annotate: 12,014 unique skills in manufacturing sample

In [846]:
# Turn each posting's comma-separated SK string into a list of terms, after
# rewriting two skill labels to the spelling used for annotation.
BoW = [
    job.replace('colleterallized loan obligation (clo)', 'collateralized loan obligations (clos)')
       .replace('computer aided manufacturing (cam)', 'computer-aided manufacturing (cam)')
       .split(',')
    for job in data['SK']
]
print(len(BoW),'postings in manufacturing sample')
print(BoW[0:5])

# Flatten to a single list of terms and drop the 'na' placeholder.
BoW = [term for job in BoW for term in job if term != 'na']
print(len(BoW),'terms in manufacturing sample after removing na')

# The distinct terms are the skills that need an annotation.
unique_terms = list(set(BoW))
print(len(unique_terms),'unique terms in manufacturing sample after removing na, terms to annotate')
skills = pandas.DataFrame(unique_terms, columns=['Skill_US'])

4416837 postings in manufacturing sample
[['planning', 'purchasing'], ['positive disposition', 'repair', 'communication skills', 'verbal / oral communication', 'welding', 'system maintenance'], ['product sales', 'microsoft excel', 'proposal writing', 'scheduling', 'staff management', 'planning', 'technical training', 'customer contact', 'customer service', 'project planning and development skills', 'microsoft project', 'supervisory skills'], ['clinical development', 'performance management', 'project management', 'research', 'pharmacology', 'creativity', 'budgeting', 'mentoring', 'writing', 'due diligence', 'problem solving', 'teamwork / collaboration', 'building effective relationships', 'clinical trials', 'drug development'], ['ansys', 'nastran', 'mechanical engineering', 'nastran / patran', 'system design', 'finite element method / analysis software', 'communication skills', 'ls-dyna', 'finite element analysis', 'cad / cam (computer-aided design / manufacturing)', 'troubleshooting',

## Manufacturing_Skills_US.xlsx contains the 12,014 skills to annotate with BG families and clusters

In [847]:
# Load the skill taxonomy, lower-case the three text columns used for matching
# and strip commas from the skill label, then keep exactly the skills observed
# in the sample (right join on Skill_US).
bg_raw = pandas.read_csv('All_Skills_US.csv', sep=',')
BG = bg_raw.assign(
    Skill_US=bg_raw['Skill_US'].str.lower().str.replace(r',', '', regex=True),
    SkillClusterFamily=bg_raw['SkillClusterFamily'].str.lower(),
    SkillCluster=bg_raw['SkillCluster'].str.lower(),
).merge(skills, on='Skill_US', how='right')

print(len(BG),'keywords from manufacturing sample to annotate')
print(BG.columns)

# Export the list of skills (with whatever taxonomy fields matched) to Excel.
BG.to_excel('Manufacturing_Skills_US.xlsx')

12014 keywords from manufacturing sample to annotate
Index(['SkillId', 'Skill_US', 'IsSpecialized', 'IsSoftware', 'IsBaseline',
       'IsLanguage', 'IsActive', 'SkillCluster', 'SkillClusterFamily'],
      dtype='object')


## BG-Deming-Noray (2020)
* https://scholar.harvard.edu/files/ddeming/files/dn_stem_may_2020_appendix.pdf

In [848]:
def _set_flag(frame, column, mask):
    """Create (or reset) the 0/1 indicator `column` on `frame`.

    The column is first set to 0 for every row and then to 1 on the rows
    where the boolean `mask` is True.
    """
    frame[column] = 0
    frame.loc[mask, column] = 1


# Work on a copy so the merged taxonomy frame BG is left untouched.
mydf = BG.copy()

##### From BG
# Categories defined from the SkillCluster / SkillClusterFamily labels
# (lower-cased when BG was built), plus a keyword match on the skill label for 'design'.
_set_flag(mydf, 'development', mydf['SkillCluster'].isin(['biopharmaceutical manufacturing','concept development','drug development','engineering management','engineering practices','engineering software','mathematics','mathematical modeling','mathematical software','product development','research methodology','simulation','simulation software']))
_set_flag(mydf, 'design', (mydf['Skill_US'].str.contains('design')==True)|(mydf['SkillClusterFamily'].isin(['design']))|(mydf['SkillCluster'].isin(['analog design','drafting and engineering design'])))
# NOTE(review): 'metal industy industry knowledge' looks misspelled; it is kept
# as-is because it may mirror the cluster label in All_Skills_US.csv -- confirm before changing.
_set_flag(mydf, 'materials', mydf['SkillCluster'].isin(['aluminum industry knowledge','bioinformatics','biology','cellular biology','chemical analysis','chemical industry knowledge','chemistry','materials process','materials science','metal industy industry knowledge','molecular biology','physics','plastics material industry knowledge','process engineering']))
_set_flag(mydf, 'inventory', (mydf['SkillClusterFamily'].isin(['supply chain and logistics']))|(mydf['SkillCluster'].isin(['supply chain and logistics industry knowledge'])))
_set_flag(mydf, 'tooling', (mydf['SkillCluster'].isin(['hand tools','machine tools','machinery','micro manufacturing','power tools','carpentry'])))
_set_flag(mydf, 'automation', (mydf['SkillCluster'].isin(['automation engineering','robotics','computer-aided manufacturing'])))
_set_flag(mydf, 'production', mydf['SkillClusterFamily'].isin(['manufacturing and production']))
_set_flag(mydf, 'maintenance', (mydf['SkillClusterFamily'].isin(['maintenance, repair, and installation'])))

# From Deming and Noray (2020)
# Categories defined by keyword (regex alternation) or exact-label matches on Skill_US.
# 'online research' is excluded from cognitive; it is assigned to 'office' below.
_set_flag(mydf, 'cognitive', (mydf['Skill_US'].str.contains('analytical|cognitive|critical thinking|math|problem solving|research|statistics')==True)&(mydf['Skill_US'].str.contains('online research')==False))
_set_flag(mydf, 'creativity', mydf['Skill_US'].str.contains('creativ'))
_set_flag(mydf, 'social', mydf['Skill_US'].str.contains('collaboration|communication|negotiation|presentation|social|teamwork'))
_set_flag(mydf, 'character', mydf['Skill_US'].str.contains('detail-oriented|energetic|goal setting|initiative|meeting deadlines|multi-tasking|organizational skills|planning|positive disposition|prioritizing tasks|self-motivation|self-starter|time management|well organized'))
# Fix: the keyword was spelled 'guest sevices', which can never match "guest services".
# Sales skills that are analytic/back-office rather than customer-facing are excluded.
_set_flag(mydf, 'customer', (mydf['Skill_US'].str.contains('client|cold calling|customer|guest services|patient|sales')==True)&(mydf['Skill_US'].str.contains('sales analysis|sales forecasting|sales order processing|sales planning|sales reporting|sales tax returns|salesforce|salesforce crm')==False))
_set_flag(mydf, 'writing', mydf['Skill_US'].str.contains('editing|preparing proposals|preparing reports|writing'))
_set_flag(mydf, 'administrative', mydf['Skill_US'].str.contains('scheduling|administrative support|general office duties|office management|typing|telephone skills|appointment setting|record keeping|administrative functions|general office duties|secretarial skills|travel arrangements|front office|expense reports|copying|clerical duties|word processing|mailing'))
_set_flag(mydf, 'management', mydf['Skill_US'].str.contains('conflict management|human resource management|leadership|mentoring|people management|performance management|personnel management|staff development|staff management|staff supervision|supervisory'))
_set_flag(mydf, 'finance', mydf['Skill_US'].str.contains('accounting|accounts payable|accounts receivable|balance sheet|billing design|bookkeeping|budgeting|cost|financ|general ledger|payroll processing|revenue projections'))
_set_flag(mydf, 'business', mydf['Skill_US'].str.contains('business administration|business development|business intelligence|business management|business operations|business planning|business process|business solutions|business strategy|business systems|key performance indicators|lean six sigma|process design|six sigma|systems administration|systems architecture|systems integration|systems development'))
_set_flag(mydf, 'data', mydf['Skill_US'].str.contains('data collection|data management|data structures|data integration|big data|relational databases|data warehousing|database management|database administration|database design|data structures|data transformation|data architecture|relational database management system|data migration|database software|data acquisition|database schemas|data mapping|master data management|data warehouse processing|data conversion|data integrity|database architecture|database tuning|enterprise data management|relational database design|data warehouse development|data archiving|mysql|nosql|data entry'))
# Software is taken from the taxonomy's IsSoftware indicator rather than from keywords.
_set_flag(mydf, 'software', (mydf['IsSoftware']==1))
# 'office' and 'ml' use exact label matches (isin), not substring matches.
_set_flag(mydf, 'office', mydf['Skill_US'].isin(['basic internet skills','computer literacy','computer hardware/software knowledge','computer networking','microsoft access','microsoft excel','microsoft powerpoint','microsoft office','microsoft outlook','microsoft windows','microsoft word','online research']))
_set_flag(mydf, 'ml', mydf['Skill_US'].isin(['python','r','splunk','apache hadoop','sqoop','apache hive','mapreduce','tensorflow','scikit-learn','mahout','keras','opencv','xgboost','libsvm','word2vec','artificial intelligence','machine learning','robotics','decision trees','support vector machines (svm)','bayesian networks','clustering','cluster analysis','neural networks','convolutional neural networks (cnn)','recurrent neural network (rnn)','human machine interface (hmi)','human machine interface (hmi) control systems','supervised learning (machine learning)','machine-to-machine (m2m) communications','machine code','machine vision','computer vision','machine translation (mt)','torch (machine learning)','deep learning','unsupervised learning','caffe deep learning framework','boosting (machine learning)','semi-supervised learning','chef infrastructure automation','automation tools','automated testing','automation systems','office automation','automation consulting','sales automation software','automation test environment','marketing automation','laboratory automation','automation techniques','automated underwriting system','gradient boosting','random forest','natural language processing','natural language toolkit (nltk)','speech recognition','pattern recognition','kernel methods','image recognition','object recognition','image processing','machine translation','text mining','recommender systems','latent semantic analysis','sentiment analysis / opinion mining','virtual agents','chatbot','ai chatbot']))
_set_flag(mydf, 'technical', mydf['Skill_US'].str.contains('computer installation and setup|computer software industry knowledge|computer industry knowledge|computer repair|computer maintenance|computer troubleshooting|computer sales|computer problem resolution|help desk support|technical support|information technology industry knowledge|telecommunications|information systems|web development|mainframe|transmission control protocol / internet protocol|web site design|wide area network|network administration|web application development|hardware and software installation|technical training|hardware and software configuration|routers|network hardware/software maintenance|software installation|system/network configuration'))

## Exclusivity for BG-Deming-Noray

In [849]:
# Make the BG / Deming-Noray indicators mutually exclusive.
# Priority follows the order of mylist: whenever a skill is flagged in two
# categories, the flag of the category listed EARLIER is cleared, so the
# category listed last among a skill's flags is the one that survives.
mylist = ['administrative','management','finance','business','data','software','office','ml','technical','inventory','tooling','production','automation','maintenance','development','materials','design','character','customer','writing','social','creativity','cognitive']

# Every (earlier, later) pair of categories, in list order.
mypairs = [(earlier, later) for pos, earlier in enumerate(mylist) for later in mylist[pos+1:]]
for i,j in mypairs:
    mydf.loc[(mydf[i]==1)&(mydf[j]==1),i] = 0

# Number of categories still flagged per skill.
mydf['sum_rows_BGDN'] = mydf[mylist].sum(axis=1)

# Share of skills that received a category; the denominator is the number of
# skills in mydf rather than a hard-coded count.
print(100*mydf[['sum_rows_BGDN']].sum(axis=0)/len(mydf), 'annotated percentage using exclusive BG and Deming and Noray')
mydf.to_excel('Manufacturing_Skills_US_annotation.xlsx')

sum_rows_BGDN    31.188613
dtype: float64 annotated percentage using exclusive BG and Deming and Noray


## Home-made Atalay et al. (2020)
* https://www.aeaweb.org/articles?id=10.1257/app.20190070

In [850]:
# Keyword-based task indicators: each column is 1 when the skill label matches
# the regex alternation below and 0 otherwise.
skill_label = mydf['Skill_US']

# Nonroutine analytic.
mydf['nra'] = skill_label.str.contains('abstract|advanced|analy|complex|design|evaluat|flexib|interpret|research|sketch|synthes').astype('int64')

# Reciprocal; labels containing 'steam' are excluded.
mydf['reciprocal'] = (
    (skill_label.str.contains('agree|advis|coaching|consult|engagement|feedback|mentor|social|trust'))
    & (skill_label.str.contains('steam')==False)
).astype('int64')

# Sequential.
# Alternative pattern left by the author (not used): direct|leader|manag|monitor|planning|procedure|standard
mydf['sequential'] = skill_label.str.contains('assistance|authority|command|direct|key performance indicators|leader|monitor|organiz|performance metrics|planning|procedure|standard|supervis|support|surveillance').astype('int64')

# Nonroutine manual.
mydf['nrm'] = skill_label.str.contains('repair').astype('int64')

# Routine cognitive.
mydf['rc'] = skill_label.str.contains('bookkeep|calcul|compar|compil|copy|correct|measur').astype('int64')

# Routine manual.
mydf['rm'] = skill_label.str.contains('assembl|control|equip|feed|install|maintain|operat|tool').astype('int64')

## Exclusivity for home-made Atalay et al. (2020)

In [851]:
# Make the Atalay-style task indicators mutually exclusive.
# Priority follows the order of mylist: whenever a skill is flagged in two
# categories, the flag of the category listed EARLIER is cleared, so the
# category listed last among a skill's flags is the one that survives.
mylist = ['rm','rc','nrm','sequential','reciprocal','nra']

# Every (earlier, later) pair of categories, in list order.
mypairs = [(earlier, later) for pos, earlier in enumerate(mylist) for later in mylist[pos+1:]]
print(mypairs)
for i,j in mypairs:
    mydf.loc[(mydf[i]==1)&(mydf[j]==1),i] = 0

# Number of categories still flagged per skill.
mydf['sum_rows_A'] = mydf[mylist].sum(axis=1)

# Share of skills that received a category; the denominator is the number of
# skills in mydf rather than a hard-coded count.
print(100*mydf[['sum_rows_A']].sum(axis=0)/len(mydf), 'annotated percentage using exclusive Atalay')
mydf.to_excel('Manufacturing_Skills_US_annotation.xlsx')

[('rm', 'rc'), ('rm', 'nrm'), ('rm', 'sequential'), ('rm', 'reciprocal'), ('rm', 'nra'), ('rc', 'nrm'), ('rc', 'sequential'), ('rc', 'reciprocal'), ('rc', 'nra'), ('nrm', 'sequential'), ('nrm', 'reciprocal'), ('nrm', 'nra'), ('sequential', 'reciprocal'), ('sequential', 'nra'), ('reciprocal', 'nra')]
sum_rows_A    14.95755
dtype: float64 annotated percentage using exclusive Atalay


## Dictionary

In [852]:
def _skills_flagged(flag):
    """Return the Skill_US labels of the rows of mydf whose `flag` column equals 1."""
    return mydf.loc[mydf[flag] == 1, 'Skill_US'].tolist()


# One list of skill labels per annotation category.
development = _skills_flagged('development')
design = _skills_flagged('design')
materials = _skills_flagged('materials')
inventory = _skills_flagged('inventory')
tooling = _skills_flagged('tooling')
automation = _skills_flagged('automation')
production = _skills_flagged('production')
maintenance = _skills_flagged('maintenance')
technical = _skills_flagged('technical')
customer = _skills_flagged('customer')
administrative = _skills_flagged('administrative')
management = _skills_flagged('management')
finance = _skills_flagged('finance')
business = _skills_flagged('business')
# NOTE: this rebinds `data` to a list of skill labels; any DataFrame previously
# held under that name is no longer reachable through it.
data = _skills_flagged('data')
software = _skills_flagged('software')
office = _skills_flagged('office')
ml = _skills_flagged('ml')
social = _skills_flagged('social')
cognitive = _skills_flagged('cognitive')
character = _skills_flagged('character')
creativity = _skills_flagged('creativity')
writing = _skills_flagged('writing')
nra = _skills_flagged('nra')
nrm = _skills_flagged('nrm')
rc = _skills_flagged('rc')
rm = _skills_flagged('rm')
sequential = _skills_flagged('sequential')
reciprocal = _skills_flagged('reciprocal')

# Category name -> list of skill labels. The key order fixes the row order of
# the exported table.
mydict = {
    'development': development,
    'materials': materials,
    'design': design,
    'inventory': inventory,
    'tooling': tooling,
    'automation': automation,
    'production': production,
    'maintenance': maintenance,
    'technical': technical,
    'administrative': administrative,
    'management': management,
    'finance': finance,
    'business': business,
    'data': data,
    'software': software,
    'office': office,
    'ml': ml,
    'cognitive': cognitive,
    'creativity': creativity,
    'social': social,
    'character': character,
    'customer': customer,
    'writing': writing,
    'nonroutine analytic': nra,
    'nonroutine manual': nrm,
    'routine cognitive': rc,
    'routine manual': rm,
    'sequential': sequential,
    'reciprocal': reciprocal,
}

# One row per category, one column per position in its skill list.
df = pandas.DataFrame.from_dict(mydict, orient='index').rename_axis('Annotation')
df.to_csv('Annotation.csv')
df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1420,1421,1422,1423,1424,1425,1426,1427,1428,1429
Annotation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
development,geometry,new drug application (nda) review,engineering management,engineering analysis,drug development,biologics,labview,differential equations,wonderware,maple,...,,,,,,,,,,
materials,cheminformatics,plant biology,mammalian cell culture,biomechanics,immunohistochemistry,cancer biology,chemical reactions,particle sizing,metals industry knowledge,parasitology,...,,,,,,,,,,
design,design change packages,role playing games(rpg),landfill design,nuclear core design,adobe photoshop,digital design,ptc creo,3d modeling / design,electronic case report form (ecrf) design,color theory,...,,,,,,,,,,
inventory,lead time reduction,materials routing,supply agreements,logistics analysis,item location,supplier development,e-procurement,metrics definition,freight flow,shipping methods,...,,,,,,,,,,
tooling,carpenter's levels,trim carpentry,form building,metal framing,commercial carpentry,woodworking machines,window installation,wood framing,carpentry,crown molding,...,,,,,,,,,,
automation,wonderware intouch,embedded software,robot framework,machine vision,computer-aided manufacturing (cam),robotic systems,electromechanical systems,rsview,simultaneous localization and mapping (slam),robot programming,...,,,,,,,,,,
production,machinery lubrication,production schedule development,lathes,materials processing,pipefitting,machine operation,press brakes,industrial machinery maintenance,manufacturing execution system,sheet metal installation,...,,,,,,,,,,
maintenance,wire cutters,appliance repair,checking batteries,brake lathes,cabling,pressure testing,fleet preventive maintenance,ventilation systems,commercial plumbing,hvac engineering,...,,,,,,,,,,
technical,help desk support,mainframe,wide area network (wan),mainframe testing,computer software industry knowledge,remote technical support,geospatial information systems,information systems,telephone technical support,hardware and software installation,...,,,,,,,,,,
administrative,general office duties,appointment setting,darwin information typing architecture (dita),invoice record keeping,mailing industry knowledge,scheduling,travel arrangements,audit scheduling,office management,ribotyping,...,,,,,,,,,,


## Hybrid sample

In [853]:
# Read the processed postings and keep the rows whose plant_5 indicator is 0
# (the "hybrid" sample).
filepath = 'D:/BG/Data/Processing/3_data.txt'
data = pandas.read_csv(filepath, sep='\t')
print(f'{len(data)} job postings in full sample')

hybrid = data.loc[data['plant_5'] == 0]
print(f'{len(hybrid)} job postings in hybrid sample')

3091473 job postings in full sample
212821 job postings in hybrid sample


## Hybrid sample and string of terms for each job posting

In [854]:
# Build one [BGTJobId, list-of-terms] entry per hybrid posting, splitting the
# comma-separated SK_without string into its terms.
mydata = (
    hybrid
    .assign(SK_without1=hybrid['SK_without'].str.split(','))
    .loc[:, ['BGTJobId', 'SK_without1']]
    .values
    .tolist()
)
print(mydata[0:5])

[[351089402, ['ansys', 'nastran', 'mechanical engineering', 'nastran / patran', 'system design', 'finite element method / analysis software', 'communication skills', 'ls-dyna', 'finite element analysis', 'cad / cam (computer-aided design / manufacturing)', 'troubleshooting', 'unigraphics']], [351095233, ['finite element analysis', 'system design', 'budgeting', 'ansys', 'planning', 'mentoring', 'product development']], [351100169, ['engineering technical support', 'physical abilities', 'english', 'personal protective equipment (ppe)', 'engineering drawings', 'machine tools', 'international traffic in arms regulations (itar)', 'planning', 'technical support', 'repair', 'grinders']], [351102479, ['engineering documentation', 'planning', 'component design', 'financial analysis', 'troubleshooting technical issues', 'process design', 'problem solving', 'leadership', 'analytical skills', 'level design', 'multi-tasking', 'technical writing / editing', 'budgeting', 'computer literacy', 'schedul

## COUNT

### Equal keyword

In [855]:
# Count, for every posting, how many of its terms belong to each annotation
# category (exact string equality between posting term and dictionary term).
#
# Each category's term list is indexed once into a {term: multiplicity} table,
# so a posting term costs one lookup per category instead of a scan of the
# whole category list. Multiplicities keep the result identical to a pairwise
# comparison even if a term were listed more than once in a category.
category_lookup = {}
for category, terms in mydict.items():
    multiplicity = {}
    for term in terms:
        multiplicity[term] = multiplicity.get(term, 0) + 1
    category_lookup[category] = multiplicity

count = []
for job_id, job_terms in mydata:
    # Row layout: posting id followed by one count per category, in mydict order.
    row = [job_id]
    for category in mydict:
        multiplicity = category_lookup[category]
        row.append(sum(multiplicity.get(term, 0) for term in job_terms))
    count.append(row)

df = pandas.DataFrame(count,columns=['BGTJobId']+list(mydict.keys()))
print(df)

           BGTJobId  development  materials  design  inventory  tooling  \
0         351089402            2          0       3          0        0   
1         351095233            2          0       1          0        0   
2         351100169            1          0       1          0        0   
3         351102479            1          0       3          0        0   
4         351110441            0          0       0          0        0   
...             ...          ...        ...     ...        ...      ...   
212816  38689223701            1          0       0          2        0   
212817  38689229220            1          2       1          0        0   
212818  38689235695            1          2       2          0        0   
212819  38689246738            1          2       0          2        0   
212820  38689265196            5          2       8          1        0   

        automation  production  maintenance  technical  ...  social  \
0                0          

## Merging data for hybrid sample

In [856]:
# Display the column labels of the hybrid sample.
hybrid.columns

Index(['BGTJobId', 'JobDate', 'CleanTitle', 'CanonTitle', 'SOC', 'SOCName',
       'ONET', 'ONETName', 'Employer', 'Sector', 'SectorName', 'NAICS3',
       'NAICS4', 'NAICS5', 'NAICS6', 'City', 'County', 'State', 'Lat', 'Lon',
       'Edu', 'MaxEdu', 'Degree', 'MaxDegree', 'Exp', 'MaxExp', 'MinSalary',
       'MaxSalary', 'MinHrlySalary', 'MaxHrlySalary', 'PayFrequency',
       'SalaryType', 'JobHours', 'SOC2', 'SOC4', 'SOC5', 'SK',
       'Employer_clean', 'n_terms', 'SK_without', 'TECH', 'occupation',
       'firm1', 'firm2', 'firm', 'plant', 'plant_all', 'plant_5'],
      dtype='object')

In [857]:
# Attach the per-category term counts to the hybrid postings; the inner join on
# BGTJobId keeps only postings present in both frames.
dfhybrid = hybrid.merge(df, how='inner', on='BGTJobId')
print(dfhybrid.columns)

Index(['BGTJobId', 'JobDate', 'CleanTitle', 'CanonTitle', 'SOC', 'SOCName',
       'ONET', 'ONETName', 'Employer', 'Sector', 'SectorName', 'NAICS3',
       'NAICS4', 'NAICS5', 'NAICS6', 'City', 'County', 'State', 'Lat', 'Lon',
       'Edu', 'MaxEdu', 'Degree', 'MaxDegree', 'Exp', 'MaxExp', 'MinSalary',
       'MaxSalary', 'MinHrlySalary', 'MaxHrlySalary', 'PayFrequency',
       'SalaryType', 'JobHours', 'SOC2', 'SOC4', 'SOC5', 'SK',
       'Employer_clean', 'n_terms', 'SK_without', 'TECH', 'occupation',
       'firm1', 'firm2', 'firm', 'plant', 'plant_all', 'plant_5',
       'development', 'materials', 'design', 'inventory', 'tooling',
       'automation', 'production', 'maintenance', 'technical',
       'administrative', 'management', 'finance', 'business', 'data',
       'software', 'office', 'ml', 'cognitive', 'creativity', 'social',
       'character', 'customer', 'writing', 'nonroutine analytic',
       'nonroutine manual', 'routine cognitive', 'routine manual',
       'sequential

## Saving data

In [858]:
# Write the annotated hybrid sample as a tab-separated file with a header row
# and without the DataFrame index.
filepath = 'D:/BG/Data/Processing/4_data_hybrid.txt'
dfhybrid.to_csv(path_or_buf=filepath, sep='\t', header=True, index=False)