# Overlaps

In [38]:
import pandas
import statsmodels.formula.api as smf

## Keywords

In [39]:
# Load the two tab-separated lookup tables; only the first `nrows` data rows of each are read.
filepath = r'D:/BG/Paper_1/Count/BGT.txt'
df1 = pandas.read_csv(filepath, sep='\t', nrows=17422)
filepath = r'D:/BG/Paper_1/Count/DNATBU.txt'
df2 = pandas.read_csv(filepath, sep='\t', nrows=166)

# Every selection below is restricted to rows with IsSoftware == 0.
not_software = df1['IsSoftware'] == 0
skill_names = df1['Skill_US']

# "High level" phrases: two skill clusters plus one whole cluster family.
# The three lists are concatenated as-is, so a phrase selected by more than
# one condition appears more than once in `adv`.
adv_m = skill_names[not_software & (df1['SkillCluster'] == 'materials science')].tolist()
adv_p = skill_names[not_software & (df1['SkillCluster'] == 'product development')].tolist()
adv_s = skill_names[not_software & (df1['SkillClusterFamily'] == 'science and research')].tolist()
adv = adv_m + adv_p + adv_s
print(len(adv),'phrases in high level')

# "Design" phrases: the same selection repeated for each design-related cluster,
# concatenated in the order the clusters are listed.
design_clusters = (
    'analog design',
    'creative design',
    'digital design',
    'drafting and engineering design',
    'graphic and visual design',
    'graphic and visual design software',
    'industrial design',
    'manufacturing design',
)
design = [
    skill
    for cluster in design_clusters
    for skill in skill_names[not_software & (df1['SkillCluster'] == cluster)].tolist()
]
print(len(design),'phrases in design')

217 phrases in high level
71 phrases in design


In [40]:
# Keyword stems listed under the 'complexity' BU in the keyword table.
complexity = df2[df2['BU'] == 'complexity']['KEYWORDS'].tolist()
print(complexity)

# Drop every phrase that contains any complexity stem as a substring.
adv = [skill for skill in adv if not any(fragment in skill for fragment in complexity)]
print(len(adv),'phrases in high level after removing complexity overlap')

design = [skill for skill in design if not any(fragment in skill for fragment in complexity)]
print(len(design),'phrases in design after removing complexity overlap')

['advanced', 'analy', 'change', 'creativ', 'design', 'develop', 'devising', 'evaluate', 'experiment', 'improve', 'initiative', 'interpret', 'learn', 'model', 'multi-tasking', 'plan', 'project', 'research', 'simulat', 'sketch']
178 phrases in high level after removing complexity overlap
31 phrases in design after removing complexity overlap


## Full sample

In [41]:
# Read the full tab-separated sample (read_table uses '\t' as its default separator)
# and report its (rows, columns) shape.
filepath = r'D:/BG/Data/Processing/4_data.txt'
data = pandas.read_table(filepath)
print(data.shape)

(3091485, 100)


## Hybrid sample

In [42]:
# Hybrid sample: keep only the rows where plant_5 == 0.
# The explicit .copy() detaches the subset from the full frame it was filtered
# from, so columns added to `data` afterwards are plain assignments and do not
# raise pandas' SettingWithCopyWarning.
data = data[data.plant_5==0].copy()
print(data.shape)

(212822, 100)


## Counting new High level and Design

In [43]:
def count_keyword_matches(jobs, keywords, column):
    """Count, per job, how many of its skills exactly equal one of `keywords`.

    Parameters
    ----------
    jobs : list of [job_id, skills]
        `skills` is the list of skill strings for that job. Anything that is
        not a list (e.g. NaN from a missing skill string) is treated as having
        no skills and gets a count of 0.
    keywords : list of str
        Phrases to look for. A phrase that appears k times in this list adds k
        for every matching skill, which is the same total as summing
        `skills.count(keyword)` over every entry of the list.
    column : str
        Name of the count column in the result.

    Returns
    -------
    pandas.DataFrame
        One row per job, indexed by 'BGTJobId', with the single column `column`.
    """
    # Lookup table phrase -> number of times it is listed, so each skill costs
    # one dictionary access instead of a scan over every keyword.
    weights = {}
    for keyword in keywords:
        weights[keyword] = weights.get(keyword, 0) + 1

    records = []
    for job_id, skills in jobs:
        if not isinstance(skills, list):
            skills = []
        records.append([job_id, sum(weights.get(skill, 0) for skill in skills)])

    counts = pandas.DataFrame(records, columns=['BGTJobId', column])
    return counts.set_index('BGTJobId')


data['SK_0'] = data.SK_without.str.split(',')
jobslist = data[['BGTJobId','SK_0']].values.tolist() # [[1,['critical thinking','physical abilities']],[2,['market research','analytical skills']]
# Important: matching is on whole phrases - a skill counts only when the entire
# comma-separated item equals the keyword (no substring matching, no trimming).

df_adv = count_keyword_matches(jobslist, adv, 'adv0')
print(len(df_adv))
print(df_adv)

df_design = count_keyword_matches(jobslist, design, 'design0')
print(len(df_design))
print(df_design)

# Drop counts left over from a previous run of this cell; otherwise the merges
# below would produce adv0_x/adv0_y (and design0_x/design0_y) instead of
# replacing the columns.
data = data.drop(columns=['adv0', 'design0'], errors='ignore')
# NOTE(review): these inner merges assume BGTJobId is unique in `data`; a repeated
# id would multiply rows - confirm against the row count printed above.
data = pandas.merge(left=data,right=df_adv,how='inner',on='BGTJobId')
data = pandas.merge(left=data,right=df_design,how='inner',on='BGTJobId')

212822
             adv0
BGTJobId         
351089402       0
351095233       0
351100169       0
351102479       0
351110441       0
...           ...
38689223701     0
38689229220     2
38689235695     2
38689246738     3
38689265196     2

[212822 rows x 1 columns]
212822
             design0
BGTJobId            
351089402          0
351095233          0
351100169          1
351102479          0
351110441          0
...              ...
38689223701        0
38689229220        0
38689235695        0
38689246738        0
38689265196        0

[212822 rows x 1 columns]


## Complexity as a function of activities (hybrid OLS)

In [44]:
# OLS of complexity on the activity counts, with standard errors clustered by `plant`.
activity_formula = "complexity ~ basic+tech_1+tech_2+adv0+design0+software+busfin+cust_1+cust_2+data_1+data_2+management+ml+office"
activity_model = smf.ols(activity_formula, data=data)
r = activity_model.fit(
    cov_type='cluster',
    cov_kwds={'groups': data['plant']},
)
print(r.summary())

                            OLS Regression Results                            
Dep. Variable:             complexity   R-squared:                       0.211
Model:                            OLS   Adj. R-squared:                  0.211
Method:                 Least Squares   F-statistic:                     117.4
Date:                Wed, 21 Oct 2020   Prob (F-statistic):          1.95e-116
Time:                        13:14:27   Log-Likelihood:            -4.7907e+05
No. Observations:              212822   AIC:                         9.582e+05
Df Residuals:                  212807   BIC:                         9.583e+05
Df Model:                          14                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.8452      0.066     27.847      0.0

## Complexity as a function of occupations

In [45]:
# Replace `occupation` with one indicator column per value (occupation_<value>).
# dtype=int keeps the indicators numeric on every pandas version: since pandas 2.0
# get_dummies returns bool columns by default, which the formula interface is
# expected to treat as categorical and report under different term names.
data = pandas.get_dummies(data, columns=['occupation'], dtype=int)
# OLS of complexity on three occupation indicators, standard errors clustered by `plant`.
r = smf.ols("complexity ~ occupation_Managers+occupation_Engineers+occupation_Technicians", data=data).fit(cov_type='cluster',cov_kwds={'groups':data['plant']})
print(r.summary())

                            OLS Regression Results                            
Dep. Variable:             complexity   R-squared:                       0.114
Model:                            OLS   Adj. R-squared:                  0.114
Method:                 Least Squares   F-statistic:                     201.9
Date:                Wed, 21 Oct 2020   Prob (F-statistic):           2.65e-73
Time:                        13:14:28   Log-Likelihood:            -4.9138e+05
No. Observations:              212822   AIC:                         9.828e+05
Df Residuals:                  212818   BIC:                         9.828e+05
Df Model:                           3                                         
Covariance Type:              cluster                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                  0

## Complexity as a function of technology

In [46]:
# Parse the posting date once and reuse it for both the quarter and the year.
job_dates = pandas.to_datetime(data.JobDate)
# Quarter index counted from 2014 (Q1 2014 -> 1, Q1 2015 -> 5, ...), scaled by 1/10.
data['Qtr'] = (1/10)*(job_dates.dt.quarter+4*(job_dates.dt.year-2014))
# Replace `TECH` with one indicator column per value (TECH_<value>).
# dtype=int keeps the indicators numeric on every pandas version: since pandas 2.0
# get_dummies returns bool columns by default, which the formula interface is
# expected to treat as categorical and report under different term names.
data = pandas.get_dummies(data, columns=['TECH'], dtype=int)
# Interaction of the TECH_AM indicator with the quarter index.
data['TECH_AMQtr'] = data.TECH_AM * data.Qtr
# total_terms scaled by 0.1.
data['total_terms1'] = 0.1*data['total_terms']
# OLS with standard errors clustered by `plant`.
r = smf.ols("complexity ~ total_terms1+occupation_Managers+occupation_Engineers+occupation_Technicians+TECH_AM+Qtr+TECH_AMQtr", data=data).fit(cov_type='cluster',cov_kwds={'groups':data['plant']})
print(r.summary())

                            OLS Regression Results                            
Dep. Variable:             complexity   R-squared:                       0.422
Model:                            OLS   Adj. R-squared:                  0.422
Method:                 Least Squares   F-statistic:                     445.9
Date:                Wed, 21 Oct 2020   Prob (F-statistic):          2.97e-160
Time:                        13:14:30   Log-Likelihood:            -4.4599e+05
No. Observations:              212822   AIC:                         8.920e+05
Df Residuals:                  212814   BIC:                         8.921e+05
Df Model:                           7                                         
Covariance Type:              cluster                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
Intercept                 -1