In [1]:
# do we do both together, or separately (probably do separately, and if different results, do together)
# in other words, do we compute the one factor that is the greatest determinant for both
# or do we compute the independently compute the greatest determining factors for each category
    # the factors may or may not be the same
# ask which factors they're talking about

In [2]:
# Approach:
# combine apprentice and journeymen datasets (at this point, I think we've used both together for everything, lol)
# label all the different factors (contractor, total_employee (corresponds to project size), trade, craft level)
# use one-hot encoding (reason: label encoding [the alternative] works in a way 
    # which biases the influence of different features), and we don't want that
    # however, will likely need to address multicollinearity 
# resource: https://www.analyticsvidhya.com/blog/2020/03/one-hot-encoding-vs-label-encoding-using-scikit-learn/
# use a linear regression/neural network to compute which feature has the highest coefficient/weight associated with it?

# alternatively, follow blog post: https://towardsdatascience.com/3-essential-ways-to-calculate-feature-importance-in-python-2f9149592155
# my suggested method is closest to method #1

In [32]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [33]:
df1 = pd.read_csv('WorkforceUtilizationReport-Journeymen.csv')
df2 = pd.read_csv('WorkforceUtilizationReport-Apprentice.csv')
df = pd.concat([df1, df2], axis=0)
df.index = range(df1.shape[0] + df2.shape[0])

In [34]:
df.tail()

Unnamed: 0,MONTH,YEAR,PROJECT,PROJECT_CODE,CONTRACTOR,CONSTRUCTION_TRADE,CRAFT_LEVEL,TOTAL_EMPLOYEE,CAUCASIAN,AFRICAN_AMERICAN,HISPANIC,ASIAN,NATIVE_AMERICAN,OTHER,NOT_SPECIFIED,TOTAL_FEMALE,TOTAL_MALE,HOURS_WORKED_PER_MONTH
14839,12,2020,TRC1203 FC1 C Brighton Municipal Court Buildin...,TRC1203 FC1 C,Trac builders inc.,CARPENTER,Apprentice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HOURS_PER_MONTH
14840,12,2020,TRC1203 FC1 C Brighton Municipal Court Buildin...,TRC1203 FC1 C,Trac builders inc.,LABORER,Apprentice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HOURS_PER_MONTH
14841,12,2020,TRC1702 HC1 C Suffolk County Courthouse Elevat...,TRC1702 HC1 C,3 phase elevator,ELEVATOR CONSTRUCTOR,Apprentice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HOURS_PER_MONTH
14842,12,2020,TRC1702 HC1 C Suffolk County Courthouse Elevat...,TRC1702 HC1 C,3 phase elevator,ELEVATOR CONSTRUCTOR HELPER,Apprentice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HOURS_PER_MONTH
14843,12,2020,TRC1702 HC1 C Suffolk County Courthouse Elevat...,TRC1702 HC1 C,"Annese electrical services, inc.",ELECTRICIAN,Apprentice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HOURS_PER_MONTH


In [35]:
df.columns

Index(['MONTH', 'YEAR', 'PROJECT', 'PROJECT_CODE', 'CONTRACTOR',
       'CONSTRUCTION_TRADE', 'CRAFT_LEVEL', 'TOTAL_EMPLOYEE', 'CAUCASIAN',
       'AFRICAN_AMERICAN', 'HISPANIC', 'ASIAN', 'NATIVE_AMERICAN', 'OTHER',
       'NOT_SPECIFIED', 'TOTAL_FEMALE', 'TOTAL_MALE',
       'HOURS_WORKED_PER_MONTH'],
      dtype='object')

In [36]:
poc_cols = ['AFRICAN_AMERICAN', 'HISPANIC', 'ASIAN', 'NATIVE_AMERICAN', 'OTHER']
df['POC'] = 0
for i in range(df.shape[0]):
    sum = 0
    for col in poc_cols:
        sum += df.iloc[i][col]
    df.loc[i, ['POC']] = sum

In [37]:
df.index.is_unique

True

In [38]:
df.head()

Unnamed: 0,MONTH,YEAR,PROJECT,PROJECT_CODE,CONTRACTOR,CONSTRUCTION_TRADE,CRAFT_LEVEL,TOTAL_EMPLOYEE,CAUCASIAN,AFRICAN_AMERICAN,HISPANIC,ASIAN,NATIVE_AMERICAN,OTHER,NOT_SPECIFIED,TOTAL_FEMALE,TOTAL_MALE,HOURS_WORKED_PER_MONTH,POC
0,8,2017,AEP1407E UT1 C AEP Utility Contract-Cape Cod,AEP 1407E UT1,Rise engineering,INSULATOR (PIPES & TANKS),Journeymen,16.0,0.0,0.0,8.0,8.0,0.0,0.0,0.0,0.0,16.0,HOURS_PER_MONTH,16.0
1,8,2017,CME0902 DC1 C Chief Medical Examiner Office We...,CME0902 DC1 CM,Crestview construction & trucking inc.,BACKHOE/FRONT-END LOADER,Journeymen,247.5,247.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,247.5,HOURS_PER_MONTH,0.0
2,8,2017,CME0902 DC1 C Chief Medical Examiner Office We...,CME0902 DC1 CM,Crestview construction & trucking inc.,DRIVER / GROUNDMAN CDL,Journeymen,127.0,127.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,127.0,HOURS_PER_MONTH,0.0
3,8,2017,CME0902 DC1 C Chief Medical Examiner Office We...,CME0902 DC1 CM,Crestview construction & trucking inc.,EQUIPMENT OPERATOR (Class B CDL),Journeymen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,HOURS_PER_MONTH,0.0
4,8,2017,CME0902 DC1 C Chief Medical Examiner Office We...,CME0902 DC1 CM,Crestview construction & trucking inc.,LABORER,Journeymen,61.0,61.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,61.0,HOURS_PER_MONTH,0.0


In [39]:
to_transform = ['CONTRACTOR', 'CONSTRUCTION_TRADE', 'CRAFT_LEVEL']

In [40]:
df = df.drop(['PROJECT_CODE', 'MONTH', 'YEAR', 'PROJECT', 'CAUCASIAN',
       'AFRICAN_AMERICAN', 'HISPANIC', 'ASIAN', 'NATIVE_AMERICAN', 'OTHER',
       'NOT_SPECIFIED', 'TOTAL_MALE', 'HOURS_WORKED_PER_MONTH'], axis=1)
df.head()

Unnamed: 0,CONTRACTOR,CONSTRUCTION_TRADE,CRAFT_LEVEL,TOTAL_EMPLOYEE,TOTAL_FEMALE,POC
0,Rise engineering,INSULATOR (PIPES & TANKS),Journeymen,16.0,0.0,16.0
1,Crestview construction & trucking inc.,BACKHOE/FRONT-END LOADER,Journeymen,247.5,0.0,0.0
2,Crestview construction & trucking inc.,DRIVER / GROUNDMAN CDL,Journeymen,127.0,0.0,0.0
3,Crestview construction & trucking inc.,EQUIPMENT OPERATOR (Class B CDL),Journeymen,0.0,0.0,0.0
4,Crestview construction & trucking inc.,LABORER,Journeymen,61.0,0.0,0.0


In [42]:
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_contractor_cols = pd.DataFrame(OH_encoder.fit_transform(df[['CONTRACTOR']]))

# One-hot encoding removed index; put it back
OH_contractor_cols.index = df.index

# name columns
OH_contractor_cols.columns = ['CONTRACTOR' + str(i) for i in range(len(OH_contractor_cols.columns))]
OH_contractor_cols.head()

Unnamed: 0,CONTRACTOR0,CONTRACTOR1,CONTRACTOR2,CONTRACTOR3,CONTRACTOR4,CONTRACTOR5,CONTRACTOR6,CONTRACTOR7,CONTRACTOR8,CONTRACTOR9,...,CONTRACTOR451,CONTRACTOR452,CONTRACTOR453,CONTRACTOR454,CONTRACTOR455,CONTRACTOR456,CONTRACTOR457,CONTRACTOR458,CONTRACTOR459,CONTRACTOR460
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_trade_cols = pd.DataFrame(OH_encoder.fit_transform(df[['CONSTRUCTION_TRADE']]))

# One-hot encoding removed index; put it back
OH_trade_cols.index = df.index

# name columns
OH_trade_cols.columns = ['CONSTRUCTION_TRADE' + str(i) for i in range(len(OH_trade_cols.columns))]
OH_trade_cols.head()

Unnamed: 0,CONSTRUCTION_TRADE0,CONSTRUCTION_TRADE1,CONSTRUCTION_TRADE2,CONSTRUCTION_TRADE3,CONSTRUCTION_TRADE4,CONSTRUCTION_TRADE5,CONSTRUCTION_TRADE6,CONSTRUCTION_TRADE7,CONSTRUCTION_TRADE8,CONSTRUCTION_TRADE9,...,CONSTRUCTION_TRADE71,CONSTRUCTION_TRADE72,CONSTRUCTION_TRADE73,CONSTRUCTION_TRADE74,CONSTRUCTION_TRADE75,CONSTRUCTION_TRADE76,CONSTRUCTION_TRADE77,CONSTRUCTION_TRADE78,CONSTRUCTION_TRADE79,CONSTRUCTION_TRADE80
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_craft_cols = pd.DataFrame(OH_encoder.fit_transform(df[['CRAFT_LEVEL']]))

# One-hot encoding removed index; put it back
OH_craft_cols.index = df.index

# name columns
OH_craft_cols.columns = ['CRAFT_LEVEL' + str(i) for i in range(len(OH_craft_cols.columns))]
OH_craft_cols.head()

Unnamed: 0,CRAFT_LEVEL0,CRAFT_LEVEL1
0,0.0,1.0
1,0.0,1.0
2,0.0,1.0
3,0.0,1.0
4,0.0,1.0


In [47]:
df_poc = df.drop(['CONTRACTOR', 'CONSTRUCTION_TRADE', 'CRAFT_LEVEL'], axis=1)
df_poc = pd.concat([df_poc, OH_contractor_cols, OH_trade_cols, OH_craft_cols], axis=1)

df_women = df.drop(['CONTRACTOR', 'CONSTRUCTION_TRADE', 'CRAFT_LEVEL'], axis=1)
df_women = pd.concat([df_women, OH_contractor_cols, OH_trade_cols, OH_craft_cols], axis=1)

df_women.head()

Unnamed: 0,TOTAL_EMPLOYEE,TOTAL_FEMALE,POC,CONTRACTOR0,CONTRACTOR1,CONTRACTOR2,CONTRACTOR3,CONTRACTOR4,CONTRACTOR5,CONTRACTOR6,...,CONSTRUCTION_TRADE73,CONSTRUCTION_TRADE74,CONSTRUCTION_TRADE75,CONSTRUCTION_TRADE76,CONSTRUCTION_TRADE77,CONSTRUCTION_TRADE78,CONSTRUCTION_TRADE79,CONSTRUCTION_TRADE80,CRAFT_LEVEL0,CRAFT_LEVEL1
0,16.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,247.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,127.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,61.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [66]:
import statsmodels.api as sm

# Function to calculate VIF
def calculate_vif(data):
    vif_df = pd.DataFrame(columns = ['Var', 'Vif'])
    x_var_names = data.columns
    for i in range(0, x_var_names.shape[0]):
        y = data[x_var_names[i]]
        x = data[x_var_names.drop([x_var_names[i]])]
        r_squared = sm.OLS(y,x).fit().rsquared
        vif = round(1/(1-r_squared),2)
        vif_df.loc[i] = [x_var_names[i], vif]
    return vif_df.sort_values(by = 'Vif', axis = 0, ascending=False, inplace=False)

X=df_poc.drop(['POC', 'CONTRACTOR1', 'CONSTRUCTION_TRADE0', 'CONSTRUCTION_TRADE54', 'CONTRACTOR271', 'CONTRACTOR38', 'CONSTRUCTION_TRADE34'],axis=1)
res = calculate_vif(X)

In [68]:
res['Vif'].value_counts()

1.27     26
1.55     14
1.54     10
1.28      9
1.85      7
         ..
26.03     1
6.07      1
4.07      1
16.45     1
38.75     1
Name: Vif, Length: 339, dtype: int64

In [79]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X = df_poc.drop(['POC', 'CONTRACTOR1', 'CONSTRUCTION_TRADE0', 'CONSTRUCTION_TRADE54', 'CONTRACTOR271', 'CONTRACTOR38', 'CONSTRUCTION_TRADE34'], axis=1)
y = df_poc['POC']
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

#ss = StandardScaler()
#X_scaled = ss.fit_transform(X)

In [80]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X, y)
importances = pd.DataFrame(data={
    'Attribute': X.columns,
    'Importance': model.coef_[0]
})

In [81]:
import matplotlib.pyplot as plt

In [82]:
importances.head()

Unnamed: 0,Attribute,Importance
0,TOTAL_EMPLOYEE,0.162623
1,TOTAL_FEMALE,0.162623
2,CONTRACTOR0,0.162623
3,CONTRACTOR2,0.162623
4,CONTRACTOR3,0.162623


In [83]:
importances.sort_values(by=['Importance'])

Unnamed: 0,Attribute,Importance
0,TOTAL_EMPLOYEE,0.162623
368,CONTRACTOR369,0.162623
367,CONTRACTOR368,0.162623
366,CONTRACTOR367,0.162623
365,CONTRACTOR366,0.162623
...,...,...
172,CONTRACTOR172,0.162623
171,CONTRACTOR171,0.162623
170,CONTRACTOR170,0.162623
184,CONTRACTOR184,0.162623
