In [1]:
import pandas as pd
import numpy as np

import industry_type

pd.set_option('display.max_columns', None)

In [2]:
state = 'tennessee'
state_sentence_case = f'{state[0].capitalize()}{state[1:]}'
national_file_path = '../generation/national_industry_analysis.csv'
national_detailed_file_path = '../generation/national_summary_detailed.csv'
state_results_csv_path = f'../generation/{state}_results.csv'
soc_mapping_path = '../data/soc_mapping.csv'
wef_risk_path = '../data/skills/skills_based_risk.csv'
industry_type_path = 'modified_bls_super_sector_df.csv'

data_v2_path = '../data_v2/industry_risk_analysis.json'

In [3]:
national_df = pd.read_csv(national_file_path)
national_detailed_df = pd.read_csv(national_detailed_file_path)
soc_mapping_df = pd.read_csv(soc_mapping_path)
wef_risk_df = pd.read_csv(wef_risk_path)
industry_type_df = pd.read_csv(industry_type_path)

In [4]:
industry_type_df = industry_type_df.drop(columns=['Title'])
industry_type_df['OCC_CODE'] = industry_type_df['O*NET-SOC Code'].apply(lambda x: x.split('.')[0] if '.' in x else x)
industry_type_df = industry_type_df.drop(columns=['O*NET-SOC Code'])
industry_type_df = industry_type_df.drop_duplicates(subset=['OCC_CODE'], keep='first')

In [5]:
national_detailed_df = national_detailed_df.merge(industry_type_df, on='OCC_CODE', how='left')

In [6]:
def get_minor_group_title(code):
    if type(code) != str:
        code = str(code)
    if '-' in code:
        code = code.replace('-', '')[:3]
    
    code = code[:3]
    code = int(code)

    
    df = soc_mapping_df[soc_mapping_df['normalized_minor_code'] == code]
    if df.empty:
        return None
    else:
        return df['minor_title'].values[0]

def get_major_group_title(code):
    if type(code) != str:
        code = str(code)
    if '-' in code:
        code = code.replace('-', '')[:2]
    
    code = code[:3]
    code = int(code)

    
    df = soc_mapping_df[soc_mapping_df['normalized_major_code'] == code]
    if df.empty:
        return None
    else:
        return df['major_title'].values[0]

def get_broad_group_title(code):
    if type(code) != str:
        code = str(code)
    if '-' in code:
        code = code.replace('-', '')[:1]
    
    code = code[:4]
    code = int(code)

    
    df = soc_mapping_df[soc_mapping_df['normalized_broad_code'] == code]
    if df.empty:
        return None
    else:
        return df['broad_title'].values[0]

In [7]:
national_detailed_df['minor_group_name'] = national_detailed_df['minor_group'].apply(get_minor_group_title)
national_detailed_df['major_group_name'] = national_detailed_df['major_group'].apply(get_major_group_title)
national_detailed_df['broad_group_name'] = national_detailed_df['OCC_CODE'].apply(get_broad_group_title)

In [8]:
national_df.head()

Unnamed: 0,minor_group,minor_group_name,TOT_EMP,economic_value,automation_susceptibility,enhanced_automation_risk,is_potentially_at_risk,is_currently_at_risk,estimated_zapier_apps,zapier_apps_per_worker,at_risk_soc_codes,potential_econ_value_at_risk,current_econ_value_at_risk,minor_potential_index,minor_iceberg_index,automation_gap,weighted_iceberg_index,state
0,111,111,36310.0,4652278000.0,37.948976,28.461732,0,0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,Alabama
1,112,112,5370.0,719036900.0,43.301277,32.475958,1,0,0.0,0.0,112021.0,195070800.0,0.0,27.129456,0.0,27.129456,0.0,Alabama
2,113,113,20700.0,2779513000.0,35.604256,26.703192,1,0,0.0,0.0,113021.0,819366000.0,0.0,29.47876,0.0,29.47876,0.0,Alabama
3,119,119,42490.0,4119062000.0,36.534983,27.401238,2,0,0.0,0.0,119041119121.0,492811500.0,0.0,11.964167,0.0,11.964167,0.0,Alabama
4,131,131,61570.0,4881716000.0,38.441509,28.831132,1,0,0.0,0.0,131111.0,655233700.0,0.0,13.4222,0.0,13.4222,0.0,Alabama


In [9]:
national_detailed_df.head()

Unnamed: 0,AREA,AREA_TITLE,AREA_TYPE,PRIM_STATE,NAICS,NAICS_TITLE,I_GROUP,OWN_CODE,OCC_CODE,OCC_TITLE,O_GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,LOC_QUOTIENT,PCT_TOTAL,PCT_RPT,H_MEAN,A_MEAN,MEAN_PRSE,H_PCT10,H_PCT25,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY,SOC_Code_Cleaned,tech_intensity_score,automation_susceptibility,automation_risk_category,major_group,estimated_zapier_apps_major,major_group_emp_total,emp_proportion,estimated_zapier_apps,zapier_apps_per_worker,zapier_apps_normalized,automation_susceptibility_norm,enhanced_automation_risk,enhanced_risk_category,economic_value,is_potentially_at_risk,is_currently_at_risk,potential_index,econ_potential_index,minor_group,minor_group_name,state,Modified BLS Super Sector,major_group_name,broad_group_name
0,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,11-1011,Chief Executives,detailed,720.0,6.8,0.348,0.25,,,106.26,221030.0,5.8,31.59,59.60,79.48,102.01,#,65700,123960,165320.0,212180,#,,,111011,12.305882,37.703481,Low,11,0.0,104870.0,0.006866,0.0,0.0,0,0.377035,28.277611,Low,159141600.0,False,False,0.0,0.031151,111,Top Executives,Alabama,MANAGEMENT,Management Occupations,
1,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,11-1021,General and Operations Managers,detailed,34450.0,2.7,16.781,0.73,,,62.17,129310.0,1.1,23.11,34.74,49.67,78.25,112.54,48080,72260,103320.0,162760,234080,,,111021,14.610314,43.238114,Moderate,11,0.0,104870.0,0.328502,0.0,0.0,0,0.432381,32.428586,Low,4454730000.0,False,False,0.0,1.0,111,Top Executives,Alabama,MANAGEMENT,Management Occupations,
2,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,11-1031,Legislators,detailed,1140.0,9.1,0.555,2.6,,,*,33690.0,5.1,*,*,*,*,*,18320,19670,24470.0,45050,55070,True,,111031,10.412132,32.905331,Low,11,0.0,104870.0,0.010871,0.0,0.0,0,0.329053,24.678998,Low,38406600.0,False,False,0.0,0.006561,111,Top Executives,Alabama,,Management Occupations,
3,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,11-2011,Advertising and Promotions Managers,detailed,70.0,16.5,0.032,0.24,,,53.99,112290.0,3.3,36.77,39.88,50.37,64.03,71.21,76480,82950,104770.0,133170,148110,,,112011,13.092828,40.677276,Moderate,11,0.0,104870.0,0.000667,0.0,0.0,0,0.406773,30.507957,Low,7860300.0,False,False,0.0,0.00166,112,"Advertising, Marketing, Promotions, Public Rel...",Alabama,MEDIA_AND_COMMUNICATIONS_SERVICES,Management Occupations,
4,1,Alabama,2,AL,0,Cross-industry,cross-industry,1235,11-2021,Marketing Managers,detailed,1490.0,3.7,0.728,0.3,,,62.94,130920.0,1.9,31.53,39.16,54.55,78.95,105.92,65580,81450,113460.0,164210,220320,,,112021,18.60031,55.7113,Moderate,11,0.0,104870.0,0.014208,0.0,0.0,0,0.557113,41.783475,Moderate,195070800.0,True,False,100.0,0.056422,112,"Advertising, Marketing, Promotions, Public Rel...",Alabama,MEDIA_AND_COMMUNICATIONS_SERVICES,Management Occupations,


In [10]:
wef_risk_df['OCC_CODE'] = wef_risk_df['O*NET-SOC Code'].apply(lambda x: x.split('.')[0] if '.' in x else x)
wef_risk_df = wef_risk_df.drop_duplicates(subset=['OCC_CODE']) # as we have detailed jobs

# Calculate the 80th percentile threshold (top 20%)
perc_ile = 0.8
threshold = wef_risk_df['automation_risk_score'].quantile(perc_ile)

# Create the new field based on whether each row's score is above the threshold
wef_risk_df['perc_ile_thresholded_risk'] = wef_risk_df['automation_risk_score'] >= threshold

In [11]:
wef_risk_df[wef_risk_df['perc_ile_thresholded_risk'] == True]

Unnamed: 0,O*NET-SOC Code,Title,basic_skills,cognitive_skills,social_skills,operations_skills,maintenance_skills,technical_skills,management_skills,automation_risk,automation_risk_score,OCC_CODE,perc_ile_thresholded_risk
35,11-9121.00,Natural Sciences Managers,3.9400,3.697143,3.230000,2.0300,1.0300,2.9350,3.080000,51.8,61.6,11-9121,True
45,11-9199.01,Regulatory Affairs Managers,4.0925,3.464286,3.146667,1.2825,1.0000,2.0650,3.040000,48.8,58.7,11-9199,True
55,13-1031.00,"Claims Adjusters, Examiners, and Investigators",3.9050,3.125714,3.041667,1.8100,1.0300,2.0325,2.583333,48.8,58.7,13-1031,True
64,13-1051.00,Cost Estimators,3.7825,3.197143,2.956667,1.2200,1.0300,2.0950,2.895000,50.6,60.4,13-1051,True
74,13-1141.00,"Compensation, Benefits, and Job Analysis Speci...",3.7550,3.124286,2.980000,1.2200,1.0000,2.0300,2.645000,50.7,60.5,13-1141,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,45-2041.00,"Graders and Sorters, Agricultural Products",2.5000,2.070000,1.958333,1.6250,1.0000,1.2175,1.561667,50.1,59.9,45-2041,True
653,47-4011.00,Construction and Building Inspectors,3.6550,3.108571,2.936667,2.3775,1.0000,2.2175,2.626667,50.0,59.8,47-4011,True
740,51-3092.00,Food Batchmakers,3.0000,2.644286,2.331667,2.7800,1.4350,1.8425,1.936667,48.9,58.8,51-3092,True
773,51-6051.00,"Sewers, Hand",2.4050,2.411429,2.020000,2.1875,1.0950,1.3750,1.980000,49.3,59.2,51-6051,True


In [12]:
print(f' shape national_detailed_df - before merge {national_detailed_df.shape}')
national_detailed_df = national_detailed_df.merge(
    wef_risk_df[['OCC_CODE', 'automation_risk_score', 'perc_ile_thresholded_risk']],
    on='OCC_CODE',
    how='left'  # or 'right' or 'inner' depending on your needs
)
print(f' shape national_detailed_df - after merge {national_detailed_df.shape}')
national_detailed_df['automation_risk_score'].fillna(national_detailed_df['automation_risk_score'].median(), inplace=True)
national_detailed_df['perc_ile_thresholded_risk'].fillna(False, inplace=True)

 shape national_detailed_df - before merge (36434, 57)
 shape national_detailed_df - after merge (36434, 59)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  national_detailed_df['automation_risk_score'].fillna(national_detailed_df['automation_risk_score'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  national_detailed_df['perc_ile_thresholded_risk'].fillna(False, inplace=True)
  national_detailed_df['perc_il

# State Filtering

In [13]:
state_filtered_df = national_detailed_df[national_detailed_df['AREA_TITLE'] == state_sentence_case]

In [14]:
state_filtered_df.shape

(748, 59)

In [15]:
print(f"{state_filtered_df['economic_value'].sum():02}")

178513028800.0


## Risked Jobs in state

In [16]:
state_risk_df = state_filtered_df[state_filtered_df['perc_ile_thresholded_risk'] == True]

In [17]:
state_risk_df.columns

Index(['AREA', 'AREA_TITLE', 'AREA_TYPE', 'PRIM_STATE', 'NAICS', 'NAICS_TITLE',
       'I_GROUP', 'OWN_CODE', 'OCC_CODE', 'OCC_TITLE', 'O_GROUP', 'TOT_EMP',
       'EMP_PRSE', 'JOBS_1000', 'LOC_QUOTIENT', 'PCT_TOTAL', 'PCT_RPT',
       'H_MEAN', 'A_MEAN', 'MEAN_PRSE', 'H_PCT10', 'H_PCT25', 'H_MEDIAN',
       'H_PCT75', 'H_PCT90', 'A_PCT10', 'A_PCT25', 'A_MEDIAN', 'A_PCT75',
       'A_PCT90', 'ANNUAL', 'HOURLY', 'SOC_Code_Cleaned',
       'tech_intensity_score', 'automation_susceptibility',
       'automation_risk_category', 'major_group',
       'estimated_zapier_apps_major', 'major_group_emp_total',
       'emp_proportion', 'estimated_zapier_apps', 'zapier_apps_per_worker',
       'zapier_apps_normalized', 'automation_susceptibility_norm',
       'enhanced_automation_risk', 'enhanced_risk_category', 'economic_value',
       'is_potentially_at_risk', 'is_currently_at_risk', 'potential_index',
       'econ_potential_index', 'minor_group', 'minor_group_name', 'state',
       'Modified BL

In [18]:
# First, let's calculate the total TOT_EMP for each minor_group
total_emp_by_minor = state_filtered_df.groupby(['Modified BLS Super Sector'])['TOT_EMP'].sum()

# Then, calculate the TOT_EMP for jobs with perc_ile_thresholded_risk=True
risk_emp_by_minor = state_risk_df[state_risk_df['perc_ile_thresholded_risk']].groupby(['Modified BLS Super Sector'])['TOT_EMP'].sum()

# Calculate the percentage
percentage_at_risk = (risk_emp_by_minor / total_emp_by_minor * 100).replace(np.nan, 0).sort_values(ascending=False)

# Display the results
print("\nPercentage of employees at risk by minor group:")


Percentage of employees at risk by minor group:


In [19]:
percentage_at_risk.sort_values(ascending=False)[:25]

Modified BLS Super Sector
ACCOUNTING_AND_AUDITING_SERVICES         100.000000
SCIENTIFIC_RESEARCH_AND_DEVELOPMENT       88.036254
SPECIAL_INDUSTRIES                        79.870130
FINANCE_AND_INSURANCE                     52.323428
ARCHITECTURAL_ENGINEERING_SERVICES        45.707657
MEDIA_AND_COMMUNICATIONS_SERVICES         34.505723
TECHNOLOGY_AND_SOFTWARE_DEVELOPMENT       33.305636
ENERGY                                    31.777625
LEGAL_SERVICES                            31.611358
HEALTHCARE                                17.154201
MANUFACTURING                             12.256243
MANAGEMENT                                11.862512
ADMINISTRATIVE_SUPPORT_WASTE_SERVICES      9.870227
MINING_OIL_GAS_EXTRACTION                  8.875740
EDUCATIONAL_SERVICES                       8.376485
AGRICULTURE_FORESTRY_FISHING_HUNTING       5.807623
CONSTRUCTION                               5.718196
RETAIL_TRADE                               4.901889
PASSENGER_TRANSPORTATION              

In [26]:
industry_wise_economic_value_df = state_filtered_df.groupby(['Modified BLS Super Sector'])['economic_value'].sum().sort_values(ascending=False)

In [27]:
industry_wise_economic_value_df

Modified BLS Super Sector
HEALTHCARE                               2.420687e+10
MANUFACTURING                            1.816923e+10
RETAIL_TRADE                             1.572902e+10
MANAGEMENT                               1.543603e+10
ADMINISTRATIVE_SUPPORT_WASTE_SERVICES    1.401958e+10
TRAVEL_AND_ENTERTAINMENT_SERVICES        1.073342e+10
CONSTRUCTION                             1.070611e+10
PACKAGING_AND_GOODS_TRANSPORTATION       1.022074e+10
FINANCE_AND_INSURANCE                    9.032129e+09
EDUCATIONAL_SERVICES                     8.413173e+09
TECHNOLOGY_AND_SOFTWARE_DEVELOPMENT      6.744705e+09
MEDIA_AND_COMMUNICATIONS_SERVICES        3.640933e+09
LEGAL_SERVICES                           2.676874e+09
GOVERNMENT                               2.286122e+09
PASSENGER_TRANSPORTATION                 2.021710e+09
ACCOUNTING_AND_AUDITING_SERVICES         1.989561e+09
SCIENTIFIC_RESEARCH_AND_DEVELOPMENT      1.339806e+09
ENERGY                                   1.064088e+09
MA

In [29]:
industry_wise_employment_df = state_filtered_df.groupby(['Modified BLS Super Sector'])['TOT_EMP'].sum().sort_values(ascending=False)
industry_wise_employment_df

Modified BLS Super Sector
RETAIL_TRADE                             410250.0
MANUFACTURING                            343580.0
HEALTHCARE                               342540.0
TRAVEL_AND_ENTERTAINMENT_SERVICES        340490.0
ADMINISTRATIVE_SUPPORT_WASTE_SERVICES    295130.0
PACKAGING_AND_GOODS_TRANSPORTATION       222620.0
CONSTRUCTION                             204610.0
MANAGEMENT                               143140.0
EDUCATIONAL_SERVICES                     131320.0
FINANCE_AND_INSURANCE                    113410.0
TECHNOLOGY_AND_SOFTWARE_DEVELOPMENT       72210.0
MEDIA_AND_COMMUNICATIONS_SERVICES         48050.0
ACCOUNTING_AND_AUDITING_SERVICES          42860.0
GOVERNMENT                                41770.0
PASSENGER_TRANSPORTATION                  32200.0
LEGAL_SERVICES                            29230.0
SCIENTIFIC_RESEARCH_AND_DEVELOPMENT       16550.0
AGRICULTURE_FORESTRY_FISHING_HUNTING      16530.0
ENERGY                                    14570.0
MANAGEMENT_CONSULTING_SE

## Industry based employment risk

In [30]:
grouped_employee_risk_by_industry = state_risk_df.groupby(['Modified BLS Super Sector'])['TOT_EMP'].sum().sort_values(ascending=False)
print("\nTotal employees by industry:")
print(grouped_employee_risk_by_industry)


Total employees by industry:
Modified BLS Super Sector
FINANCE_AND_INSURANCE                    59340.0
HEALTHCARE                               58760.0
ACCOUNTING_AND_AUDITING_SERVICES         42860.0
MANUFACTURING                            42110.0
ADMINISTRATIVE_SUPPORT_WASTE_SERVICES    29130.0
TECHNOLOGY_AND_SOFTWARE_DEVELOPMENT      24050.0
RETAIL_TRADE                             20110.0
MANAGEMENT                               16980.0
MEDIA_AND_COMMUNICATIONS_SERVICES        16580.0
SCIENTIFIC_RESEARCH_AND_DEVELOPMENT      14570.0
CONSTRUCTION                             11700.0
EDUCATIONAL_SERVICES                     11000.0
LEGAL_SERVICES                            9240.0
ENERGY                                    4630.0
ARCHITECTURAL_ENGINEERING_SERVICES        3940.0
SPECIAL_INDUSTRIES                        1230.0
AGRICULTURE_FORESTRY_FISHING_HUNTING       960.0
PASSENGER_TRANSPORTATION                   310.0
MINING_OIL_GAS_EXTRACTION                  150.0
Name: TOT_EMP

In [32]:
grouped_econ_value_risk_by_industry = state_risk_df.groupby(['Modified BLS Super Sector'])['economic_value'].sum().sort_values(ascending=False)
print("\nTotal economic value by industry:")
print(grouped_econ_value_risk_by_industry)


Total economic value by industry:
Modified BLS Super Sector
HEALTHCARE                               4.982633e+09
FINANCE_AND_INSURANCE                    4.303319e+09
MANUFACTURING                            2.244991e+09
TECHNOLOGY_AND_SOFTWARE_DEVELOPMENT      2.027592e+09
ACCOUNTING_AND_AUDITING_SERVICES         1.989561e+09
MANAGEMENT                               1.658437e+09
ADMINISTRATIVE_SUPPORT_WASTE_SERVICES    1.463902e+09
SCIENTIFIC_RESEARCH_AND_DEVELOPMENT      1.203017e+09
MEDIA_AND_COMMUNICATIONS_SERVICES        1.118580e+09
EDUCATIONAL_SERVICES                     9.384155e+08
CONSTRUCTION                             8.796295e+08
RETAIL_TRADE                             8.154605e+08
LEGAL_SERVICES                           4.817621e+08
ENERGY                                   4.473154e+08
ARCHITECTURAL_ENGINEERING_SERVICES       3.241147e+08
SPECIAL_INDUSTRIES                       6.065130e+07
AGRICULTURE_FORESTRY_FISHING_HUNTING     4.762850e+07
PASSENGER_TRANSPORTAT

## Top 5 industries within industry type analysis

In [33]:
def get_top_5_by_industry(df, group_col='Modified BLS Super Sector', dict_dump_mode=False):
    """
    Get top 5 rows by TOT_EMP and economic_value for each industry group
    """
    results = {}
    
    for industry in df[group_col].unique():
        if pd.notna(industry):  # Skip NaN values
            industry_data = df[df[group_col] == industry]

            if not dict_dump_mode:
                results[industry] = {
                    'top_5_by_employment': industry_data.nlargest(5, 'TOT_EMP')[['OCC_CODE', 'OCC_TITLE', 'TOT_EMP', 'economic_value']],
                    'top_5_by_economic_value': industry_data.nlargest(5, 'economic_value')[['OCC_CODE', 'OCC_TITLE', 'TOT_EMP', 'economic_value']]
                }
            else:
                results[industry] = {
                    'top_5_by_employment': industry_data.nlargest(5, 'TOT_EMP')[['OCC_CODE', 'OCC_TITLE', 'TOT_EMP', 'economic_value']].to_dict(orient='records'),
                    'top_5_by_economic_value': industry_data.nlargest(5, 'economic_value')[['OCC_CODE', 'OCC_TITLE', 'TOT_EMP', 'economic_value']].to_dict(orient='records')
                }
    
    return results

# Usage
industry_top_5 = get_top_5_by_industry(state_risk_df)

# Display results
for industry, data in industry_top_5.items():
    print(f"\n{'='*80}")
    print(f"INDUSTRY: {industry}")
    print(f"{'='*80}")
    
    print(f"\nTop 5 by Employment:")
    print(data['top_5_by_employment'].to_string(index=False))
    
    print(f"\nTop 5 by Economic Value:")
    print(data['top_5_by_economic_value'].to_string(index=False))

industry_top_5 = get_top_5_by_industry(state_risk_df, dict_dump_mode=True)


INDUSTRY: SCIENTIFIC_RESEARCH_AND_DEVELOPMENT

Top 5 by Employment:
OCC_CODE                                  OCC_TITLE  TOT_EMP  economic_value
 19-1042 Medical Scientists, Except Epidemiologists   4350.0     471105000.0
 15-2031               Operations Research Analysts   1500.0     118965000.0
 19-4021                     Biological Technicians   1390.0      85262600.0
 43-9111                     Statistical Assistants   1290.0      67531500.0
 17-2081                    Environmental Engineers   1260.0      67611600.0

Top 5 by Economic Value:
OCC_CODE                                                  OCC_TITLE  TOT_EMP  economic_value
 19-1042                 Medical Scientists, Except Epidemiologists   4350.0     471105000.0
 15-2031                               Operations Research Analysts   1500.0     118965000.0
 19-2041 Environmental Scientists and Specialists, Including Health   1180.0      97196600.0
 19-4021                                     Biological Technicians   1

In [34]:
industry_top_5

{'SCIENTIFIC_RESEARCH_AND_DEVELOPMENT': {'top_5_by_employment': [{'OCC_CODE': '19-1042',
    'OCC_TITLE': 'Medical Scientists, Except Epidemiologists',
    'TOT_EMP': 4350.0,
    'economic_value': 471105000.0},
   {'OCC_CODE': '15-2031',
    'OCC_TITLE': 'Operations Research Analysts',
    'TOT_EMP': 1500.0,
    'economic_value': 118965000.0},
   {'OCC_CODE': '19-4021',
    'OCC_TITLE': 'Biological Technicians',
    'TOT_EMP': 1390.0,
    'economic_value': 85262600.0},
   {'OCC_CODE': '43-9111',
    'OCC_TITLE': 'Statistical Assistants',
    'TOT_EMP': 1290.0,
    'economic_value': 67531500.0},
   {'OCC_CODE': '17-2081',
    'OCC_TITLE': 'Environmental Engineers',
    'TOT_EMP': 1260.0,
    'economic_value': 67611600.0}],
  'top_5_by_economic_value': [{'OCC_CODE': '19-1042',
    'OCC_TITLE': 'Medical Scientists, Except Epidemiologists',
    'TOT_EMP': 4350.0,
    'economic_value': 471105000.0},
   {'OCC_CODE': '15-2031',
    'OCC_TITLE': 'Operations Research Analysts',
    'TOT_EMP': 1

In [35]:
state_industry_analysis = {
    "state": state,
    
    "total_jobs": state_filtered_df['TOT_EMP'].sum(),
    "total_economic_value": state_filtered_df['economic_value'].sum(),
    
    "total_jobs_at_risk": state_risk_df['TOT_EMP'].sum(),
    "total_jobs_at_risk_percentage": (state_risk_df['TOT_EMP'].sum() / state_filtered_df['TOT_EMP'].sum()) * 100,
    
    "total_economic_value_at_risk": state_risk_df['economic_value'].sum(),
    "total_economic_value_at_risk_percentage": (state_risk_df['economic_value'].sum() / state_filtered_df['economic_value'].sum()) * 100,

    "industry_impact_percentage": percentage_at_risk.to_dict(),
    "industry_wise_economic_value": industry_wise_economic_value_df.to_dict(),
    "industry_wise_employment": industry_wise_employment_df.to_dict(),
    "risked_top_5_industry_by_employment": grouped_employee_risk_by_industry.to_dict(),
    "risked_top_5_industry_by_economic_value": grouped_econ_value_risk_by_industry.to_dict(),
    "industry_wise_top_5_jobs": industry_top_5
}

In [36]:
import json
json.dump(state_industry_analysis, open(data_v2_path, 'w'), indent=2)

## Minor group based economic risk

In [37]:
grouped_economic_value = state_risk_df.groupby(['minor_group', 'minor_group_name'])['economic_value'].sum().sort_values(ascending=False)
print("\nTotal economic value by minor group:")
print(grouped_economic_value)


Total economic value by minor group:
minor_group  minor_group_name                                                     
291          Healthcare Diagnosing or Treating Practitioners                          3.411549e+09
132          Financial Specialists                                                    3.395926e+09
433          Financial Clerks                                                         2.583568e+09
151          Computer and Information Analyst                                         1.776512e+09
172          Engineers                                                                1.764058e+09
131          Business Operations Specialists                                          1.758233e+09
119          Other Management Occupations                                             1.658437e+09
435          Material Recording, Scheduling, Dispatching, and Distributing Workers    1.613095e+09
436          Secretaries and Administrative Assistants                                8

## Minor group based unemployment count risk

In [38]:
grouped_employee_risk_by_minor_group = state_risk_df.groupby(['minor_group', 'minor_group_name'])['TOT_EMP'].sum().sort_values(ascending=False)
print("\nTotal employees by minor group:")
print(grouped_employee_risk_by_minor_group)


Total employees by minor group:
minor_group  minor_group_name                                                     
433          Financial Clerks                                                         56430.0
132          Financial Specialists                                                    44070.0
435          Material Recording, Scheduling, Dispatching, and Distributing Workers    35470.0
131          Business Operations Specialists                                          24650.0
151          Computer and Information Analyst                                         21570.0
436          Secretaries and Administrative Assistants                                21550.0
291          Healthcare Diagnosing or Treating Practitioners                          19520.0
172          Engineers                                                                18610.0
119          Other Management Occupations                                             16980.0
537          Material Moving Workers  

## Major group instances

In [39]:
state_risk_df[state_risk_df['major_group'] == 43].sort_values(by='TOT_EMP', ascending=False)[['OCC_CODE', 'OCC_TITLE', 'TOT_EMP', 'economic_value', 'automation_risk_score']]

Unnamed: 0,OCC_CODE,OCC_TITLE,TOT_EMP,economic_value,automation_risk_score
30423,43-3031,"Bookkeeping, Accounting, and Auditing Clerks",42860.0,1989561000.0,62.5
30454,43-5071,"Shipping, Receiving, and Inventory Clerks",20110.0,815460500.0,58.3
30458,43-6013,Medical Secretaries and Administrative Assistants,18900.0,709695000.0,58.3
30453,43-5061,"Production, Planning, and Expediting Clerks",14030.0,716371800.0,59.0
30422,43-3021,Billing and Posting Clerks,10070.0,423242100.0,60.8
30438,43-4131,Loan Interviewers and Clerks,4390.0,192325900.0,59.5
30424,43-3051,Payroll and Timekeeping Clerks,3500.0,170765000.0,64.9
30434,43-4071,File Clerks,3350.0,128171000.0,58.2
30460,43-9021,Data Entry Keyers,2680.0,103260400.0,58.2
30457,43-6012,Legal Secretaries and Administrative Assistants,2650.0,119170500.0,58.4


## Minor group instances

In [40]:
state_risk_df[state_risk_df['minor_group'] == 353].sort_values(by='TOT_EMP', ascending=False)[['minor_group_name', 'OCC_CODE', 'OCC_TITLE', 'TOT_EMP', 'economic_value', 'automation_risk_score']]

Unnamed: 0,minor_group_name,OCC_CODE,OCC_TITLE,TOT_EMP,economic_value,automation_risk_score


In [41]:
state_risk_df[state_risk_df['minor_group'] == 537].sort_values(by='TOT_EMP', ascending=False)[['minor_group_name', 'OCC_CODE', 'OCC_TITLE', 'TOT_EMP', 'economic_value', 'automation_risk_score']]

Unnamed: 0,minor_group_name,OCC_CODE,OCC_TITLE,TOT_EMP,economic_value,automation_risk_score
30704,Material Moving Workers,53-7064,"Packers and Packagers, Hand",14760.0,487227600.0,58.7


In [42]:
state_filtered_df[state_filtered_df['minor_group'] == 537].sort_values(by='TOT_EMP', ascending=False)[['minor_group_name', 'OCC_CODE', 'OCC_TITLE', 'TOT_EMP', 'economic_value', 'automation_risk_score', 'perc_ile_thresholded_risk']]

Unnamed: 0,minor_group_name,OCC_CODE,OCC_TITLE,TOT_EMP,economic_value,automation_risk_score,perc_ile_thresholded_risk
30702,Material Moving Workers,53-7062,"Laborers and Freight, Stock, and Material Move...",113380.0,4247215000.0,52.3,False
30705,Material Moving Workers,53-7065,Stockers and Order Fillers,64660.0,2276032000.0,57.2,False
30700,Material Moving Workers,53-7051,Industrial Truck and Tractor Operators,19130.0,789495100.0,50.4,False
30704,Material Moving Workers,53-7064,"Packers and Packagers, Hand",14760.0,487227600.0,58.7,True
30701,Material Moving Workers,53-7061,Cleaners of Vehicles and Equipment,7380.0,243613800.0,49.6,False
30708,Material Moving Workers,53-7081,Refuse and Recyclable Material Collectors,2820.0,98841000.0,44.8,False
30698,Material Moving Workers,53-7021,Crane and Tower Operators,740.0,41728600.0,47.4,False
30710,Material Moving Workers,53-7199,"Material Moving Workers, All Other",740.0,32656200.0,54.0,False
30697,Material Moving Workers,53-7011,Conveyor Operators and Tenders,610.0,26492300.0,48.7,False
30703,Material Moving Workers,53-7063,Machine Feeders and Offbearers,590.0,23452500.0,52.8,False


In [43]:
state_risk_df[state_risk_df['minor_group'] == 513].sort_values(by='TOT_EMP', ascending=False)[['minor_group_name', 'OCC_CODE', 'OCC_TITLE', 'TOT_EMP', 'economic_value', 'automation_risk_score']]

Unnamed: 0,minor_group_name,OCC_CODE,OCC_TITLE,TOT_EMP,economic_value,automation_risk_score
30585,Food Processing Workers,51-3092,Food Batchmakers,3130.0,126232900.0,58.8
