# About

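This notebook converts the O*NET `Skills.xlsx` workbook into `Skills.csv`. It also flattens `soc_mapping.json` into `soc_mapping.csv`, adding hyphen-free ("normalized") SOC code columns.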

In [1]:
import pandas as pd

In [4]:
# Load the O*NET Skills workbook into a DataFrame.
skills_df = pd.read_excel(io='../data/ONET/Skills.xlsx')

In [5]:
# Preview the first five rows to sanity-check the load.
skills_df.head(n=5)

Unnamed: 0,O*NET-SOC Code,Title,Element ID,Element Name,Scale ID,Scale Name,Data Value,N,Standard Error,Lower CI Bound,Upper CI Bound,Recommend Suppress,Not Relevant,Date,Domain Source
0,11-1011.00,Chief Executives,2.A.1.a,Reading Comprehension,IM,Importance,4.12,8,0.125,3.88,4.37,N,,08/2023,Analyst
1,11-1011.00,Chief Executives,2.A.1.a,Reading Comprehension,LV,Level,4.62,8,0.183,4.2664,4.9836,N,N,08/2023,Analyst
2,11-1011.00,Chief Executives,2.A.1.b,Active Listening,IM,Importance,4.0,8,0.0,4.0,4.0,N,,08/2023,Analyst
3,11-1011.00,Chief Executives,2.A.1.b,Active Listening,LV,Level,4.75,8,0.1637,4.4292,5.0708,N,N,08/2023,Analyst
4,11-1011.00,Chief Executives,2.A.1.c,Writing,IM,Importance,4.12,8,0.125,3.88,4.37,N,,08/2023,Analyst


In [6]:
# Write the skills table to 'Skills.csv' (relative to the working directory),
# leaving the DataFrame's row index out of the file.
skills_df.to_csv(path_or_buf='Skills.csv', index=False)

In [7]:
# Derive the base SOC code by dropping everything from the first '.' onward
# (e.g. '11-1011.00' -> '11-1011'); codes without a '.' are kept unchanged.
# The vectorized .str accessor leaves missing values as NaN rather than raising.
skills_df['SOC_Code'] = skills_df['O*NET-SOC Code'].str.split('.', n=1).str[0]

In [8]:
# Report how many distinct Element Name values exist, then list them.
element_names = skills_df['Element Name'].unique()
print(f"unique count of Element Name: {len(element_names)}")
print(f"unique values of Element Name: {element_names}")


unique count of Element Name: 35
unique values of Element Name: ['Reading Comprehension' 'Active Listening' 'Writing' 'Speaking'
 'Mathematics' 'Science' 'Critical Thinking' 'Active Learning'
 'Learning Strategies' 'Monitoring' 'Social Perceptiveness' 'Coordination'
 'Persuasion' 'Negotiation' 'Instructing' 'Service Orientation'
 'Complex Problem Solving' 'Operations Analysis' 'Technology Design'
 'Equipment Selection' 'Installation' 'Programming'
 'Operations Monitoring' 'Operation and Control' 'Equipment Maintenance'
 'Troubleshooting' 'Repairing' 'Quality Control Analysis'
 'Judgment and Decision Making' 'Systems Analysis' 'Systems Evaluation'
 'Time Management' 'Management of Financial Resources'
 'Management of Material Resources' 'Management of Personnel Resources']


In [9]:
# Count the distinct values in the full 'O*NET-SOC Code' column.
onet_soc_codes = skills_df['O*NET-SOC Code'].unique()
print(f"unique count of O*NET-SOC Code: {len(onet_soc_codes)}")

unique count of O*NET-SOC Code: 879


In [11]:
# Count the distinct values in the derived 'SOC_Code' column.
soc_codes = skills_df['SOC_Code'].unique()
print(f"unique count of SOC code without .specification: {len(soc_codes)}")

unique count of SOC code without .specification: 763


In [15]:
import json

# Read the SOC mapping JSON file. JSON text is UTF-8, so pin the encoding
# instead of relying on the platform's locale default.
with open('../data/soc_mapping.json', 'r', encoding='utf-8') as f:
    soc_mapping = json.load(f)

# Each top-level key becomes a row (orient='index'); the key is then moved out
# of the index into a regular 'SOC Code' column. Built as a single expression
# so re-running the cell always starts from the freshly loaded mapping.
df = (
    pd.DataFrame.from_dict(soc_mapping, orient='index')
    .reset_index()
    .rename(columns={'index': 'SOC Code'})
)

print("\nColumns in the DataFrame:")
print(df.columns.tolist())


Columns in the DataFrame:
['SOC Code', 'detailed_title', 'major_code', 'major_title', 'minor_code', 'minor_title', 'broad_code', 'broad_title']


In [23]:
# Strip the hyphen once (literal match, not a regex) and reuse the result;
# the shorter codes are fixed-length prefixes of the hyphen-free SOC code.
soc_digits = df['SOC Code'].str.replace('-', '', regex=False)
df['normalized_SOC_Code'] = soc_digits
df['normalized_major_code'] = soc_digits.str[:2]
df['normalized_minor_code'] = soc_digits.str[:3]
# NOTE(review): a 4-character prefix maps distinct broad_code values
# (e.g. 11-1010 and 11-1020) to the same '1110' -- confirm that 4 is intended.
df['normalized_broad_code'] = soc_digits.str[:4]

In [24]:
# Preview the first five rows, including the new normalized code columns.
df.head(n=5)

Unnamed: 0,SOC Code,detailed_title,major_code,major_title,minor_code,minor_title,broad_code,broad_title,normalized_SOC_Code,normalized_major_code,normalized_minor_code,normalized_broad_code
0,11-1011,Chief Executives,11-0000,Management Occupations,11-1000,Top Executives,11-1010,Chief Executives,111011,11,111,1110
1,11-1021,General and Operations Managers,11-0000,Management Occupations,11-1000,Top Executives,11-1020,General and Operations Managers,111021,11,111,1110
2,11-1031,Legislators,11-0000,Management Occupations,11-1000,Top Executives,11-1030,Legislators,111031,11,111,1110
3,11-2011,Advertising and Promotions Managers,11-0000,Management Occupations,11-2000,"Advertising, Marketing, Promotions, Public Rel...",11-2010,Advertising and Promotions Managers,112011,11,112,1120
4,11-2021,Marketing Managers,11-0000,Management Occupations,11-2000,"Advertising, Marketing, Promotions, Public Rel...",11-2020,Marketing and Sales Managers,112021,11,112,1120


In [25]:
# Save the flattened SOC mapping as CSV, leaving the row index out of the file.
df.to_csv(path_or_buf='../data/soc_mapping.csv', index=False)