In [287]:
import pandas as pd
import numpy as np
import plotly.express as px

pd.set_option('display.max_columns', None)

In [288]:
# Load the raw postings CSV; the path is relative to the notebook's working directory.
raw_data = pd.read_csv("data/postings.csv")

In [289]:
# List the column names available in the raw postings frame.
raw_data.columns

Index(['job_id', 'company_name', 'title', 'description', 'max_salary',
       'pay_period', 'location', 'company_id', 'views', 'med_salary',
       'min_salary', 'formatted_work_type', 'applies', 'original_listed_time',
       'remote_allowed', 'job_posting_url', 'application_url',
       'application_type', 'expiry', 'closed_time',
       'formatted_experience_level', 'skills_desc', 'listed_time',
       'posting_domain', 'sponsored', 'work_type', 'currency',
       'compensation_type', 'normalized_salary', 'zip_code', 'fips'],
      dtype='object')

In [290]:
# Summarize each column's dtype and non-null count to see where values are missing.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 123849 entries, 0 to 123848
Data columns (total 31 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   job_id                      123849 non-null  int64  
 1   company_name                122130 non-null  object 
 2   title                       123849 non-null  object 
 3   description                 123842 non-null  object 
 4   max_salary                  29793 non-null   float64
 5   pay_period                  36073 non-null   object 
 6   location                    123849 non-null  object 
 7   company_id                  122132 non-null  float64
 8   views                       122160 non-null  float64
 9   med_salary                  6280 non-null    float64
 10  min_salary                  29793 non-null   float64
 11  formatted_work_type         123849 non-null  object 
 12  applies                     23320 non-null   float64
 13  original_liste

In [291]:
# Count the postings per pay period; these are the values convert_to_yearly_salary has to handle.
raw_data["pay_period"].value_counts()

pay_period
YEARLY      20628
HOURLY      14741
MONTHLY       518
WEEKLY        177
BIWEEKLY        9
Name: count, dtype: int64

In [292]:
# CREATE A NEW YEARLY SALARY COLUMN FROM EXISTING SALARY COLUMNS TO STANDARDIZE THIS METRIC ACROSS THE DATASET 

salary_columns: list[str] = ["max_salary","med_salary","min_salary"]
salary_period_type_column: str = "pay_period"

# Multiplier that turns a per-period amount into a yearly amount, keyed by
# pay_period value. 2080 = 40 * 52 (presumably a 40-hour week over 52 weeks).
# Any pay_period not listed here (e.g. YEARLY or missing) is left unscaled.
PAY_PERIODS_PER_YEAR: dict[str, int] = {
    "HOURLY": 2080,
    "WEEKLY": 52,
    "BIWEEKLY": 26,
    "MONTHLY": 12,
}


def convert_to_yearly_salary(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with salaries scaled to yearly amounts.

    The columns in ``salary_columns`` are multiplied by the factor that
    ``PAY_PERIODS_PER_YEAR`` associates with the row's pay period. Two columns
    are then appended:

    * ``standardized_salary``: ``med_salary`` when present, otherwise the
      mean of ``max_salary`` and ``min_salary`` (NaN if either is missing).
    * ``avg_min_max``: the mean of ``max_salary`` and ``min_salary``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the ``pay_period`` column and the three salary
        columns. It is not modified.

    Returns
    -------
    pd.DataFrame
        A new frame with scaled salary columns and the two added columns.
    """
    df = df.copy()

    # The period masks are mutually exclusive, so each row is scaled at most once.
    for period, periods_per_year in PAY_PERIODS_PER_YEAR.items():
        is_period = df[salary_period_type_column] == period
        df.loc[is_period, salary_columns] = df.loc[is_period, salary_columns] * periods_per_year

    df["standardized_salary"] = df["med_salary"]
    df["avg_min_max"] = (df["max_salary"] + df["min_salary"]) / 2
    # Fall back to the min/max midpoint only where no median salary was given.
    df["standardized_salary"] = df["standardized_salary"].fillna(df["avg_min_max"])

    return df

In [293]:
# convert_to_yearly_salary works on a copy, so raw_data keeps the original, unscaled salaries.
jobs_data = convert_to_yearly_salary(raw_data)

In [294]:
# Keep only postings with a non-negative standardized salary. A NaN salary
# fails the comparison, so postings without salary information are dropped too.
has_valid_salary = jobs_data["standardized_salary"].ge(0)
jobs_data = jobs_data.loc[has_valid_salary]

In [295]:
# Columns carried forward into the rest of the notebook; everything else is discarded.
COLUMNS_TO_KEEP = [
    'company_name',
    'title',
    'description',
    'location',
    'remote_allowed',
    'work_type',
    'currency',
    'standardized_salary',
]

jobs_data = jobs_data.loc[:, COLUMNS_TO_KEEP]

In [296]:
# WE WILL FOCUS ON USD CURRENCY
# value_counts: 
# USD    36058
# EUR        6
# CAD        3
# BBD        2
# AUD        2
# GBP        2

# Once only USD rows remain the currency column is constant, so it is dropped.
is_usd = jobs_data["currency"].eq("USD")
jobs_data = jobs_data.loc[is_usd].drop(columns=["currency"])

In [297]:
# WE WILL FOCUS ON FULL_TIME, CONTRACT, PART_TIME WORK TYPES
# value_counts: 
# FULL_TIME     29119
# CONTRACT       3848
# PART_TIME      2304
# TEMPORARY       394
# INTERNSHIP      247
# OTHER           138
# VOLUNTEER         8

WORK_TYPES_TO_KEEP = ["FULL_TIME", "CONTRACT", "PART_TIME"]

# .copy() makes jobs_data an independent frame. Later cells assign new columns
# to it, and doing that on a filtered slice triggers pandas'
# SettingWithCopyWarning.
jobs_data = jobs_data[jobs_data["work_type"].isin(WORK_TYPES_TO_KEEP)].copy()

In [298]:
# Split "location" on commas into positional columns (0, 1, 2, ...).
# Parts keep their surrounding whitespace, and columns beyond a row's
# number of parts are left empty.
tmp_location = jobs_data["location"].str.split(',',expand=True) 
tmp_location

Unnamed: 0,0,1,2
0,Princeton,NJ,
1,Fort Collins,CO,
2,Cincinnati,OH,
3,New Hyde Park,NY,
4,Burlington,IA,
...,...,...,...
123837,Irvine,CA,
123839,Greater Indianapolis,,
123843,Irvine,CA,
123844,Walnut Creek,CA,


In [299]:
# Show the rows whose location had no comma, i.e. no second part to use as the state.
tmp_location[tmp_location[1].isna()]

Unnamed: 0,0,1,2
6,United States,,
45,Dallas-Fort Worth Metroplex,,
51,United States,,
121,United States,,
134,United States,,
...,...,...,...
123768,United States,,
123795,United States,,
123819,United States,,
123821,United States,,


In [300]:
# When a location had no comma, reuse its only part (column 0) as the state
# candidate so it can be mapped to a state code afterwards.
tmp_location[1] = tmp_location[1].fillna(tmp_location[0])

In [301]:
# The second location part still carries the space that followed the comma; strip it.
jobs_data = jobs_data.assign(state=tmp_location[1].str.strip())

In [302]:
# Maps free-text location values (country, state names, metro areas) that
# ended up in the "state" column to a two-letter code. 'US' is used for
# postings that only say 'United States'.
location_renaming = {
    'United States': 'US'
    ,'Ohio Metropolitan Area': 'OH'
    ,'Texas Metropolitan Area': 'TX'
    ,'California': 'CA'
    ,'South Carolina Metropolitan Area': 'SC'
    ,'Oregon Metropolitan Area': 'OR'
    ,'Alabama Area': 'AL'
    ,'Kansas Metropolitan Area': 'KS'
    ,'Massachusetts Metropolitan Area': 'MA'  # Massachusetts is MA (MI is Michigan)
    ,'Colorado': 'CO'
    ,'Nebraska Metropolitan Area': 'NE'
    ,'New York': 'NY'
    ,'New York Metropolitan Area': 'NY'
    ,'MI Area': 'MI'
    ,'Illinois': 'IL'
    ,'Texas': 'TX'
    ,'Louisiana Metropolitan Area': 'LA'
    ,'South Carolina Area': 'SC'
    ,'Hawaii': 'HI'
    ,'Ohio': 'OH'
    ,'Virginia Metropolitan Area': 'VA'
    ,'Florida': 'FL'
    ,'Delaware': 'DE'
    ,'Arizona': 'AZ'
    ,'North Carolina': 'NC'
    ,'New Jersey': 'NJ'
    ,'North Carolina Metropolitan Area': 'NC'
    ,'Illinois Metropolitan Area': 'IL'
    ,'Alaska': 'AK'
    ,'Nebraska': 'NE'
    ,'Georgia': 'GA'
    ,'Washington': 'WA'
    ,'Massachusetts': 'MA'
    ,'AR Area': 'AR'
    ,'Wisconsin Metropolitan Area': 'WI'
    ,'Maine Metropolitan Area': 'ME'
    ,'Oregon': 'OR'
    ,'Indiana Metropolitan Area': 'IN'
    ,'Oklahoma': 'OK'
    ,'Michigan': 'MI'
    ,'Utah': 'UT'
    ,'Georgia Area': 'GA'
    ,'New Mexico': 'NM'
    ,'Virginia': 'VA'
    ,'Dallas-Fort Worth Metroplex': 'TX'
    ,'Greater Minneapolis-St. Paul Area': 'MN'
    ,'Cincinnati Metropolitan Area': 'OH'
    ,'Louisville Metropolitan Area': 'KY'
    ,'Washington DC-Baltimore Area': 'DC'  # DC/Maryland metro area, not Washington state (WA)
    ,'New York City Metropolitan Area': 'NY'
    ,'Denver Metropolitan Area': 'CO'
    ,'San Francisco Bay Area': 'CA'
    ,'San Diego Metropolitan Area': 'CA'
    ,'Greater Asheville': 'NC'
    ,'Los Angeles Metropolitan Area': 'CA'
    ,'Greater Philadelphia': 'PA'
    ,'Greater Sacramento': 'CA'
    ,'Greater Grand Junction Area': 'CO'
    ,'Greater Flagstaff Area': 'AZ'
    ,'Detroit Metropolitan Area': 'MI'
    ,'Greater St. Louis': 'MO'
    ,'Atlanta Metropolitan Area': 'GA'
    ,'Raleigh-Durham-Chapel Hill Area': 'NC'
    ,'Little Rock Metropolitan Area': 'AR'
    ,'Nashville Metropolitan Area': 'TN'
    ,'Erie-Meadville Area': 'PA'
    ,'Greater Chicago Area': 'IL'
    ,'Greater Indianapolis': 'IN'
    ,'Buffalo-Niagara Falls Area': 'NY'
    ,'Salt Lake City Metropolitan Area': 'UT'
    ,'Greater Boston': 'MA'
    ,'Greater Hartford': 'CT'
    ,'Greensboro--Winston-Salem--High Point Area': 'NC'
    ,'Las Vegas Metropolitan Area': 'NV'
    ,'Kansas City Metropolitan Area': 'MO'
    ,'Greater Seattle Area': 'WA'
    ,'Greater Houston': 'TX'
    ,'Greater Tampa Bay Area': 'FL'
    ,'Greater Cleveland': 'OH'
    ,'Omaha Metropolitan Area': 'NE'
    ,'Greater Macon': 'GA'
    ,'Greater Orlando': 'FL'
    ,'Maui': 'HI'
    ,'Greater Phoenix Area': 'AZ'
    ,'New Bern-Morehead City Area': 'NC'
    ,'Knoxville Metropolitan Area': 'TN'
    ,'Greater Syracuse-Auburn Area': 'NY'
    ,'Miami-Fort Lauderdale Area': 'FL'
    ,'Utica-Rome Area': 'NY'
    ,'Honolulu Metropolitan Area': 'HI'
    ,'Greater Augusta Area': 'GA'
    ,'Greater Scranton Area': 'PA'
    ,'Charlotte Metro': 'NC'
    ,'Metropolitan Fresno': 'CA'
    ,'Greater Chattanooga': 'TN'
    ,'Greater San Luis Obispo Area': 'CA'
    ,'Des Moines Metropolitan Area': 'IA'
    ,'La Crosse-Onalaska Area': 'WI'
    ,'Lubbock-Levelland Area': 'TX'
    ,'Oklahoma City Metropolitan Area': 'OK'
    ,'Blacksburg-Christiansburg-Radford Area': 'VA'
    ,'Boise Metropolitan Area': 'ID'
    ,'Grand Rapids Metropolitan Area': 'MI'
    ,'Waterloo-Cedar Falls Area': 'IA'
    ,'Mobile Metropolitan Area': 'AL'
    ,'Greater Richmond Region': 'VA'
    ,'Peoria Metropolitan Area': 'IL'
    ,'Eau Claire-Menomonie Area': 'WI'
    ,'Greater Lansing': 'MI'
    ,'Greater Bend Area': 'OR'
    ,'Tallahassee Metropolitan Area': 'FL'
    ,'South Bend-Mishawaka Region': 'IN'
    ,'Greater Fort Wayne': 'IN'
    ,'Memphis Metropolitan Area': 'TN'  # Memphis is in Tennessee (MS is Mississippi)
    ,'Topeka Metropolitan Area': 'KS'
    ,'Youngstown-Warren area': 'OH'
    ,'Metro Jacksonville': 'FL'
    ,'Greater Pittsburgh Region': 'PA'
    ,'Pensacola Metropolitan Area': 'FL'
    ,'Albuquerque-Santa Fe Metropolitan Area': 'NM'
    ,'Greater Sioux Falls Area': 'SD'
    ,'Pueblo-Cañon City Area': 'CO'
    ,'Greater Dothan': 'AL'
    ,'Tulsa Metropolitan Area': 'OK'
    ,'Greater Bloomington Area': 'IN'
    ,'Greater Reno Area': 'NV'
    ,'Greater Fort Collins Area': 'CO'
    ,'Greater Madison Area': 'WI'
    ,'Greater Milwaukee': 'WI'
    ,'Rocky Mount-Wilson Area': 'NC'
    ,'Bellingham Metropolitan Area': 'WA'
    ,'Beaumont-Port Arthur Area': 'TX'
    ,'Greater New Orleans Region': 'LA'
    ,'Greater Corpus Christi Area': 'TX'
    ,'Greater Colorado Springs Area': 'CO'
    ,'Lawton Area': 'OK'
    ,'Greater McAllen Area': 'TX'
    ,'Baton Rouge Metropolitan Area': 'LA'
    ,'Greater Lexington Area': 'KY'
    ,'Greater Savannah Area': 'GA'
    ,'Modesto-Merced Area': 'CA'
    ,'Appleton-Oshkosh-Neenah Area': 'WI'
    ,'Greater Bismarck Area': 'ND'
    ,'Greater Enid Area': 'OK'
    ,'Greater Wilmington Area': 'DE'
    ,'College Station-Bryan Area': 'TX'
    ,'Greater Tucson Area': 'AZ'
    ,'Greater Chico Area': 'CA'
    ,'Crestview-Fort Walton Beach-Destin Area': 'FL'
    ,'Greater Morgantown Area': 'WV'
    ,'Johnson City-Kingsport-Bristol Area': 'TN'
    ,'Walla Walla Area': 'WA'
    ,'Cape Coral Metropolitan Area': 'FL'
    ,'Greater Eugene-Springfield Area': 'OR'
    ,'Greater Goldsboro Area': 'NC'
    ,'Greater Burlington Area': 'VT'
}

In [303]:
# Replace values that exactly match a key of location_renaming with its code;
# other values (already two-letter codes, judging by the data) are left as-is.
# The raw location text is not needed once the state has been derived.
state_codes = jobs_data["state"].replace(location_renaming)
jobs_data = jobs_data.assign(state=state_codes).drop(columns=["location"])

In [304]:
# Sanity check: list the distinct state values left after the renaming.
jobs_data["state"].unique()

array(['NJ', 'CO', 'OH', 'NY', 'IA', 'US', 'CA', 'FL', 'RI', 'PA', 'MA',
       'VA', 'WA', 'AK', 'HI', 'TX', 'AZ', 'UT', 'TN', 'IN', 'NE', 'NM',
       'MT', 'MN', 'MD', 'DC', 'MS', 'MI', 'GA', 'SC', 'OR', 'KY', 'AR',
       'NC', 'WI', 'NV', 'IL', 'ID', 'WY', 'CT', 'KS', 'AL', 'MO', 'OK',
       'DE', 'ND', 'LA', 'WV', 'NH', 'ME', 'VT', 'SD'], dtype=object)

In [305]:
# Text columns merged, in this order, into a single "augmented_description" field.
COLUMNS_TO_CONCATENATE = ['company_name', 'title', 'description']

# Strip the title and substitute "-" for missing text so every part is a string.
text_parts = (
    jobs_data[COLUMNS_TO_CONCATENATE]
    .assign(title=lambda parts: parts["title"].str.strip())
    .fillna("-")
)

# Vectorised space-separated concatenation (instead of a row-wise Python join).
first_column, *other_columns = COLUMNS_TO_CONCATENATE
jobs_data["augmented_description"] = text_parts[first_column].str.cat(
    [text_parts[name] for name in other_columns], sep=" "
)

# The source columns are redundant once merged.
jobs_data = jobs_data.drop(columns=COLUMNS_TO_CONCATENATE)

In [306]:
# Preview the first rows of the transformed frame.
jobs_data.head()

Unnamed: 0,remote_allowed,work_type,standardized_salary,state,augmented_description
0,,FULL_TIME,38480.0,NJ,Corcoran Sawyer Smith Marketing Coordinator Jo...
1,,FULL_TIME,83200.0,CO,- Mental Health Therapist/Counselor At Aspen T...
2,,FULL_TIME,55000.0,OH,The National Exemplar Assitant Restaurant Man...
3,,FULL_TIME,157500.0,NY,"Abrams Fensterman, LLP Senior Elder Law / Trus..."
4,,FULL_TIME,70000.0,IA,- Service Technician Looking for HVAC service ...


In [307]:
# Missing remote_allowed values are filled with 0 so the column can be cast to int.
jobs_data["remote_allowed"] = jobs_data["remote_allowed"].fillna(0).astype(int)