Loading libraries and Original Dataset<br />
Dataset Source: <a href="https://www.kaggle.com/datasets/arshkon/linkedin-job-postings/data">LinkedIn Job Postings (2023 - 2024)</a>


In [1]:
# importing basic libraries
import pandas as pd
import math
import os
# importing local file which contains a dictionnary
from state_dict import *

# Folder that holds the source dataset files.
# The trailing slash matters: file names are appended directly to this string.
path_to_file = '../Datasets/Linkedin_Job_Posting/'

In [2]:
# Pandas display configuration
# Show up to 40 columns when a DataFrame is rendered, instead of truncating wide frames
pd.options.display.max_columns = 40

In [3]:
# Load the job postings CSV from the dataset folder into the main DataFrame
postings_csv = f'{path_to_file}postings.csv'
dataset = pd.read_csv(postings_csv)

In [None]:
# Checking the dataset column names (prints the DataFrame's column Index)
print(dataset.columns)

In [21]:
# Extracting the State from the location field and adding it to the dataset
# as a new 'state' column.
#
# Some entries are in the form "city, State", some are just generic text or a
# city name (or "United States").


def _location_to_state(location, conversion):
    """Map one raw location value to a state label.

    Parameters
    ----------
    location : the raw value of the 'location' column for one posting.
    conversion : mapping from a full state name to its two-letter code.

    Returns
    -------
    str
        A two-character code, 'United States', or 'Other'.
    """
    # A missing location (NaN) is not a string and cannot be split
    if not isinstance(location, str):
        return 'Other'

    parts = [part.strip() for part in location.split(",")]

    if len(parts) > 1:
        # Many entries look like "STATE Metropolitan Area", so we strip those suffixes
        state = parts[1].replace(' Area', '')
        state = state.replace(' Metropolitan', '')

        # 'United States' or any two-character value is kept as is
        # NOTE(review): only the second component is inspected, so a location such as
        # "Texas, United States" ends up as 'United States' - confirm this is intended
        if state == 'United States' or len(state) == 2:
            return state
        # Otherwise we convert the full name to its two-letter code when we know it
        if state in conversion:
            return conversion[state]
        # Almost no occurrences - we store "Other"
        return 'Other'

    # No comma: keep the "United States" value and override everything else with "Other"
    return 'United States' if parts[0] == 'United States' else 'Other'


all_locations = dataset['location'].tolist()

# One label per posting, in the same order as the dataset rows
temp_states = [_location_to_state(loc, state_conversion) for loc in all_locations]

# Storing the values in our dataset in a single positional assignment
# (this replaces a row-by-row .loc copy that took about 10 seconds)
dataset['state'] = temp_states

First file: all "Data" job offers


In [23]:
# We filter the data to only keep job titles containing "data"
# (case-insensitive substring match, so e.g. "Database" matches too)
# na=False makes a missing title count as "no match"; without it the mask would
# contain NaN and boolean indexing would raise
mask = dataset['title'].str.contains('data', case=False, na=False)
subdf_jobs = dataset[mask]

In [24]:
# Columns that are not needed in the exported "data" offers file
columns_to_drop = [
    'views',
    'formatted_work_type',
    'applies',
    'original_listed_time',
    'job_posting_url',
    'application_url',
    'application_type',
    'expiry',
    'closed_time',
    'skills_desc',
    'listed_time',
    'posting_domain',
    'sponsored',
    'compensation_type',
    'max_salary',
    'pay_period',
    'med_salary',
    'min_salary',
    'currency',
    'description',
]
# drop() returns a new frame; subdf_jobs itself is left untouched
subdf_jobs_cleaned = subdf_jobs.drop(columns=columns_to_drop)

In [33]:
# Preview the first rows of the cleaned frame to check which columns remain
subdf_jobs_cleaned.head()

Unnamed: 0,job_id,company_name,title,location,company_id,remote_allowed,formatted_experience_level,work_type,state
116,3245063922,Saxon AI,Data Architect,"San Francisco, CA",224935.0,,,CONTRACT,CA
134,3398076960,,Technical Product and IT Manager for Data Cent...,United States,,1.0,,FULL_TIME,United States
165,3540371917,KeyBank,Enterprise Data & Analytics Infrastructure Man...,"Cleveland, OH",3252.0,,,FULL_TIME,OH
283,3742692445,ZenithMinds Inc,Sr Data Engineer with Kafka,"Austin, TX",81941852.0,1.0,,FULL_TIME,TX
348,3792233622,PB Built,Receptionist/Data Entry,"Jupiter, FL",2331524.0,,,FULL_TIME,FL


In [31]:
# We write the sub df content into a new CSV file
filename = 'all_offers_data.csv'
output_dir = './csv'

# Make sure the output folder exists: listing a missing folder would raise
# FileNotFoundError on a fresh checkout
os.makedirs(output_dir, exist_ok=True)

# Writing the CSV. to_csv opens the target in write mode by default, so a previous
# version of the file is overwritten and does not need to be deleted first
subdf_jobs_cleaned.to_csv(os.path.join(output_dir, filename), sep=',', na_rep='N/A')

Second file: share of offers with a remote option, per state


In [32]:
# New DataFrame that will hold per-location statistics (States, mostly) about remote work:
#   state        -> the location label
#   total_offer  -> number of offers in that state
#   remote_offer -> number of offers with a remote option
#   remote_share -> remote_offer / total_offer

subdf_remote = pd.DataFrame()
# Declare the columns up front, in their final order; the frame has no rows yet
for column_name, placeholder in (
    ('state', str(0)),
    ('total_offer', 0),
    ('remote_offer', 0),
    ('remote_share', 0),
):
    subdf_remote[column_name] = placeholder

In [34]:
# df1 is an intermediate DF: the remote flag next to the state of each posting.
# A missing 'remote_allowed' value is replaced by 0 so the column works as a
# True/False flag; the original dataset column is left unchanged.
df1 = pd.DataFrame({
    'remote_allowed': dataset['remote_allowed'].fillna(0),
    'state': dataset['state'],
})


In [None]:
# Per-state remote statistics, computed in one pass over df1 and stored in subdf_remote.
# The frame is rebuilt from df1 on every run, so re-running this cell gives the same result.
remote_by_state = df1.groupby('state')['remote_allowed']

# total_offer  : number of offers per state (count of the remote flag)
# remote_offer : number of remote offers per state (sum of the 0/1 flag)
subdf_remote = remote_by_state.agg(total_offer='count', remote_offer='sum').reset_index()
subdf_remote['remote_offer'] = subdf_remote['remote_offer'].astype('int64')

# Finally, the share of remote work for each State, as a percentage rounded to 2 decimals
# (computed on whole columns instead of a row-by-row loop)
subdf_remote['remote_share'] = (
    subdf_remote['remote_offer'] / subdf_remote['total_offer'] * 100
).round(2)


In [49]:
# We make another csv with the per-state remote statistics
filename = 'remote_work_share.csv'
output_dir = './csv'

# Make sure the output folder exists: listing a missing folder would raise
# FileNotFoundError on a fresh checkout
os.makedirs(output_dir, exist_ok=True)

# Writing the CSV. to_csv opens the target in write mode by default, so a previous
# version of the file is overwritten and does not need to be deleted first
subdf_remote.to_csv(os.path.join(output_dir, filename), sep=',', na_rep='N/A')
