# Imports

In [None]:
import sys
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import nltk

path = '../src/'
sys.path.append(path)

from utils_nlp import *

# Read the data

The data is available here:
https://www.kaggle.com/datasets/narsil/jobs-in-data-com

Download it first and save it in the "data" folder as `job_descriptions.csv`.

In [None]:
# Load the job-postings table; the CSV must already be present in ../data/.
df = pd.read_csv(filepath_or_buffer='../data/job_descriptions.csv')

# Feature Engineering and EDA

In [None]:
def _experience_bounds(experience):
    """Return the numeric bounds found in an experience string.

    The word 'Years' is removed and the remainder is split on 'to', so a
    value shaped like '2 to 5 Years' (assumed format -- confirm against the
    CSV) yields [2.0, 5.0].
    """
    without_unit = experience.replace('Years', '')
    return [float(bound.strip()) for bound in without_unit.split('to')]


df['experience_list'] = df["Experience"].apply(_experience_bounds)

In [None]:
def _first_bound(bounds):
    """Return the first entry of a parsed experience range."""
    return bounds[0]


# The first parsed bound is used as the minimum required experience.
df['min_experience'] = df['experience_list'].apply(_first_bound)

In [None]:
# How many postings there are for each minimum-experience value.
df['min_experience'].value_counts()

In [None]:
# How many postings there are for each role.
df['Role'].value_counts()

In [None]:
# How many postings there are for each work type.
df['Work Type'].value_counts()

In [None]:
# Restrict the analysis to postings whose country is USA.
# .copy() makes the subset an independent frame, so the columns added to `df`
# in the following cells do not trigger pandas' SettingWithCopyWarning.
df = df[df['Country']=='USA'].copy()
# Optional extra filters, currently disabled:
#df = df[df['Work Type']=='Full Time']
#df = df[df['min_experience'].apply(lambda x: x<1.0)]

## Data Cleaning / Preparation

In [None]:
def _clean_description(description):
    """Run preprocess_text on one description and join its output with spaces."""
    return ' '.join(preprocess_text(description))


# One cleaned string per posting, built from the raw 'Job Description' column.
df['cleaned_text'] = df['Job Description'].apply(_clean_description)
df.head()

# NOTE(review): `vectorizer` and `vectors` are not defined anywhere in the
# visible notebook, and `tfidf_vectors` is not used by any later cell. Unless
# `from utils_nlp import *` provides them, this snippet raises NameError --
# confirm whether it is a leftover that should be removed.
# Invert the vocabulary mapping so that a column index gives back its token.
dict_of_tokens={i[1]:i[0] for i in vectorizer.vocabulary_.items()}
tfidf_vectors = []  # one {token: value} dict per row of `vectors`
for row in vectors:
  # row.indices / row.data are paired up as (column index, stored value).
  tfidf_vectors.append(
    {dict_of_tokens[column]:value for (column,value) in zip(row.indices,row.data)}
    )

In [None]:
def _distinct_words(text):
    """Return the distinct whitespace-separated words of *text* (arbitrary order)."""
    return list(set(text.split()))


# Every word is kept at most once per posting.
df['l_words'] = df['cleaned_text'].apply(_distinct_words)

In [None]:
def _salary_bounds(salary_range):
    """Parse a salary-range string into its bounds, in dollars.

    '$' and 'K' are stripped from each '-'-separated part and the number is
    multiplied by 1000, so a value shaped like '$59K-$99K' (assumed format
    -- confirm against the CSV) gives [59000.0, 99000.0].
    """
    return [
        float(part.replace('$', '').replace('K', '')) * 1000
        for part in salary_range.split('-')
    ]


# Parse every range once and derive both salary features from the same bounds
# (previously the same string parsing was repeated for each feature).
salary_bounds = df["Salary Range"].apply(_salary_bounds)
df['mean_salary'] = salary_bounds.apply(lambda bounds: np.mean(bounds))
df['max_salary'] = salary_bounds.apply(lambda bounds: np.max(bounds))

# 'experience_list' already holds the parsed bounds of "Experience", so the
# mean is taken from it instead of parsing the text a second time.
df['mean_experience'] = df['experience_list'].apply(lambda bounds: np.mean(bounds))

In [None]:
# One row per (posting, distinct word) pair.
df2 = df.explode('l_words')
df2 = df2.rename({"l_words":"word"}, axis = 1)

# Each word is tagged on its own (a one-element token list), so its tag does
# not depend on the row it appears in. Tag every distinct word once and map
# the result back, instead of calling the tagger for every exploded row.
tag_by_word = {
    word: pos_tag([word], tagset='universal')[0][1]
    for word in df2['word'].unique()
}
df2['pos_tag'] = df2['word'].map(tag_by_word)

## Keep only adjectives (ADJ)

In [None]:
# Keep only the (posting, word) rows whose tag is 'ADJ'.
df2_adj = df2[df2["pos_tag"].eq('ADJ')]

In [None]:
# Per adjective: the average of 'mean_experience', the individual max and mean
# salaries of its rows collected into lists, and the number of rows.
# Two aggregations on 'mean_salary' give the result two-level column labels,
# which the following cell flattens.
aggregations = {
    'mean_experience': 'mean',
    'max_salary': list,
    'mean_salary': [list, "count"],
}
df_grouped = df2_adj.groupby('word').agg(aggregations)


In [None]:
# Turn 'word' back into a regular column, replace the two-level column labels
# with flat names, and put the most frequent adjectives first.
df_grouped = df_grouped.reset_index(col_level=0)
df_grouped.columns = ['word', 'mean_experience','max_salary', 'mean_salary', 'count']
df_grouped = df_grouped.sort_values('count', ascending=False)

In [None]:
# Median of the salary list collected for each adjective (one value per row,
# in the frame's current row order).
median_per_word = [np.median(salaries) for salaries in df_grouped['mean_salary']]
df_grouped['median_salary'] = median_per_word

In [None]:
# Display the per-adjective summary table.
df_grouped

In [None]:
# Display the 16 adjectives with the highest counts.
df_grouped.sort_values('count', ascending=False).head(16)

Select the 15 most common adjectives in the dataset and sort them by median salary (ascending).

In [None]:
# Take the 15 adjectives with the highest counts, then order them from the
# lowest to the highest median salary; `words` keeps that order.
selection = (
    df_grouped
    .sort_values('count', ascending=False)
    .head(15)
    .sort_values('median_salary', ascending=True)
)
words = selection['word'].tolist()

In [None]:
# Display the selected adjectives, ordered by ascending median salary.
words

## Prepare some special fonts
You can download the Fira Sans font from the Google Fonts website; place `FiraSans-Regular.ttf` and `FiraSans-SemiBold.ttf` in `../data/Fonts/`.

In [None]:
from matplotlib.font_manager import FontProperties

# Directory that holds the Fira Sans .ttf files.
personal_path = '../data/Fonts/'

# Regular weight.
font_path = f'{personal_path}FiraSans-Regular.ttf'
fira_sans_regular = FontProperties(fname=font_path)

# Semi-bold weight.
font_path = f'{personal_path}FiraSans-SemiBold.ttf'
fira_sans_semibold = FontProperties(fname=font_path)

In [None]:
# One subplot row per selected adjective. The row count is derived from
# `words` instead of a literal 15, so the grid cannot drift out of sync with
# the selection. squeeze=False keeps `axs` two-dimensional even for a single
# row, so flatten() always yields a 1-D array of Axes.
fig, axs = plt.subplots(
    nrows=max(len(words), 1),
    ncols=1,
    figsize=(8, 10),
    squeeze=False,
)
axs = axs.flatten()

# Column whose distribution is drawn in every row.
variable = 'mean_salary'

# Colour palette.
darkgreen = '#9BC184'
midgreen = '#C2D6A4'
lowgreen = '#E7E5CB'
darkgrey = '#525252'

# x positions (in data units) of the per-row labels, and the shared x range.
pos_avg_experience = 60_000
pos_words = 48_000
x_min, x_max = 60_000, 125_000

# Upper y limit shared by all rows, and the colours of the five percentile bands.
max_y = 0.00005
colors = [lowgreen,midgreen,darkgreen,midgreen,lowgreen]

# The reference line is identical in every row, so compute it once up front.
global_median = df[variable].median()
# The bottom row is the one that carries the x-axis scale.
last_row = len(words) - 1

for i, word in enumerate(words):
    ax = axs[i]

    # Rows of the adjective table that belong to this word.
    subset = df2_adj[df2_adj['word']==word]

    # Density curve of the salaries for this word.
    sns.kdeplot(subset[variable],
                fill=True,
                color = 'grey',
                edgecolor='lightgrey',
                ax=ax)

    # Disabled: display the average experience on the left.
    # avg_experience = df_grouped[df_grouped['word']==word]['mean_experience'].values[0].round(1)
    # ax.text(
    #     pos_avg_experience, 0,
    #     f'({avg_experience})',
    #     ha='left',
    #     fontsize=10,
    #     fontproperties=fira_sans_regular,
    #     color=darkgrey
    # )

    # display word on left
    ax.text(
        pos_words, 0,
        word.upper(),
        ha='left',
        fontsize=10,
        fontproperties=fira_sans_semibold,
        color=darkgrey
    )

    # Coloured bands between consecutive percentiles:
    # 2.5-10, 10-25, 25-75, 75-90 and 90-97.5.
    quantiles = np.percentile(subset[variable], [2.5, 10,25,75,90,97.5])
    quantiles = quantiles.tolist()
    for lower, upper, band_color in zip(quantiles, quantiles[1:], colors):
        ax.fill_between(
            [lower, upper],
            0,
            max_y/5,
            color=band_color
        )

    # Black dot: median of this word's salaries.
    median = subset[variable].median()
    ax.scatter([median], [max_y/10], color='black', s=20)

    # Red dashed line: median over all rows of `df`.
    ax.axvline(global_median, color='red', linestyle='--')

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(0,max_y)
    ax.set_ylabel("")

    ax.set_axis_off()
    # The axes are switched off above, so the x scale of the bottom row is
    # written as plain text labels below the plot area.
    if i == last_row:
        values = [70_000, 80_000, 90_000, 100_000]
        for value in values:
            ax.text(
                value, -0.00003,
                f'{value}',
                ha='center',
                fontsize=10
            )
        ax.set_xlabel('Salary')
        ax.set_xticks(values)

    
"""   
text = '(Avg. Experience)'
fig.text(
    0.06,
    0.88,
    text,
    ha='left',
    fontsize=10,
    fontproperties=fira_sans_regular,
    color=darkgrey
)
""" 
# Caption for the shared x axis, centred under the rows.
text = "Annual Gross Salary (USD)"
fig.text(x=0.5, y=0.06, s=text,
         ha='center', fontsize=14, fontproperties=fira_sans_regular)

# Red label naming the dashed reference line.
text = 'Global Median Salary'
fig.text(x=0.5, y=0.88, s=text, color='r', ha='center', fontsize=10)

# Headline, placed above the top-left corner of the figure.
text = "STRATEGIC and VARIOUS equal to UNDER PAID?"
fig.text(x=-0.03, y=1.01, s=text,
         ha='left', fontsize=18, fontproperties=fira_sans_semibold)

# Subtitle describing what is plotted.
text = """
Adjectives used to describe jobs and how they are related to Salaries.
Job Description from jobs found in Kaggle Dataset, filtering for USA.
The 15 most frequent adjectives are shown.
"""
fig.text(x=-0.03, y=0.9, s=text,
         ha='left', fontsize=14, fontproperties=fira_sans_regular)

# Credits and data source, below the bottom-left corner.
text = """
Axis capped at 100,000 USD.
Data: https://www.kaggle.com/datasets/hummaamqaasim/jobs-in-data. 
Visualization: Toni Almagro (lessdatamorestories.com) 
based on https://python-graph-gallery.com/web-ridgeline-by-text/ 
"""
fig.text(x=-0.03, y=-0.03, s=text,
         ha='left', fontsize=8, fontproperties=fira_sans_regular)

#Explanation
## ---------------
# Builds a small "how to read this chart" inset inside the first row: a
# sample density curve with its percentile bands, a median dot and text labels.

# legend on the first ax
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
subax = inset_axes(
    parent_axes=axs[0],
    width="40%",
    height="350%",
    loc=1
)
subax.set_xticks([])
subax.set_yticks([])
# The adjective 'overall' is used as the sample distribution for the legend.
# NOTE(review): the word is hard-coded; if it is not present in df2_adj the
# subset is empty -- confirm it exists for the filtered data.
beautiful_subset = df2_adj[df2_adj['word']=='overall']

sns.kdeplot(
    beautiful_subset[variable],
    fill=True,
    ax=subax,
    color='grey',
    edgecolor='lightgrey'
)
# Same percentile cut points as the main rows.
quantiles = np.percentile(beautiful_subset[variable], [2.5, 10, 25, 75, 90, 97.5])
quantiles = quantiles.tolist()
for j in range(len(quantiles) - 1):
    subax.fill_between(
        [quantiles[j], # lower bound
         quantiles[j+1]], # upper bound
        0, # bottom of the band (y=0)
        max_y/10, # top of the band (max_y/10)
        color=colors[j]
    )
subax.set_xlim(x_min, x_max-20_000)
# The y range extends below zero to leave room for the labels under the bands.
subax.set_ylim(-0.00002, max_y)
# NOTE: despite its name, `mean` holds the median of the sample; the name is
# kept because the arrow calls further down refer to it.
mean = beautiful_subset[variable].median()
subax.scatter([mean], [0.0000025], color='black', s=20)

# Inset title, placed just above the inset's upper y limit.
legend_pos = 0.000055
subax.text(
    x_min+5000, legend_pos,
    'Explanation',
    ha='left',
    fontsize=12,
    fontproperties=fira_sans_semibold
)

# Label for the density curve.
subax.text(
    x_max-30_000, legend_pos/2,
    'Distribution\nof Salaries',
    ha='center',
    fontsize=7,
    fontproperties=fira_sans_regular
)
# Label for the median dot.
subax.text(
    mean, legend_pos/4,
    'Median',
    ha='center',
    fontsize=7,
    fontproperties=fira_sans_regular
)

# Labels for the percentile bands, written below y=0.
subtext_pos = -0.00001
subax.text(
    quantiles[4]+5_000,subtext_pos*1.5,
    "95% of salaries",
    ha='center',
    fontsize=6,
    fontproperties=fira_sans_regular
)

subax.text(
    quantiles[3],subtext_pos,
    "80% of salaries",
    ha='center',
    fontsize=6,
    fontproperties=fira_sans_regular
)
subax.text(
    quantiles[1], subtext_pos*1.5,
    "50% of salaries\nfall within this range",
    ha='center',
    fontsize=6,
    fontproperties=fira_sans_regular
)

# arrows in the legend
import matplotlib.patches as patches
def add_arrow(head_pos, tail_pos, ax):
    """Add a thin, curved black arrow to *ax*.

    The patch is created with *tail_pos* as its first point and *head_pos*
    as its second point; both are (x, y) pairs. Returns nothing.
    """
    arrow = patches.FancyArrowPatch(
        tail_pos, head_pos,
        connectionstyle="arc3,rad=.5",
        arrowstyle="Simple, tail_width=0.01, head_width=1, head_length=2",
        color="k",
        linewidth=0.2,
    )
    ax.add_patch(arrow)
# Arrows linking each legend label to the element it describes
# (`mean` is the median of the sample distribution).
add_arrow( (mean-3_000, 0.000005),(quantiles[1]+5000, subtext_pos*1.2), subax) # "50% of salaries" label
add_arrow( (quantiles[3]+2000,0.0000025), (quantiles[3], -0.0000055),subax) # "80% of salaries" label
add_arrow((mean, 0.000005), (mean, 0.000015), subax) # "Median" label
add_arrow((quantiles[4]+1000, 0.000003), (quantiles[4]+5_000,subtext_pos), subax) # "95% of salaries" label

# background grey lines
from matplotlib.lines import Line2D
def add_line(xpos, ypos, fig=fig):
    """Draw a thin light-grey line directly on the figure.

    xpos, ypos: sequences of x and y coordinates, interpreted in the
        figure's own coordinate system (transform=fig.transFigure).
    fig: figure to draw on; defaults to the `fig` that exists when this
        function is defined.
    """
    line = Line2D(
        xpos, ypos,
        color='lightgrey',
        lw=0.2,
        transform=fig.transFigure
    )
    # Register the line through the public API rather than appending to
    # fig.lines, so that the artist is also linked to its figure.
    fig.add_artist(line)
#add_line([0.317, 0.317], [0.1, 0.9])
#add_line([0.51, 0.51], [0.1, 0.9])
#add_line([0.703, 0.703], [0.1, 0.9])
#add_line([0.896, 0.896], [0.1, 0.9])

# Write the chart to ../images/ as a 300-dpi PNG; bbox_inches='tight' keeps
# the texts placed outside the 0-1 figure area. Then display it.
plt.savefig(
    fname='../images/jobs-ridgeline-by-text-USA.png',
    dpi=300,
    bbox_inches='tight',
)
plt.show()