## Data Scientist and Business Analyst Salaries Analysis
* The dataset used is [Data Science and STEM Salaries](https://www.kaggle.com/datasets/jackogozaly/data-science-and-stem-salaries), uploaded by Jack Ogozaly on Kaggle.

In [1]:
!pip3 install pyjanitor orjson --quiet

In [10]:
import pandas as pd
import numpy as np
import janitor
import missingno as msno
import warnings

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

from scipy.stats import skew, kurtosis

# Plot Template & Color
pio.templates.default = 'plotly_white'

# Shared palette referenced by the plotting helpers in this notebook.
# NOTE(review): these names were used but never defined, so the first plot
# raised NameError. The hex values are reviewer-chosen stand-ins -- adjust
# them to the intended palette.
dark_blue = '#00008B'
cobalt_blue = '#0047AB'
honey_orange = '#EC9706'

# Other settings
warnings.filterwarnings(action='ignore')  # silence library warnings in cell output
pd.set_option('display.max_columns', 50)  # show up to 50 columns when displaying frames

In [11]:
def figure_show(plot, static: bool, **kwargs) -> None:
    """
    Apply the notebook's shared layout to ``plot`` and display it.

    Parameters
    ----------
    plot
        Object exposing ``update_layout(**kwargs)`` and ``show(...)``
        (a plotly figure in this notebook).
    static : bool
        When true, the figure is shown with ``config={'staticPlot': True}``;
        otherwise ``show()`` is called without a config.
    **kwargs
        Extra layout options forwarded to ``plot.update_layout``. They take
        precedence over the defaults set here, so passing ``font``,
        ``margin`` or ``width`` overrides the default instead of raising a
        duplicate-keyword ``TypeError``.

    Returns
    -------
    None
        The result of ``plot.show(...)`` is passed through unchanged;
        plotly figures return ``None`` from ``show``.
    """
    # Defaults first, caller overrides second.
    layout = dict(
        font=dict(color='Black', size=12),
        margin=dict(pad=10),
        width=780)
    layout.update(kwargs)
    plot.update_layout(**layout)

    if static:
        return plot.show(config={'staticPlot': True})
    return plot.show()


fp = '../input/data-science-and-stem-salaries/Levels_Fyi_Salary_Data.csv'
def load_data(path:str=fp) -> pd.DataFrame:
    """
    Read the salary CSV at ``path`` and return a tidied DataFrame.

    The frame has empty rows/columns removed, column names cleaned and the
    ``rownumber`` column dropped.
    """
    raw = pd.read_csv(path)
    # remove_empty / clean_names are not core pandas methods; presumably
    # they are registered by the `janitor` import at the top of the notebook.
    non_empty = raw.remove_empty()
    renamed = non_empty.clean_names()
    return renamed.drop('rownumber', axis=1)

In [12]:
# Loading Data: read the CSV from the default path `fp` via load_data()
df = load_data()

# Check for missing data: draw the missingno matrix for the whole frame.
# The trailing semicolon keeps the notebook from echoing the return value.
msno.matrix(df, figsize=(14, 5), fontsize=11, color=(0., 0., 0.3));

In [13]:
# The data: preview the first rows of the loaded frame
df.head()

In [14]:
# Data Basic Info: column names, dtypes and non-null counts
df.info()

## Time Feature

In [15]:
# Parse the timestamp column into pandas datetimes, then echo it
df['timestamp'] = pd.to_datetime(df['timestamp'])
df['timestamp']

In [16]:
# Number of entries recorded in each calendar year
year = df['timestamp'].dt.year
year_counts = year.value_counts()
year_counts

In [17]:
def plot_year_counts(x, y, title, highlight_bar) -> None:
    """
    Plot the number of entries available per year as a bar chart.

    Parameters
    ----------
    x : sequence
        Years, one per bar.
    y : sequence
        Entry count for each year in ``x``; also used as the bar labels.
    title : str
        Chart title (rendered in bold).
    highlight_bar : int
        Position of the bar drawn in ``cobalt_blue``; every other bar is
        ``dark_blue``. A position outside the available bars is ignored.

    Returns
    -------
    None
        The figure is displayed through ``figure_show`` and its result is
        passed through.
    """
    # One colour per bar, sized from the data instead of a fixed count of 6,
    # so every bar is covered however many years are present.
    n_bars = len(x)
    plot_color = [dark_blue] * n_bars
    if -n_bars <= highlight_bar < n_bars:
        plot_color[highlight_bar] = cobalt_blue

    year_fig = px.bar(x=x, y=y, text=y)
    year_fig.update_layout(title=f'<b>{title}</b>')
    year_fig.update_yaxes(title='Count')
    # Linear ticks so each year gets its own tick on the axis.
    year_fig.update_xaxes(title='Year', tickmode='linear')
    year_fig.update_traces(marker_color=plot_color)

    return figure_show(year_fig, static=False)

plot_year_counts(
    x=year_counts.index, 
    y=year_counts.values, 
    title='Data Available: Year',
    highlight_bar=0)

In [18]:
# Earliest and latest timestamps in the full dataset
print(
    '\n', 'min: ', df['timestamp'].min(),
    '\n', 'max: ', df['timestamp'].max(),
)

In [19]:
# Build the dtype breakdown from the frame itself so the summary stays
# correct after conversions such as the timestamp parsing done earlier
# (hard-coded counts go stale as soon as a column changes dtype).
dtype_summary = ', '.join(
    f'{dtype}({count})' for dtype, count in df.dtypes.value_counts().items())

# Headline facts about the loaded dataset.
print(f"""
> Summary
{'-' * 40}
> The Dataset has {df.shape[0]} entries and {df.shape[1]} features.
> There's a total of {df.isna().sum().sum()} missing entries/observations.
    - Gender, Other Details, Race, and Education comprise the majority of the missing values.
> The start date is {df.timestamp.min()} and ends on {df.timestamp.max()}.
> The dataset is composed of dtypes: {dtype_summary}
    - 10 of which are binary features specifying Race and Education.
""")

## Company & Title

---

In [20]:
# Number of entries per company
df.company.value_counts()

In [21]:
# Keep companies with at least 100 entries, then the first 12 of those
company = df['company'].value_counts()
company = company[company >= 100][:12]

In [22]:
def plot_company(x, y, title, highlight_bar) -> None:
    """
    Plot per-company entry counts as a horizontal bar chart.

    Parameters
    ----------
    x : sequence
        Bar lengths (entry counts), one per company.
    y : sequence
        Company names, one per bar.
    title : str
        Chart title (rendered in bold).
    highlight_bar : int
        Position of the bar drawn in ``cobalt_blue``; every other bar is
        ``dark_blue``. A position outside the available bars is ignored.

    Returns
    -------
    None
        The figure is displayed through ``figure_show`` and its result is
        passed through.
    """
    # One colour per bar, sized from the data instead of a fixed count of 30,
    # so longer company lists are still fully covered.
    n_bars = len(y)
    company_plot_color = [dark_blue] * n_bars
    if -n_bars <= highlight_bar < n_bars:
        company_plot_color[highlight_bar] = cobalt_blue

    fig = go.Figure()
    fig.add_trace(go.Bar(x=x, y=y, orientation='h'))
    fig.update_traces(marker_color=company_plot_color)
    fig.update_layout(title=f'<b>{title}</b>')

    return figure_show(fig, static=False)

plot_company(
    x=company.values, 
    y=company.index, 
    title='Data Available: Companies',
    highlight_bar=0)

## Zooming in: Data Scientist and Business Analyst

---

In [23]:
# Number of entries per job title across the full dataset
df.title.value_counts()

In [24]:
# Rows whose title mentions each role, stacked into one working frame
data_scientist = df[df['title'].str.contains('Data Scientist')]
business_analyst = df[df['title'].str.contains('Business Analyst')]
data = pd.concat([data_scientist, business_analyst])
data.head()

In [25]:
# Entries per calendar year within the two-role subset
data['timestamp'] = pd.to_datetime(data['timestamp'])
year = data['timestamp'].dt.year
year_new_counts = year.value_counts()
year_new_counts

In [26]:
# Yearly entry counts for the two selected roles; highlight_bar=0 picks out
# the first bar
plot_year_counts(
    x=year_new_counts.index, 
    y=year_new_counts.values, 
    title='Data Scientist and Business Analyst Roles',
    highlight_bar=0)

In [27]:
# Companies with at least 20 entries among the two selected roles
company = data['company'].value_counts()
company = company[company >= 20]
plot_company(
    title='Data Scientist and Business Analyst: Companies',
    x=company.values,
    y=company.index,
    highlight_bar=0,
)

In [28]:
def plot_title_counts() -> go.Figure:
    """
    Draw a bar chart of how many rows in ``data`` carry each title.

    Reads the module-level ``data`` frame and shows the figure through
    ``figure_show``, whose result is returned.
    """
    title_counts = data['title'].value_counts()
    positions, totals = title_counts.index, title_counts.values

    fig = px.bar(x=positions, y=totals, text=totals)
    # One colour per title bar.
    fig.update_traces(marker_color=[dark_blue, honey_orange])
    fig.update_xaxes(title='Position')
    fig.update_yaxes(title='Count')
    fig.update_layout(title='<b>Position/Title:</b> Counts')
    return figure_show(fig, static=False)

plot_title_counts()

## Yearly Compensation
---

In [29]:
# Simplify and Rename columns
data = data.rename(columns={
        'totalyearlycompensation': 'yearly_compensation',
        'yearsofexperience': 'experience_years',
        'yearsatcompany': 'employed_years'})


def _employed_year_bins(years, width=3, min_upper=24):
    """
    Return explicit bin edges and matching ``'lo-hi Yrs.'`` labels.

    Edges start at 0 and advance in steps of ``width`` up to at least
    ``min_upper``; if the longest tenure in ``years`` exceeds that, further
    bins are appended so no value falls outside the edges. With the
    defaults and tenures up to 24 years this yields the eight labels
    '0-3 Yrs.' ... '21-24 Yrs.'.
    """
    longest = years.max()
    upper = min_upper
    if pd.notna(longest) and longest > min_upper:
        upper = int(np.ceil(longest / width)) * width
    edges = list(range(0, upper + width, width))
    labels = [f'{lo}-{hi} Yrs.' for lo, hi in zip(edges[:-1], edges[1:])]
    return edges, labels


# Bin tenure on fixed 3-year edges so each label describes the interval it
# names. An integer `bins=8` would instead split the observed min..max range
# into eight equal-width bins, which only matches the labels when that range
# happens to be exactly 0-24. include_lowest keeps 0-year tenures in the
# first bin.
_bin_edges, _bin_labels = _employed_year_bins(data['employed_years'])
data['employed_bin_years'] = pd.cut(
    data['employed_years'],
    bins=_bin_edges,
    labels=_bin_labels,
    include_lowest=True)

# Feature Compensation and Years Employed
comp_and_employed = data[['yearly_compensation', 'employed_years', 'title', 'employed_bin_years']]\
    .sort_values(by='employed_bin_years')


# Feature Compensation and Employee Work Experience
comp_and_experience = data[['yearly_compensation', 'experience_years', 'title']]\
    .sort_values(by='experience_years')

### Distribution
---

In [30]:
# Summary statistics of yearly compensation, shown as a one-column frame
data['yearly_compensation'].describe().to_frame()

In [31]:
# Skewness and kurtosis of yearly compensation (scipy.stats)
skew_comp, kurt_comp = (
    stat(data['yearly_compensation']) for stat in (skew, kurtosis))
print(f"""
skew: {skew_comp}
kurt: {kurt_comp}
""")

In [32]:
# Per-role compensation together with each entry's tenure bin
data_science_comp = data.loc[
    data['title'] == 'Data Scientist',
    ['yearly_compensation', 'employed_bin_years']]

business_analyst_comp = data.loc[
    data['title'] == 'Business Analyst',
    ['yearly_compensation', 'employed_bin_years']]

# Sort data naturally to avoid plotting misplacements with plotly.
# Both frames are plotted on the same categorical tenure axis, so both get
# the same ordering (previously only the Data Scientist frame was sorted).
data_science_comp = data_science_comp.sort_naturally('employed_bin_years')
business_analyst_comp = business_analyst_comp.sort_naturally('employed_bin_years')

In [33]:
def plot_title_comp_box() -> go.Figure:
    """
    Draw one horizontal box per role for yearly compensation.

    Reads the module-level ``data_science_comp`` and
    ``business_analyst_comp`` frames and shows the figure through
    ``figure_show``, whose result is returned.
    """
    # (legend name, source frame, box colour) for each trace, in draw order.
    boxes = (
        ('Data Science', data_science_comp, dark_blue),
        ('Business Analyst', business_analyst_comp, honey_orange),
    )

    fig = go.Figure()
    for label, frame, colour in boxes:
        fig.add_trace(go.Box(
            x=frame.yearly_compensation,
            marker_color=colour,
            name=label))

    fig.update_xaxes(title='Total Compensation ($)')
    fig.update_layout(title='<b>Yearly Compensation:</b> Data Science and Business Analyst')
    return figure_show(fig, static=False)

plot_title_comp_box()

### No. of Years Employed and Compensation

---

In [34]:
# Compensation / tenure frame, ordered by tenure bin
comp_and_employed

In [35]:
def plot_comp_emp() -> go.Figure:    
    """
    Scatter yearly compensation against tenure bin, one trace per role.

    Reads the module-level ``data_science_comp`` and
    ``business_analyst_comp`` frames and shows the figure through
    ``figure_show``, whose result is returned.
    """
    # (legend name, source frame, marker colour) for each trace, in draw order.
    series = (
        ('Data Scientist', data_science_comp, dark_blue),
        ('Business Analyst', business_analyst_comp, honey_orange),
    )

    fig = go.Figure()
    for role, frame, colour in series:
        fig.add_trace(go.Scatter(
            x=frame.employed_bin_years,
            y=frame.yearly_compensation,
            mode='markers',
            name=role,
            marker_color=colour))

    # Slight transparency so overlapping markers remain visible.
    fig.update_traces(opacity=0.8)
    fig.update_xaxes(title='Total No. of years Employed')
    fig.update_yaxes(title='Yearly Compensation ($)')
    fig.update_layout(title='<b>Years Employed & Compensation</b>')

    return figure_show(fig, static=False)

plot_comp_emp()

In [36]:
def plot_comp_emp():
    """
    Box plot of yearly compensation per tenure bin, both roles combined.

    Reads the module-level ``comp_and_employed`` frame. Defining this
    function rebinds the name ``plot_comp_emp`` used by the scatter version
    in the previous cell.
    """
    combined_title = '<b>Years Employed & Compensation:</b><br> Combined - Data Scientist and Business Analyst'

    fig = px.box(
        comp_and_employed,
        x='employed_bin_years',
        y='yearly_compensation')
    fig.update_traces(marker_color=dark_blue)
    fig.update_xaxes(title='No. of Years Employed')
    fig.update_yaxes(title='Yearly Compensation ($)')
    fig.update_layout(title=combined_title)

    return figure_show(fig, static=False)

plot_comp_emp()

### Work Experience and Yearly Compensation

---

In [37]:
def plot_comp_exp():    
    """
    Plot average yearly compensation per experience bin.

    Reads the module-level ``comp_and_experience`` frame and shows the
    figure through ``figure_show``, whose result is returned.
    """
    # When `y` is supplied, px.histogram sums it within each bin by default,
    # so bar height mostly tracks how many respondents fall in the bin.
    # Averaging shows the pay level the axis title describes.
    fig = px.histogram(
        x=comp_and_experience.experience_years,
        y=comp_and_experience.yearly_compensation,
        histfunc='avg')
    fig.update_layout(title='<b>Compensation and Experience</b>')
    fig.update_traces(marker_color=dark_blue)
    fig.update_yaxes(title='Average Yearly Compensation ($)')
    fig.update_xaxes(title='Experience in yrs.')
    return figure_show(fig, static=False)

plot_comp_exp()