<div
    align="center"
    style="
        background-color: #082f49; 
        font-size: 30px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 10px; 
        line-height: 1.5; 
        border-radius: 15px;
        margin: 10px 0;
    "
>
    <h1>Data Science Salaries Analysis</h1>
</div>


<div 
    align="left" 
    style="
        background-color: #075985; 
        font-size: 24px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 10px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 10px 0;
    "
>
    <h3>1 - General Setup</h3>
</div>

<div 
    align="left" 
    style="
        background-color: #0284c7; 
        font-size: 20px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 5px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 5px 0;
    "
>
    <h4>1.1 - Import libraries</h4>
</div>

In [1]:
import os
import warnings

import numpy as np
import pandas as pd

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.patches import FancyBboxPatch

import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

from skimpy import skim
import statsmodels.api as sm
from wordcloud import WordCloud
import nltk
from sklearn.preprocessing import LabelEncoder
import plotly.io as pio

<div 
    align="left" 
    style="
        background-color: #0284c7; 
        font-size: 20px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 5px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 5px 0;
    "
>
    <h4>1.2 - Visualization setup</h4>
</div>

In [2]:
# Notebook-wide display configuration: plotly renderer, plotting themes,
# pandas display options and warning handling.
pio.renderers.default = 'notebook'

%matplotlib inline 
# Charts are displayed within cells
# The seaborn theme is applied first and matplotlib's 'dark_background'
# style sheet afterwards, so the order of these two calls matters.
sns.set_theme(style="dark")
plt.style.use('dark_background')

# Modify display settings
# '{:.0f}' shows every float with zero decimals (e.g. percentages appear as
# whole numbers); None removes the column and row display limits.
pd.set_option('display.float_format', '{:.0f}'.format)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Ignore warnings
# NOTE(review): this filter applies to every warning category for the rest of
# the session, including deprecation notices from the imported libraries.
warnings.filterwarnings("ignore")

<div 
    align="left" 
    style="
        background-color: #0284c7; 
        font-size: 20px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 5px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 5px 0;
    "
>
    <h4>1.3 - General functions</h4>
</div>

In [3]:
import matplotlib as mpl
import pandas as pd

def styleDF(
    df,
    cmap_colors=("#e5e7eb", "#172554"),
    font_size="14pt",
    header_font_size="16pt",
    padding="8px"
):
    """Return a pandas Styler that renders ``df`` centered with a color gradient.

    Parameters:
        df: DataFrame to style (its ``.style`` accessor is used).
        cmap_colors: sequence of color specs used to build the gradient
            colormap. The default is an immutable tuple so it cannot be
            altered between calls; lists are accepted as before.
        font_size: CSS font size applied to the data cells.
        header_font_size: CSS font size applied to the ``th`` header cells.
        padding: CSS padding applied to both data and header cells.

    Returns:
        The Styler produced by chaining set_properties, set_table_styles and
        background_gradient on ``df.style``.
    """
    # Build the colormap from a fresh list so any sequence type works
    customCMap = mpl.colors.LinearSegmentedColormap.from_list(
        "customCMap",
        list(cmap_colors)
    )

    # CSS for the data cells
    cellProperties = {
        'font-size': font_size,
        'text-align': 'center',
        'padding': padding
    }

    # CSS for the header cells
    headerStyle = {
        'selector': 'th',
        'props': [
            ('font-size', header_font_size),
            ('text-align', 'center'),
            ('padding', padding)
        ]
    }

    # Style the DataFrame
    return (
        df.style
        .set_properties(**cellProperties)
        .set_table_styles([headerStyle])
        .background_gradient(cmap=customCMap)
    )


<div 
    align="left" 
    style="
        background-color: #075985; 
        font-size: 24px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 10px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 10px 0;
    "
>
    <h3>2 - Initial data visualization</h3>
</div>

<div 
    align="left" 
    style="
        background-color: #0284c7; 
        font-size: 20px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 5px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 5px 0;
    "
>
    <h4>2.1 - Load dataset</h4>
</div>

In [6]:
# Build dynamic file path
# Each folder is passed as its own component so os.path.join inserts the
# separator of the current operating system; a single string with hard-coded
# backslashes only resolves on Windows.
homeDirectory = os.path.expanduser("~")
filePath = os.path.join(
    homeDirectory,
    'Desktop',
    'Data Science Salaries',
    'DataScience_salaries_2024.csv'
)

# Load CSV file
dfSalaries = pd.read_csv(filePath)

<div 
    align="left" 
    style="
        background-color: #0284c7; 
        font-size: 20px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 5px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 5px 0;
    "
>
    <h4>2.2 - Visualize data structure</h4>
</div>

In [7]:
# Pass the freshly loaded frame to skim (imported from the skimpy package)
# to get an overview of its structure.
skim(dfSalaries)

<div 
    align="left" 
    style="
        background-color: #0284c7; 
        font-size: 20px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 5px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 5px 0;
    "
>
    <h4>2.3 - Visualize general data redundancy</h4>
</div>

In [8]:
# One line per column of dfSalaries. 'Count' and 'Duplicated' are scalars
# (total rows and fully duplicated rows), so the same number is repeated on
# every line; the other entries are computed per column.
pd.DataFrame({
    'Count': len(dfSalaries),
    'Null': dfSalaries.isna().sum(),
    'Null %': dfSalaries.isna().mean() * 100,
    'Unique': dfSalaries.nunique(),
    'Duplicated': dfSalaries.duplicated().sum(),
})

Unnamed: 0,Count,Null,Null %,Unique,Duplicated
work_year,14838,0,0,5,5711
experience_level,14838,0,0,4,5711
employment_type,14838,0,0,4,5711
job_title,14838,0,0,153,5711
salary,14838,0,0,2363,5711
salary_currency,14838,0,0,23,5711
salary_in_usd,14838,0,0,2730,5711
employee_residence,14838,0,0,88,5711
remote_ratio,14838,0,0,3,5711
company_location,14838,0,0,77,5711


<div 
    align="left" 
    style="
        background-color: #075985; 
        font-size: 24px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 10px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 10px 0;
    "
>
    <h3>3 - Data filtering and organization</h3>
</div>

<div 
    align="left" 
    style="
        background-color: #0284c7; 
        font-size: 20px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 5px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 5px 0;
    "
>
    <h4>3.1 - Rename experience levels</h4>
</div>

In [9]:
def mapExpLevel(df, oldCol='experience_level', newCol='experience_level'):
    """Replace experience-level codes with readable labels.

    The codes EN, MI, SE and EX found in ``df[oldCol]`` become Junior,
    Mid-Level, Senior and Expert; values that are not one of these codes are
    left as they are. The result is written to ``df[newCol]`` on the frame
    that was passed in, and that same frame is returned.
    """
    codeToLabel = dict(
        EN='Junior',
        MI='Mid-Level',
        SE='Senior',
        EX='Expert',
    )

    relabeled = df[oldCol].replace(codeToLabel)
    df[newCol] = relabeled
    return df

# With the default arguments the source and target column are both
# 'experience_level', so the codes are overwritten by their labels.
dfSalaries = mapExpLevel(dfSalaries)

<div 
    align="left" 
    style="
        background-color: #0284c7; 
        font-size: 20px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 5px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 5px 0;
    "
>
    <h4>3.2 - Visualize new experience levels</h4>
</div>

In [10]:
# Tally the rows per experience level
empExpCounts = dfSalaries['experience_level'].value_counts()

# Turn the tally into a two-column frame with display-friendly headers
dfEmpExpCounts = (
    empExpCounts
    .rename_axis('Experience Level')
    .reset_index(name='Count')
)

# Light-gray to dark-blue colormap (styleDF below is called without it)
dfEmpExpCountsColors = mpl.colors.LinearSegmentedColormap.from_list(
    "custom_blue",
    ["#e5e7eb", "#172554"],
)

# Render the styled table as the cell output
styleDF(dfEmpExpCounts)

Unnamed: 0,Experience Level,Count
0,Senior,9696
1,Mid-Level,3553
2,Junior,1148
3,Expert,441


<div 
    align="left" 
    style="
        background-color: #0284c7; 
        font-size: 20px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 5px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 5px 0;
    "
>
    <h4>3.3 - Rename employment types</h4>
</div>

In [11]:
def mapEmploymentType(df, oldCol='employment_type', newCol='employment_type'):
    """Translate employment-type codes into readable labels.

    The codes FT, PT, CT and FL in ``df[oldCol]`` become Full-Time,
    Part-Time, Contract and Freelance. Values that already are one of these
    labels are kept, so calling the function again on its own output (which
    happens when the cell is re-run, because the default source and target
    column are the same) does not turn every row into 'Other'. Anything else,
    including missing values, becomes 'Other'.

    The result is written to ``df[newCol]`` on the frame that was passed in,
    and that same frame is returned.
    """
    employmentType = {
        'FT': 'Full-Time',
        'PT': 'Part-Time',
        'CT': 'Contract',
        'FL': 'Freelance'
    }

    source = df[oldCol]

    # Rows whose value is already a final label must survive a second pass
    alreadyLabeled = source.isin(list(employmentType.values()))

    df[newCol] = source\
        .map(employmentType)\
        .fillna('Other')\
        .mask(alreadyLabeled, source)

    return df

# With the default arguments the source and target column are both
# 'employment_type', so the codes are overwritten by their labels.
dfSalaries = mapEmploymentType(dfSalaries)

<div 
    align="left" 
    style="
        background-color: #0284c7; 
        font-size: 20px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 5px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 5px 0;
    "
>
    <h4>3.4 - Visualize new employment types</h4>
</div>

In [12]:
# Tally the rows per employment type
empTypeCounts = dfSalaries['employment_type'].value_counts()

# Turn the tally into a two-column frame with display-friendly headers
dfEmpTypeCounts = (
    empTypeCounts
    .rename_axis('Employment Type')
    .reset_index(name='Count')
)

# Light-gray to dark-blue colormap (styleDF below is called without it)
dfEmpTypeCountsColors = mpl.colors.LinearSegmentedColormap.from_list(
    "custom_blue",
    ["#e5e7eb", "#172554"],
)

# Render the styled table as the cell output
styleDF(dfEmpTypeCounts)

Unnamed: 0,Employment Type,Count
0,Full-Time,14772
1,Part-Time,27
2,Contract,26
3,Freelance,13


<div 
    align="left" 
    style="
        background-color: #0284c7; 
        font-size: 20px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 5px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 5px 0;
    "
>
    <h4>3.5 - Rename employment models</h4>
</div>

In [13]:
def mapEmploymentModel(df, oldCol='remote_ratio', newCol='employment_model'):
    """Derive a readable employment model from the remote-ratio column.

    The values 0, 50 and 100 in ``df[oldCol]`` become In-Person, Hybrid and
    Remote; every other value, including missing ones, becomes 'Other'. The
    result is written to ``df[newCol]`` on the frame that was passed in, and
    that same frame is returned.
    """
    ratioToModel = {0: 'In-Person', 50: 'Hybrid', 100: 'Remote'}

    # Unknown ratios come back as missing values from map() and are filled next
    models = df[oldCol].map(ratioToModel)
    df[newCol] = models.fillna('Other')

    return df

# mapEmploymentModel adds 'employment_model' to the frame it receives and
# returns that same object, so dfEmploymentModel and dfSalaries refer to one
# DataFrame here; the next cell reads the new column through dfSalaries.
dfEmploymentModel = mapEmploymentModel(dfSalaries)

<div 
    align="left" 
    style="
        background-color: #0284c7; 
        font-size: 20px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 5px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 5px 0;
    "
>
    <h4>3.6 - Visualize new employment models</h4>
</div>

In [14]:
# Tally the rows per employment model
empModelCounts = dfSalaries['employment_model'].value_counts()

# Turn the tally into a two-column frame with display-friendly headers
dfEmpModelCounts = (
    empModelCounts
    .rename_axis('Employment Model')
    .reset_index(name='Count')
)

# Light-gray to dark-blue colormap (styleDF below is called without it)
dfEmpModelCountsColors = mpl.colors.LinearSegmentedColormap.from_list(
    "custom_blue",
    ["#e5e7eb", "#172554"],
)

# Render the styled table as the cell output
styleDF(dfEmpModelCounts)

Unnamed: 0,Employment Model,Count
0,In-Person,9853
1,Remote,4737
2,Hybrid,248


<div 
    align="left" 
    style="
        background-color: #0284c7; 
        font-size: 20px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 5px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 5px 0;
    "
>
    <h4>3.7 - Detect outliers</h4>
</div>

In [15]:
# Quartiles of salary_in_usd, one quantile() call per requested level
Q1, Q2, Q3 = (
    dfSalaries["salary_in_usd"].quantile(level)
    for level in (0.25, 0.50, 0.75)
)
print(f"Q1={Q1}, Q2={Q2}, Q3={Q3}\n")

# Interquartile range
IQR = Q3 - Q1
print(f"IQR={IQR}\n")

# 1.5 * IQR fences; the MIN/MAX printed below are these fences, not the
# smallest and largest salary in the data
lowerBound, upperBound = Q1 - (1.5 * IQR), Q3 + (1.5 * IQR)

print(f"MIN={lowerBound}, MAX={upperBound}")

Q1=102000.0, Q2=141300.0, Q3=185900.0

IQR=83900.0

MIN=-23850.0, MAX=311750.0


<div 
    align="left" 
    style="
        background-color: #0284c7; 
        font-size: 20px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 5px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 5px 0;
    "
>
    <h4>3.8 - Handle outliers</h4>
</div>

In [16]:
# Flag the salaries that fall outside the lowerBound/upperBound fences
salaryData = dfSalaries['salary_in_usd']
outlierMask = (salaryData < lowerBound) | (salaryData > upperBound)

outliers = salaryData[outlierMask]
outliersCount = len(outliers)

print(f"Quantity of outliers: {outliersCount}")

# Keep only the rows that were not flagged
dfSalaries = dfSalaries[~outlierMask]

Quantity of outliers: 270


<div 
    align="left" 
    style="
        background-color: #075985; 
        font-size: 24px; 
        font-family: 'Arial', sans-serif; 
        color: #F9F9F9; 
        padding: 10px; 
        line-height: 1; 
        border-radius: 15px;
        margin: 10px 0;
    "
>
    <h3>4 - Save data in PostgreSQL database</h3>
</div>