In [None]:
import kagglehub # type: ignore
import pandas as pd # type: ignore
import plotly.express as px # type: ignore
import os

# Fetch the data science salaries dataset from Kaggle.
# dataset_download returns the local folder where the dataset files were stored.
dataset_handle = "sazidthe1/data-science-salaries"
path = kagglehub.dataset_download(dataset_handle)

print(f"Path to dataset files: {path}")

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: C:\Users\david\.cache\kagglehub\datasets\sazidthe1\data-science-salaries\versions\2


In [2]:
# Extract the data from the CSV into a DataFrame.
# A copy of the file in the working directory is used when present (the original
# behaviour). Otherwise fall back to the folder kagglehub downloaded into `path`,
# so a fresh "Restart & Run All" does not depend on a manually copied file.
# NOTE(review): assumes the Kaggle dataset ships the CSV under this same file name - confirm.
csv_name = "data_science_salaries.csv"
csv_file = csv_name if os.path.exists(csv_name) else os.path.join(path, csv_name)
data_science_market = pd.read_csv(csv_file)

In [3]:
# Quick review of the dataset's composition.
# info() prints the column names, non-null counts and dtypes; head() is the last
# expression of the cell, so the first five rows are rendered as the cell output.
data_science_market.info()
data_science_market.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6599 entries, 0 to 6598
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   job_title           6599 non-null   object
 1   experience_level    6599 non-null   object
 2   employment_type     6599 non-null   object
 3   work_models         6599 non-null   object
 4   work_year           6599 non-null   int64 
 5   employee_residence  6599 non-null   object
 6   salary              6599 non-null   int64 
 7   salary_currency     6599 non-null   object
 8   salary_in_usd       6599 non-null   int64 
 9   company_location    6599 non-null   object
 10  company_size        6599 non-null   object
dtypes: int64(3), object(8)
memory usage: 567.2+ KB


Unnamed: 0,job_title,experience_level,employment_type,work_models,work_year,employee_residence,salary,salary_currency,salary_in_usd,company_location,company_size
0,Data Engineer,Mid-level,Full-time,Remote,2024,United States,148100,USD,148100,United States,Medium
1,Data Engineer,Mid-level,Full-time,Remote,2024,United States,98700,USD,98700,United States,Medium
2,Data Scientist,Senior-level,Full-time,Remote,2024,United States,140032,USD,140032,United States,Medium
3,Data Scientist,Senior-level,Full-time,Remote,2024,United States,100022,USD,100022,United States,Medium
4,BI Developer,Mid-level,Full-time,On-site,2024,United States,120000,USD,120000,United States,Medium


In [4]:
# Inspect the candidate grouping variables to decide which analyses to run.
# Columns with few categories are preferred: experience_level has 4 distinct values
# and work_models has 3, so both are good candidates.
print(
    *(
        data_science_market[column].unique()
        for column in ("experience_level", "work_models", "employment_type")
    ),
    sep="\n",
)

# Useful variables: "experience_level", "employment_type", "work_models".
# Two graphs are planned, both restricted to full-time employment:
#   1. Average salary by experience level.
#   2. Salary by work model.

# Keep only the rows whose employment type is Full-time.
full_time_ds_market = data_science_market.loc[
    lambda market: market["employment_type"] == "Full-time"
]
# Only 47 entries are dropped: almost all data-related workers are full-time.
full_time_ds_market.info()


['Mid-level' 'Senior-level' 'Entry-level' 'Executive-level']
['Remote' 'On-site' 'Hybrid']
['Full-time' 'Part-time' 'Contract' 'Freelance']
<class 'pandas.core.frame.DataFrame'>
Index: 6552 entries, 0 to 6598
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   job_title           6552 non-null   object
 1   experience_level    6552 non-null   object
 2   employment_type     6552 non-null   object
 3   work_models         6552 non-null   object
 4   work_year           6552 non-null   int64 
 5   employee_residence  6552 non-null   object
 6   salary              6552 non-null   int64 
 7   salary_currency     6552 non-null   object
 8   salary_in_usd       6552 non-null   int64 
 9   company_location    6552 non-null   object
 10  company_size        6552 non-null   object
dtypes: int64(3), object(8)
memory usage: 614.2+ KB


In [5]:
#Function to generate the data to be used. Same as previous but in a function. 

def data(csv_path="data_science_salaries.csv"):
    """Load the data science salaries CSV and keep only full-time positions.

    Args:
        csv_path (str or path-like): Location of the salaries CSV. Defaults to
            "data_science_salaries.csv", which is resolved against the current
            working directory (the original hard-coded behaviour).

    Returns:
        pd.DataFrame: The rows whose 'employment_type' equals "Full-time". The
        original row labels are kept (the index is not reset).
    """
    ds_market = pd.read_csv(csv_path)
    is_full_time = ds_market["employment_type"] == "Full-time"
    # Return an explicit copy so callers can add or modify columns without
    # pandas warning about writing to a slice of the loaded frame.
    return ds_market[is_full_time].copy()

In [None]:
#Important Note. No scatterplot is viable with this dataset as there are not 2 continuous numeric variables. A bar graph will be made instead.

def avg_salary_by_experience_bar_graph(df):
    """
    Generates a bar graph of average salary grouped by experience level.

    Bars are ordered by seniority (Entry, Mid, Senior, Executive) instead of the
    alphabetical order that groupby produces.

    Parameters:
    df (pd.DataFrame): A DataFrame containing 'experience_level' and 'salary_in_usd' columns.

    Returns:
    fig: The bar chart figure returned by px.bar.

    Raises:
    ValueError: If 'experience_level' or 'salary_in_usd' is missing from df.
    """
    # Ensure columns exist
    if 'experience_level' not in df.columns or 'salary_in_usd' not in df.columns:
        raise ValueError("DataFrame must contain 'experience_level' and 'salary_in_usd' columns")

    # Group by experience level and compute the average salary
    avg_salary = df.groupby('experience_level', as_index=False)['salary_in_usd'].mean()

    # groupby sorts the levels alphabetically, which would place Executive-level
    # between Entry-level and Mid-level. Build an explicit seniority ordering,
    # limited to the levels actually present so no empty slot is requested;
    # any level outside the known list keeps its grouped position at the end.
    seniority_order = ['Entry-level', 'Mid-level', 'Senior-level', 'Executive-level']
    levels_present = avg_salary['experience_level'].tolist()
    level_order = [level for level in seniority_order if level in levels_present]
    level_order += [level for level in levels_present if level not in seniority_order]

    # Create the bar chart
    fig = px.bar(avg_salary, x='experience_level', y='salary_in_usd',
                 title="Average Salary by Experience Level",
                 labels={'salary_in_usd': 'Average Salary', 'experience_level': 'Experience Level'},
                 color='experience_level',
                 category_orders={'experience_level': level_order})

    return fig

    

In [None]:
#Function to make a histogram of the salary distribution for each work model
def salary_by_work_model_histogram(df, nbins=10):
    """
    Generates an overlaid histogram of salary, with one colour per work model.

    Args:
        df (pd.DataFrame): A DataFrame containing 'work_models' and 'salary_in_usd' columns.
        nbins (int): Bin count passed through to px.histogram. Defaults to 10,
            the value that was previously hard-coded.

    Returns:
        The histogram figure returned by px.histogram.

    Raises:
        ValueError: If 'work_models' or 'salary_in_usd' is missing from df.
    """
    if "work_models" not in df.columns or "salary_in_usd" not in df.columns:
        # The message names the column that is actually checked ('work_models').
        raise ValueError("Dataframe must contain 'work_models' and 'salary_in_usd' columns.")
    hist = px.histogram(df, x="salary_in_usd", color="work_models",
                        title="Salary Distribution by Work Model",
                        labels={"salary_in_usd": "Salary (USD)", "work_models": "Work Model"},
                        nbins=nbins, barmode="overlay")  # Overlaid histograms
    return hist