In [3]:
# Data Loading and Managing
import numpy as np
import pandas as pd

# Data Visualization
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [6]:
# Read the salaries CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where a copy of the file exists at exactly this location.
csv_path = r"C:\Users\Acer-PC\Downloads\Kaggle\ds_salaries.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


In [7]:
# Print the frame's index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB


In [8]:
# Count the missing values in each column
missing_mask = df.isna()
missing_mask.sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [9]:
# Collect the names of every column whose dtype is 'object'
is_object_dtype = df.dtypes == 'object'
categorical_columns = df.columns[is_object_dtype]
categorical_columns

Index(['experience_level', 'employment_type', 'job_title', 'salary_currency',
       'employee_residence', 'company_location', 'company_size'],
      dtype='object')

In [10]:
# List the distinct values found in each categorical column,
# printing one "<column> : <values>" entry per column followed by a blank line
for col in categorical_columns:
    column_series = df[col]
    unique_values = column_series.unique()
    print(f"{col} : {unique_values}\n")

experience_level : ['SE' 'MI' 'EN' 'EX']

employment_type : ['FT' 'CT' 'FL' 'PT']

job_title : ['Principal Data Scientist' 'ML Engineer' 'Data Scientist'
 'Applied Scientist' 'Data Analyst' 'Data Modeler' 'Research Engineer'
 'Analytics Engineer' 'Business Intelligence Engineer'
 'Machine Learning Engineer' 'Data Strategist' 'Data Engineer'
 'Computer Vision Engineer' 'Data Quality Analyst'
 'Compliance Data Analyst' 'Data Architect'
 'Applied Machine Learning Engineer' 'AI Developer' 'Research Scientist'
 'Data Analytics Manager' 'Business Data Analyst' 'Applied Data Scientist'
 'Staff Data Analyst' 'ETL Engineer' 'Data DevOps Engineer' 'Head of Data'
 'Data Science Manager' 'Data Manager' 'Machine Learning Researcher'
 'Big Data Engineer' 'Data Specialist' 'Lead Data Analyst'
 'BI Data Engineer' 'Director of Data Science'
 'Machine Learning Scientist' 'MLOps Engineer' 'AI Scientist'
 'Autonomous Vehicle Technician' 'Applied Machine Learning Scientist'
 'Lead Data Scientist' 'Cloud Da

In [11]:
# Descriptive analysis: summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of the frame
df.describe()

Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
count,3755.0,3755.0,3755.0,3755.0
mean,2022.373635,190695.6,137570.38988,46.271638
std,0.691448,671676.5,63055.625278,48.58905
min,2020.0,6000.0,5132.0,0.0
25%,2022.0,100000.0,95000.0,0.0
50%,2022.0,138000.0,135000.0,0.0
75%,2023.0,180000.0,175000.0,100.0
max,2023.0,30400000.0,450000.0,100.0


In [33]:
# Donut chart showing how the data entries are distributed across work years
data_values = df['work_year'].value_counts()

# Slice styling: Cividis colour sequence with a thin honeydew outline
slice_style = dict(
    colors = px.colors.sequential.Cividis,
    line = dict(color = 'honeydew', width = 2),
)

# One slice per year, labelled with the year and its percentage share
year_pie = go.Pie(
    labels = data_values.index,
    values = data_values.values,
    hole = 0.4,
    textinfo = 'label+percent',
    insidetextorientation = 'radial',
    marker = slice_style,
)
fig = go.Figure(data = year_pie)

# Title, text annotation and figure height
fig.update_layout(
    title = "Distribution of Data Entries across Years",
    annotations = [dict(text="Year Overview", showarrow=False, font_size=20)],
    height = 600,
)

# Render the chart
fig.show()

In [50]:
# Bar chart of the average USD salary for each work year
mean_salary = df.groupby('work_year')['salary_in_usd'].mean()

# One bar per year; bar height is that year's mean salary
yearly_bars = go.Bar(
    x = mean_salary.index,
    y = mean_salary.values,
    marker = dict(color = 'steelblue'),
)
bar_fig = go.Figure(data = yearly_bars)

# Titles and figure height
bar_fig.update_layout(
    title = "Examining Mean Salary by Year",
    xaxis_title = "Working Year",
    yaxis_title = "Mean Salary",
    height = 600,
)

# Render the chart
bar_fig.show()

In [42]:
# Create a box plot to examine salary across years.
# Every individual observation is drawn next to its box (points='all') and the
# boxes are notched.
fig2 = px.box(
    data_frame = df,
    x = 'work_year',
    y = 'salary_in_usd',
    color = 'work_year',
    points = 'all',
    height = 600,
    notched = True
)

# Update layout.
# The titles are applied to fig2 (the box plot built above) rather than to a
# differently named figure, so this cell styles and displays its own plot.
fig2.update_layout(
    title = "Salary Distribution across Years",
    xaxis_title = "Year",
    yaxis_title = "Salary (USD)"
)

# Show the box plot
fig2.show()

In [54]:
# Bar chart of the number of data entries recorded for each experience level
data_value2 = df.experience_level.value_counts()

# Create bar graph: one bar per experience level, with the count printed above it
bar_fig2 = go.Figure(data = go.Bar(
    x = data_value2.index,
    y = data_value2.values,
    text = data_value2.values,
    textposition = 'outside',
    marker = dict(color = 'steelblue')
    )
)

# Update layout (title spelling corrected; y-axis labelled so the chart stands alone)
bar_fig2.update_layout(
    title = "Distribution of Data Entries by Experience Level",
    xaxis_title = "Experience Level",
    yaxis_title = "Number of Entries",
    height = 600
)

# Show plot
bar_fig2.show()

In [61]:
# Bar chart of the mean USD salary for each experience level.
# salary_in_usd is a single numeric column, so a plain mean() is sufficient.
data_value3 = df.groupby('experience_level')['salary_in_usd'].mean()

# Create bar graph.
# The bar labels are rendered from the y value with thousands separators and no
# decimals, instead of printing the raw floating-point means.
bar_fig3 = go.Figure(data = go.Bar(
    x = data_value3.index,
    y = data_value3.values,
    texttemplate = '%{y:,.0f}',
    textposition = 'outside',
    marker = dict(color = 'steelblue')
    )
)

# Update layout (y-axis labelled so the chart stands alone)
bar_fig3.update_layout(
    title = "Average Salary by Experience Level",
    xaxis_title = "Experience Level",
    yaxis_title = "Mean Salary (USD)",
    height = 600
)

# Show plot
bar_fig3.show()

In [62]:
# Faceted box plots: salary spread per experience level, one panel per work year
fig = px.box(
    data_frame = df,
    x = 'experience_level',
    y = 'salary_in_usd',
    color = 'experience_level',
    facet_col = 'work_year',
)

# Title, y-axis label and figure height
fig.update_layout(title="Salary Distribution by Experience Level", yaxis_title="Salary (USD)", height=600)

# Render the figure
fig.show()

In [67]:
# Box plot comparing the USD salary spread of each employment type
fig3 = px.box(data_frame=df, x='employment_type', y='salary_in_usd', color='employment_type', height=600)

# Title and axis labels
fig3.update_layout(
    title="Salary Distribution across Employment Type",
    xaxis_title="Employment Type",
    yaxis_title="Salary (USD)",
)

# Render the figure
fig3.show()

In [71]:
# Box plot of USD salary per work year, with one coloured box per employment type
axis_labels = {
    'work_year': 'Work Year',
    'salary_in_usd': 'Salary (USD)',
    'employment_type': 'Employment Type',
}
box_plot = px.box(
    data_frame=df,
    x='work_year',
    y='salary_in_usd',
    color='employment_type',
    title="Salary Distribution across Employment Types",
    labels=axis_labels,
)

# Legend heading and figure height
box_plot.update_layout(legend_title="Employment Type", height=600)

# Render the figure
box_plot.show()