In [14]:
# Step 1 – Data Cleaning & First Look
# Project: Data Science Job & Salary Trends 2025

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: whitegrid background, muted palette, slightly larger fonts.
# `sns.set_theme` is the preferred name; `sns.set` is only a legacy alias for it.
sns.set_theme(style="whitegrid", palette="muted", font_scale=1.1)

# --------------------------
# 1. Load the dataset
# --------------------------
# Kaggle dataset: https://www.kaggle.com/datasets/adilshamim8/salaries-for-data-science-jobs
# The path is relative to the notebook's working directory; edit RAW_DATA_PATH
# if your copy of the CSV lives somewhere else.
RAW_DATA_PATH = "../data/raw/salaries.csv"
df = pd.read_csv(RAW_DATA_PATH)

# Quick shape and peek
print("Shape of dataset:", df.shape)
df.head()

Shape of dataset: (151445, 11)


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2025,EX,FT,Head of Data,348516,USD,348516,US,0,US,M
1,2025,EX,FT,Head of Data,232344,USD,232344,US,0,US,M
2,2025,SE,FT,Data Scientist,145400,USD,145400,US,0,US,M
3,2025,SE,FT,Data Scientist,81600,USD,81600,US,0,US,M
4,2025,MI,FT,Engineer,160000,USD,160000,US,100,US,M


In [16]:
# --------------------------
# 2. Data overview
# --------------------------
# Column names, non-null counts, dtypes and memory footprint
df.info()

# Number of null entries in each column, printed under a header line
print("\nMissing values per column:", df.isna().sum(), sep="\n")

# Summary statistics for every column (numeric and non-numeric),
# flipped so each original column becomes one row
df.describe(include="all").T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151445 entries, 0 to 151444
Data columns (total 11 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   work_year           151445 non-null  int64 
 1   experience_level    151445 non-null  object
 2   employment_type     151445 non-null  object
 3   job_title           151445 non-null  object
 4   salary              151445 non-null  int64 
 5   salary_currency     151445 non-null  object
 6   salary_in_usd       151445 non-null  int64 
 7   employee_residence  151445 non-null  object
 8   remote_ratio        151445 non-null  int64 
 9   company_location    151445 non-null  object
 10  company_size        151445 non-null  object
dtypes: int64(4), object(7)
memory usage: 12.7+ MB

Missing values per column:
work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
work_year,151445.0,,,,2024.435313,0.671842,2020.0,2024.0,2025.0,2025.0,2025.0
experience_level,151445.0,4.0,SE,87491.0,,,,,,,
employment_type,151445.0,4.0,FT,150541.0,,,,,,,
job_title,151445.0,422.0,Data Scientist,18751.0,,,,,,,
salary,151445.0,,,,162837.963135,208012.396413,14000.0,106000.0,147000.0,199000.0,30400000.0
salary_currency,151445.0,26.0,USD,143173.0,,,,,,,
salary_in_usd,151445.0,,,,157527.458411,74150.772377,15000.0,105800.0,146100.0,198000.0,800000.0
employee_residence,151445.0,104.0,US,135506.0,,,,,,,
remote_ratio,151445.0,,,,20.938625,40.620393,0.0,0.0,0.0,0.0,100.0
company_location,151445.0,97.0,US,135569.0,,,,,,,


In [None]:
# --------------------------
# 3. Initial Cleaning
# --------------------------
# Every step below removes or rewrites data, so row counts are captured around
# each one and reported at the end instead of being dropped silently.
rows_loaded = len(df)

# Standardize column names (trim, lowercase, spaces -> underscores).
# regex=False makes the space a literal match regardless of pandas version.
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_", regex=False)

# Drop rows that are exact duplicates across all columns.
# NOTE(review): no record-id column is visible in this dataset, so identical rows
# may be distinct respondents rather than repeats — confirm dropping is intended.
df = df.drop_duplicates()
rows_deduped = len(df)

# Coerce salary_in_usd to numeric (unparseable values become NaN) and drop the
# rows left without a salary. `assign` builds a new frame instead of writing
# into the result of the previous filter.
df = df.assign(
    salary_in_usd=pd.to_numeric(df["salary_in_usd"], errors="coerce")
).dropna(subset=["salary_in_usd"])

print("Duplicate rows removed:", rows_loaded - rows_deduped)
print("Rows removed for missing/invalid salary_in_usd:", rows_deduped - len(df))
print("Cleaned shape:", df.shape)