In [123]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [125]:
df = pd.read_csv('ds_salaries.csv')
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,80000,EUR,85847,ES,100,ES,L
1,2023,MI,CT,ML Engineer,30000,USD,30000,US,100,US,S
2,2023,MI,CT,ML Engineer,25500,USD,25500,US,100,US,S
3,2023,SE,FT,Data Scientist,175000,USD,175000,CA,100,CA,M
4,2023,SE,FT,Data Scientist,120000,USD,120000,CA,100,CA,M


### How many rows, columns in the data:

In [126]:
df.shape

(3755, 11)

### How many duplicated values in the dataset:

In [127]:
df[df.duplicated()]

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
115,2023,SE,FT,Data Scientist,150000,USD,150000,US,0,US,M
123,2023,SE,FT,Analytics Engineer,289800,USD,289800,US,0,US,M
153,2023,MI,FT,Data Engineer,100000,USD,100000,US,100,US,M
154,2023,MI,FT,Data Engineer,70000,USD,70000,US,100,US,M
160,2023,SE,FT,Data Engineer,115000,USD,115000,US,0,US,M
...,...,...,...,...,...,...,...,...,...,...,...
3439,2022,MI,FT,Data Scientist,78000,USD,78000,US,100,US,M
3440,2022,SE,FT,Data Engineer,135000,USD,135000,US,100,US,M
3441,2022,SE,FT,Data Engineer,115000,USD,115000,US,100,US,M
3586,2021,MI,FT,Data Engineer,200000,USD,200000,US,100,US,L


### Dropping duplicate values from data:

In [128]:
df.drop_duplicates(inplace = True)
df.shape

(2584, 11)

### How many null/missing values in each column:

In [129]:
df.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [130]:
df.dropna(inplace = True)

### Statistics of Numerical Columns:

In [131]:
df1.describe()

Unnamed: 0,work_year,salary_in_usd,remote_ratio
count,3755.0,3755.0,3755.0
mean,2022.373635,137570.38988,46.271638
std,0.691448,63055.625278,48.58905
min,2020.0,5132.0,0.0
25%,2022.0,95000.0,0.0
50%,2022.0,135000.0,0.0
75%,2023.0,175000.0,100.0
max,2023.0,450000.0,100.0


### Drop the unwanted columns:

In [132]:
df.columns

Index(['work_year', 'experience_level', 'employment_type', 'job_title',
       'salary', 'salary_currency', 'salary_in_usd', 'employee_residence',
       'remote_ratio', 'company_location', 'company_size'],
      dtype='object')

In [134]:
df = df.drop(['salary','employee_residence', 'salary_currency'], axis = 1)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_in_usd,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,85847,100,ES,L
1,2023,MI,CT,ML Engineer,30000,100,US,S
2,2023,MI,CT,ML Engineer,25500,100,US,S
3,2023,SE,FT,Data Scientist,175000,100,CA,M
4,2023,SE,FT,Data Scientist,120000,100,CA,M


In [135]:
df.rename(columns = {'salary_in_usd':'salary'}, inplace = True)
df.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,remote_ratio,company_location,company_size
0,2023,SE,FT,Principal Data Scientist,85847,100,ES,L
1,2023,MI,CT,ML Engineer,30000,100,US,S
2,2023,MI,CT,ML Engineer,25500,100,US,S
3,2023,SE,FT,Data Scientist,175000,100,CA,M
4,2023,SE,FT,Data Scientist,120000,100,CA,M


In [32]:
df.dtypes

work_year              int64
experience_level      object
employment_type       object
job_title             object
salary                 int64
salary_currency       object
salary_in_usd          int64
employee_residence    object
remote_ratio           int64
company_location      object
company_size          object
dtype: object

In [31]:
df.experience_level.value_counts()

experience_level
SE    2516
MI     805
EN     320
EX     114
Name: count, dtype: int64

In [27]:
df.job_title.value_counts()

job_title
Data Engineer                          1040
Data Scientist                          840
Data Analyst                            612
Machine Learning Engineer               289
Analytics Engineer                      103
                                       ... 
Principal Machine Learning Engineer       1
Azure Data Engineer                       1
Manager Data Management                   1
Marketing Data Engineer                   1
Finance Data Analyst                      1
Name: count, Length: 93, dtype: int64

In [147]:
job_title = df.job_title.value_counts()
job_title

job_title
Data Engineer                598
Data Scientist               538
Data Analyst                 396
Machine Learning Engineer    206
Analytics Engineer            91
                            ... 
Compliance Data Analyst        1
Deep Learning Researcher       1
Staff Data Analyst             1
Data DevOps Engineer           1
Finance Data Analyst           1
Name: count, Length: 93, dtype: int64

In [151]:
job_title_lessthan10 = job_title[job_title<10]
job_title_lessthan10

job_title
Lead Data Scientist         9
Head of Data Science        9
BI Analyst                  9
Data Science Lead           8
Principal Data Scientist    8
                           ..
Compliance Data Analyst     1
Deep Learning Researcher    1
Staff Data Analyst          1
Data DevOps Engineer        1
Finance Data Analyst        1
Name: count, Length: 64, dtype: int64

In [152]:
def handle_job_title(value):
    if (value in job_title_lessthan10):
        return 'Others'
    else:
        return value

df.job_title = df.job_title.apply(handle_job_title)
df.job_title.value_counts()

job_title
Data Engineer                               598
Data Scientist                              538
Data Analyst                                396
Others                                      213
Machine Learning Engineer                   206
Analytics Engineer                           91
Research Scientist                           65
Data Architect                               64
Data Science Manager                         52
ML Engineer                                  34
Research Engineer                            33
Applied Scientist                            31
Machine Learning Scientist                   26
Data Manager                                 23
Data Science Consultant                      23
Computer Vision Engineer                     18
Data Analytics Manager                       18
AI Scientist                                 16
Business Data Analyst                        15
BI Data Analyst                              15
Data Specialist               

In [14]:
df.salary_in_usd.value_counts()

salary_in_usd
100000    99
150000    98
120000    91
160000    84
130000    82
          ..
234100     1
223800     1
172100     1
232200     1
94665      1
Name: count, Length: 1035, dtype: int64

In [None]:
df.

In [15]:
df.employee_residence.value_counts()

employee_residence
US    3004
GB     167
CA      85
ES      80
IN      71
      ... 
BA       1
AM       1
CY       1
KW       1
MT       1
Name: count, Length: 78, dtype: int64

In [None]:
df.company_size

In [11]:
df.describe()

Unnamed: 0,work_year,salary,salary_in_usd,remote_ratio
count,3755.0,3755.0,3755.0,3755.0
mean,2022.373635,190695.6,137570.38988,46.271638
std,0.691448,671676.5,63055.625278,48.58905
min,2020.0,6000.0,5132.0,0.0
25%,2022.0,100000.0,95000.0,0.0
50%,2022.0,138000.0,135000.0,0.0
75%,2023.0,180000.0,175000.0,100.0
max,2023.0,30400000.0,450000.0,100.0


In [7]:
df.shape

(3755, 11)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           3755 non-null   int64 
 1   experience_level    3755 non-null   object
 2   employment_type     3755 non-null   object
 3   job_title           3755 non-null   object
 4   salary              3755 non-null   int64 
 5   salary_currency     3755 non-null   object
 6   salary_in_usd       3755 non-null   int64 
 7   employee_residence  3755 non-null   object
 8   remote_ratio        3755 non-null   int64 
 9   company_location    3755 non-null   object
 10  company_size        3755 non-null   object
dtypes: int64(4), object(7)
memory usage: 322.8+ KB
