In [3]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

#### Read data

In [4]:
salary_data = pd.read_csv('data/Dataset.csv')

salary_data.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [5]:
salary_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 375 entries, 0 to 374
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Age                  373 non-null    float64
 1   Gender               373 non-null    object 
 2   Education Level      373 non-null    object 
 3   Job Title            373 non-null    object 
 4   Years of Experience  373 non-null    float64
 5   Salary               373 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.7+ KB


In [6]:
salary_data.shape

(375, 6)

#### Rename coloumns

In [7]:
salary_data.columns=['Age', 'Gender', 'Degree', 'Job_Title', 'Experience_years', 'Salary']

salary_data.head()

Unnamed: 0,Age,Gender,Degree,Job_Title,Experience_years,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


In [8]:
salary_data.dtypes

Age                 float64
Gender               object
Degree               object
Job_Title            object
Experience_years    float64
Salary              float64
dtype: object

#### Check and drop duplicate records

In [9]:

duplicate_rows = salary_data.duplicated()

# Check if there are any duplicate rows
has_duplicates = duplicate_rows.any()

# Print result
print(f"Are there any duplicate rows? {has_duplicates}")

# If you want to see the count of duplicate rows
duplicate_count = duplicate_rows.sum()
print(f"Number of duplicate rows: {duplicate_count}")


Are there any duplicate rows? True
Number of duplicate rows: 50


In [10]:
salary_data1 = salary_data.drop_duplicates(keep = 'first')

salary_data1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 325 entries, 0 to 371
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Age               324 non-null    float64
 1   Gender            324 non-null    object 
 2   Degree            324 non-null    object 
 3   Job_Title         324 non-null    object 
 4   Experience_years  324 non-null    float64
 5   Salary            324 non-null    float64
dtypes: float64(3), object(3)
memory usage: 17.8+ KB


In [11]:
salary_data1.shape

(325, 6)

#### Missing values

In [12]:
salary_data1.isnull().sum()

Age                 1
Gender              1
Degree              1
Job_Title           1
Experience_years    1
Salary              1
dtype: int64

In [13]:
salary_data1.dropna(how = 'any', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  salary_data1.dropna(how = 'any', inplace=True)


In [14]:
salary_data1.shape

(324, 6)

In [15]:
salary_data1.head()

Unnamed: 0,Age,Gender,Degree,Job_Title,Experience_years,Salary
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0
4,52.0,Male,Master's,Director,20.0,200000.0


## Data Exploration and Visualization

#### Statistics of numeric columns

In [17]:
salary_data1.describe()

Unnamed: 0,Age,Experience_years,Salary
count,324.0,324.0,324.0
mean,37.382716,10.058642,99985.648148
std,7.185844,6.65047,48652.27144
min,23.0,0.0,350.0
25%,31.0,4.0,55000.0
50%,36.5,9.0,95000.0
75%,44.0,16.0,140000.0
max,53.0,25.0,250000.0
