# Outlier Detection and Treatment

In [22]:
import pandas as pd
import numpy as np

In [23]:
dataset = pd.read_csv("Dataset salary 2024 outlier.csv")

In [25]:
dataset.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2024.0,SE,FT,AI Engineer,20273040000.0,USD,202730.0,US,0.0,US,M
1,2024.0,SE,FT,AI Engineer,92118.0,USD,92118.0,US,0.0,US,M
2,2024.0,SE,FT,Data Engineer,130500.0,USD,130500.0,US,0.0,US,M
3,,SE,FT,Data Engineer,96000.0,USD,96000.0,US,0.0,US,M
4,2024.0,SE,FT,,-444444000000.0,USD,190000.0,US,0.0,US,M


In [26]:
dataset.isnull().sum()

work_year             5
experience_level      1
employment_type       5
job_title             2
salary                4
salary_currency       1
salary_in_usd         3
employee_residence    0
remote_ratio          9
company_location      4
company_size          3
dtype: int64

In [27]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16534 entries, 0 to 16533
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   work_year           16529 non-null  float64
 1   experience_level    16533 non-null  object 
 2   employment_type     16529 non-null  object 
 3   job_title           16532 non-null  object 
 4   salary              16530 non-null  float64
 5   salary_currency     16533 non-null  object 
 6   salary_in_usd       16531 non-null  float64
 7   employee_residence  16534 non-null  object 
 8   remote_ratio        16525 non-null  float64
 9   company_location    16530 non-null  object 
 10  company_size        16531 non-null  object 
dtypes: float64(4), object(7)
memory usage: 1.4+ MB


In [28]:
for column in dataset.columns:
    if dataset[column].dtype in ['int64', 'float64']:
        dataset[column].fillna(dataset[column].mean(),inplace = True)

In [29]:
for column in dataset.columns:
    if dataset[column].dtype in ['object']:
        dataset[column].fillna(dataset[column].mode()[0],inplace = True)

In [30]:
dataset.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

In [31]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16534 entries, 0 to 16533
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   work_year           16534 non-null  float64
 1   experience_level    16534 non-null  object 
 2   employment_type     16534 non-null  object 
 3   job_title           16534 non-null  object 
 4   salary              16534 non-null  float64
 5   salary_currency     16534 non-null  object 
 6   salary_in_usd       16534 non-null  float64
 7   employee_residence  16534 non-null  object 
 8   remote_ratio        16534 non-null  float64
 9   company_location    16534 non-null  object 
 10  company_size        16534 non-null  object 
dtypes: float64(4), object(7)
memory usage: 1.4+ MB


# Outlier

### IQR

In [32]:
# Identify numerical columns
numerical_columns = dataset.select_dtypes(include=[np.number]).columns
print(f"Numerical columns: {numerical_columns}")

Numerical columns: Index(['work_year', 'salary', 'salary_in_usd', 'remote_ratio'], dtype='object')


In [33]:
# Calculate quartiles and IQR for each column
Q1 = dataset[numerical_columns].quantile(0.25)
Q3 = dataset[numerical_columns].quantile(0.75)
IQR = Q3 - Q1


In [34]:
# Define the boundaries to identify outliers for each column
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [35]:
# Print and handle outliers for each numerical column
for col in dataset[numerical_columns]:
    outliers_below = dataset[col] < lower_bound[col]
    outliers_above = dataset[col] > upper_bound[col]
    
    # Print outliers below the lower bound
    if outliers_below.any():
        print(f"\nOutliers below lower bound in column '{col}':")
        print(dataset.loc[outliers_below, col])
    
    # Print outliers above the upper bound
    if outliers_above.any():
        print(f"\nOutliers above upper bound in column '{col}':")
        print(dataset.loc[outliers_above, col])


Outliers below lower bound in column 'work_year':
9198     2020.0
9401     2020.0
11233    2021.0
11234    2020.0
11711    2020.0
          ...  
16528    2021.0
16529    2020.0
16530    2021.0
16531    2020.0
16532    2020.0
Name: work_year, Length: 290, dtype: float64

Outliers below lower bound in column 'salary':
4     -4.444440e+11
8     -2.549697e+07
15    -2.549697e+07
517   -2.549697e+07
577   -2.549697e+07
Name: salary, dtype: float64

Outliers above upper bound in column 'salary':
0        2.027304e+10
6        4.000000e+05
145      8.000000e+05
309      3.850000e+05
385      3.700000e+05
             ...     
16509    1.335000e+06
16514    1.450000e+06
16526    4.230000e+05
16529    4.120000e+05
16533    7.000000e+06
Name: salary, Length: 346, dtype: float64

Outliers above upper bound in column 'salary_in_usd':
6        4.000000e+05
14       6.794643e+10
52       1.123220e+15
309      3.850000e+05
385      3.700000e+05
             ...     
16316    4.500000e+05
16461    4

In [36]:
# Handling outliers using NumPy for each column
for col in dataset[numerical_columns]:
    dataset[col] = np.where(dataset[col] < lower_bound[col], lower_bound[col], np.where(dataset[col] > upper_bound[col], upper_bound[col], dataset[col]))

In [37]:
# Print and handle outliers for each numerical column
for col in dataset[numerical_columns]:
    outliers_below = dataset[col] < lower_bound[col]
    outliers_above = dataset[col] > upper_bound[col]
    
    # Print outliers below the lower bound
    if outliers_below.any():
        print(f"\nOutliers below lower bound in column '{col}':")
        print(dataset.loc[outliers_below, col])
    
    # Print outliers above the upper bound
    if outliers_above.any():
        print(f"\nOutliers above upper bound in column '{col}':")
        print(dataset.loc[outliers_above, col])