# Revision

In [9]:
import pandas as pd 
import numpy as np

#### 1)Understand the Data

In [10]:
dataset = pd.read_csv('Dataset salary re.csv')

In [11]:
dataset.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,Unnamed: 11
0,2024.0,SE,FT,AI Engineer,202730.0,USD,202730,US,0.0,US,M,7.0
1,2024.0,SE,FT,AI Engineer,202730.0,USD,202730,US,0.0,US,M,7.0
2,2024.0,SE,FT,AI Engineer,202730.0,USD,202730,US,0.0,US,M,7.0
3,2024.0,SE,FT,AI Engineer,202730.0,USD,202730,US,0.0,US,M,7.0
4,2024.0,SE,FT,Data Engineer,130500.0,USD,130500,US,0.0,US,M,7.0


In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16536 entries, 0 to 16535
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   work_year           16534 non-null  float64
 1   experience_level    16535 non-null  object 
 2   employment_type     16535 non-null  object 
 3   job_title           16536 non-null  object 
 4   salary              16533 non-null  float64
 5   salary_currency     16536 non-null  object 
 6   salary_in_usd       16536 non-null  int64  
 7   employee_residence  16536 non-null  object 
 8   remote_ratio        16536 non-null  float64
 9   company_location    16535 non-null  object 
 10  company_size        16536 non-null  object 
 11  Unnamed: 11         1971 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 1.5+ MB


#### 2)Remove Unnecessary Data

In [13]:
# Remove duplicate rows
dataset.drop_duplicates(inplace=True)

In [14]:
dataset.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,Unnamed: 11
0,2024.0,SE,FT,AI Engineer,202730.0,USD,202730,US,0.0,US,M,7.0
4,2024.0,SE,FT,Data Engineer,130500.0,USD,130500,US,0.0,US,M,7.0
5,2024.0,SE,FT,Data Engineer,96000.0,USD,96000,US,0.0,US,M,7.0
6,2024.0,SE,FT,Machine Learning Engineer,190000.0,USD,190000,US,0.0,US,M,7.0
7,2024.0,SE,FT,Machine Learning Engineer,160000.0,USD,160000,US,0.0,US,M,7.0


In [15]:
# Calculate the percentage of null values in each column
null_percentage = dataset.isnull().mean() * 100

# Display the result
print(null_percentage)

work_year              0.018882
experience_level       0.009441
employment_type        0.009441
job_title              0.000000
salary                 0.028323
salary_currency        0.000000
salary_in_usd          0.000000
employee_residence     0.000000
remote_ratio           0.000000
company_location       0.009441
company_size           0.000000
Unnamed: 11           84.658233
dtype: float64


In [16]:
dataset = dataset.drop('Unnamed: 11', axis=1)

In [17]:
dataset.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2024.0,SE,FT,AI Engineer,202730.0,USD,202730,US,0.0,US,M
4,2024.0,SE,FT,Data Engineer,130500.0,USD,130500,US,0.0,US,M
5,2024.0,SE,FT,Data Engineer,96000.0,USD,96000,US,0.0,US,M
6,2024.0,SE,FT,Machine Learning Engineer,190000.0,USD,190000,US,0.0,US,M
7,2024.0,SE,FT,Machine Learning Engineer,160000.0,USD,160000,US,0.0,US,M


#### 3)Handle Missing Values

In [19]:
# Check for missing values
dataset.isnull().sum()

work_year             2
experience_level      1
employment_type       1
job_title             0
salary                3
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      1
company_size          0
dtype: int64

In [20]:
# Mean imputation for numerical columns
for column in dataset.columns:
    if dataset[column].dtype in ['int64', 'float64']:
        dataset[column].fillna(dataset[column].mean(), inplace=True)

In [21]:
# mode imputation for numerical columns
for column in dataset.columns:
    if dataset[column].dtype in ['object']:
        dataset[column].fillna(dataset[column].mode()[0], inplace=True)

In [22]:
# Check for missing values
dataset.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary                0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
dtype: int64

#### 4)Outlier Detection and Treatment

In [23]:
# Identify numerical columns
numerical_columns = dataset.select_dtypes(include=[np.number]).columns
print(f"Numerical columns: {numerical_columns}")

Numerical columns: Index(['work_year', 'salary', 'salary_in_usd', 'remote_ratio'], dtype='object')


In [24]:
# Calculate quartiles and IQR for each column
Q1 = dataset[numerical_columns].quantile(0.25)
Q3 = dataset[numerical_columns].quantile(0.75)
IQR = Q3 - Q1


In [25]:
# Define the boundaries to identify outliers for each column
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [26]:
# Handling outliers using NumPy for each column
for col in dataset[numerical_columns]:
    dataset[col] = np.where(dataset[col] < lower_bound[col], lower_bound[col], np.where(dataset[col] > upper_bound[col], upper_bound[col], dataset[col]))

#### 5)Encoding Categorical Variables

In [27]:
from sklearn.preprocessing import LabelEncoder

In [28]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

In [29]:
# Loop through each column in the DataFrame
for column in dataset.columns:
    if dataset[column].dtype == 'object':  
        dataset[column] = label_encoder.fit_transform(dataset[column])

In [30]:
dataset.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2024.0,3,2,3,202730.0,21,202730.0,84,0.0,74,1
4,2024.0,3,2,58,130500.0,21,130500.0,84,0.0,74,1
5,2024.0,3,2,58,96000.0,21,96000.0,84,0.0,74,1
6,2024.0,3,2,121,190000.0,21,190000.0,84,0.0,74,1
7,2024.0,3,2,121,160000.0,21,160000.0,84,0.0,74,1


#### 6)Standardize or Normalize Data

In [31]:
from sklearn.preprocessing import MinMaxScaler

In [32]:
# Initialize the MinMaxScaler
min_max_scaler = MinMaxScaler()

In [33]:
# Fit and transform the numerical columns
dataset = min_max_scaler.fit_transform(dataset)

In [35]:
dataset = pd.DataFrame(dataset)

In [36]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1.0,1.0,0.666667,0.019355,0.606104,0.954545,0.614501,0.965517,0.0,0.973684,0.5
1,1.0,1.0,0.666667,0.374194,0.374139,0.954545,0.378069,0.965517,0.0,0.973684,0.5
2,1.0,1.0,0.666667,0.374194,0.263342,0.954545,0.265139,0.965517,0.0,0.973684,0.5
3,1.0,1.0,0.666667,0.780645,0.565222,0.954545,0.572831,0.965517,0.0,0.973684,0.5
4,1.0,1.0,0.666667,0.780645,0.468877,0.954545,0.474632,0.965517,0.0,0.973684,0.5
