# Revision

In [73]:
import pandas as pd 
import numpy as np

#### 1)Understand the Data

In [74]:
dataset = pd.read_csv('Dataset salary re.csv')

In [75]:
dataset.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,Unnamed: 10,salary
0,2024.0,SE,FT,AI Engineer,USD,202730,US,0.0,US,M,7.0,202730.0
1,2024.0,SE,FT,AI Engineer,USD,202730,US,0.0,US,M,7.0,202730.0
2,2024.0,SE,FT,AI Engineer,USD,202730,US,0.0,US,M,7.0,202730.0
3,2024.0,SE,FT,AI Engineer,USD,202730,US,0.0,US,M,7.0,202730.0
4,2024.0,SE,FT,Data Engineer,USD,130500,US,0.0,US,M,7.0,130500.0


In [76]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16536 entries, 0 to 16535
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   work_year           16534 non-null  float64
 1   experience_level    16535 non-null  object 
 2   employment_type     16535 non-null  object 
 3   job_title           16536 non-null  object 
 4   salary_currency     16536 non-null  object 
 5   salary_in_usd       16536 non-null  int64  
 6   employee_residence  16536 non-null  object 
 7   remote_ratio        16536 non-null  float64
 8   company_location    16535 non-null  object 
 9   company_size        16536 non-null  object 
 10  Unnamed: 10         1971 non-null   float64
 11  salary              16533 non-null  float64
dtypes: float64(4), int64(1), object(7)
memory usage: 1.5+ MB


#### 2)Remove Unnecessary Data

In [77]:
# Remove duplicate rows
dataset.drop_duplicates(inplace=True)

In [78]:
dataset.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,Unnamed: 10,salary
0,2024.0,SE,FT,AI Engineer,USD,202730,US,0.0,US,M,7.0,202730.0
4,2024.0,SE,FT,Data Engineer,USD,130500,US,0.0,US,M,7.0,130500.0
5,2024.0,SE,FT,Data Engineer,USD,96000,US,0.0,US,M,7.0,96000.0
6,2024.0,SE,FT,Machine Learning Engineer,USD,190000,US,0.0,US,M,7.0,190000.0
7,2024.0,SE,FT,Machine Learning Engineer,USD,160000,US,0.0,US,M,7.0,160000.0


In [79]:
# Calculate the percentage of null values in each column
null_percentage = dataset.isnull().mean() * 100

# Display the result
print(null_percentage)

work_year              0.018882
experience_level       0.009441
employment_type        0.009441
job_title              0.000000
salary_currency        0.000000
salary_in_usd          0.000000
employee_residence     0.000000
remote_ratio           0.000000
company_location       0.009441
company_size           0.000000
Unnamed: 10           84.658233
salary                 0.028323
dtype: float64


In [83]:
dataset = dataset.drop('Unnamed: 10', axis=1)

In [84]:
dataset.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,salary
0,2024.0,SE,FT,AI Engineer,USD,202730,US,0.0,US,M,202730.0
4,2024.0,SE,FT,Data Engineer,USD,130500,US,0.0,US,M,130500.0
5,2024.0,SE,FT,Data Engineer,USD,96000,US,0.0,US,M,96000.0
6,2024.0,SE,FT,Machine Learning Engineer,USD,190000,US,0.0,US,M,190000.0
7,2024.0,SE,FT,Machine Learning Engineer,USD,160000,US,0.0,US,M,160000.0


#### 3)Handle Missing Values

In [85]:
# Check for missing values
dataset.isnull().sum()

work_year             2
experience_level      1
employment_type       1
job_title             0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      1
company_size          0
salary                3
dtype: int64

In [86]:
# Mean imputation for numerical columns
for column in dataset.columns:
    if dataset[column].dtype in ['int64', 'float64']:
        dataset[column].fillna(dataset[column].mean(), inplace=True)

In [87]:
# mode imputation for numerical columns
for column in dataset.columns:
    if dataset[column].dtype in ['object']:
        dataset[column].fillna(dataset[column].mode()[0], inplace=True)

In [88]:
# Check for missing values
dataset.isnull().sum()

work_year             0
experience_level      0
employment_type       0
job_title             0
salary_currency       0
salary_in_usd         0
employee_residence    0
remote_ratio          0
company_location      0
company_size          0
salary                0
dtype: int64

#### 4)Outlier Detection and Treatment

In [89]:
# Identify numerical columns
numerical_columns = dataset.select_dtypes(include=[np.number]).columns
print(f"Numerical columns: {numerical_columns}")

Numerical columns: Index(['work_year', 'salary_in_usd', 'remote_ratio', 'salary'], dtype='object')


In [90]:
# Calculate quartiles and IQR for each column
Q1 = dataset[numerical_columns].quantile(0.25)
Q3 = dataset[numerical_columns].quantile(0.75)
IQR = Q3 - Q1


In [91]:
# Define the boundaries to identify outliers for each column
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [92]:
# Handling outliers using NumPy for each column
for col in dataset[numerical_columns]:
    dataset[col] = np.where(dataset[col] < lower_bound[col], lower_bound[col], np.where(dataset[col] > upper_bound[col], upper_bound[col], dataset[col]))

#### 5)Encoding Categorical Variables

In [93]:
from sklearn.preprocessing import LabelEncoder

In [94]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

In [95]:
# Loop through each column in the DataFrame
for column in dataset.columns:
    if dataset[column].dtype == 'object':  
        dataset[column] = label_encoder.fit_transform(dataset[column])

In [96]:
dataset.head()

Unnamed: 0,work_year,experience_level,employment_type,job_title,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size,salary
0,2024.0,3,2,3,21,202730.0,84,0.0,74,1,202730.0
4,2024.0,3,2,58,21,130500.0,84,0.0,74,1,130500.0
5,2024.0,3,2,58,21,96000.0,84,0.0,74,1,96000.0
6,2024.0,3,2,121,21,190000.0,84,0.0,74,1,190000.0
7,2024.0,3,2,121,21,160000.0,84,0.0,74,1,160000.0


#### 6)Standardize or Normalize Data

In [97]:
from sklearn.preprocessing import MinMaxScaler

In [98]:
# Initialize the MinMaxScaler
min_max_scaler = MinMaxScaler()

In [99]:
# Fit and transform the numerical columns
dataset = min_max_scaler.fit_transform(dataset)

In [100]:
print(dataset)

[[1.         1.         0.66666667 ... 0.97368421 0.5        0.6061044 ]
 [1.         1.         0.66666667 ... 0.97368421 0.5        0.37413852]
 [1.         1.         0.66666667 ... 0.97368421 0.5        0.26334213]
 ...
 [0.         0.         0.66666667 ... 0.97368421 1.         0.29224554]
 [0.         0.         0.         ... 0.97368421 0.         0.27618809]
 [0.         1.         0.66666667 ... 0.5        0.         1.        ]]


#### Split into data and teaget


In [121]:
# split into data and teaget
data_x = dataset[:,:-1]
data_y = dataset[:,-1]

In [122]:
data_x

array([[1.        , 1.        , 0.66666667, ..., 0.        , 0.97368421,
        0.5       ],
       [1.        , 1.        , 0.66666667, ..., 0.        , 0.97368421,
        0.5       ],
       [1.        , 1.        , 0.66666667, ..., 0.        , 0.97368421,
        0.5       ],
       ...,
       [0.        , 0.        , 0.66666667, ..., 0.4       , 0.97368421,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.4       , 0.97368421,
        0.        ],
       [0.        , 1.        , 0.66666667, ..., 0.2       , 0.5       ,
        0.        ]])

In [123]:
data_y

array([0.6061044 , 0.37413852, 0.26334213, ..., 0.29224554, 0.27618809,
       1.        ])

In [120]:
# Feature selection 
from sklearn.feature_selection import SelectPercentile,f_classif

In [124]:
data_x.shape

(10592, 10)

In [125]:
select = SelectPercentile(score_func=f_classif,percentile=70).fit_transform(data_x,data_y)

In [126]:
data_x = select

In [127]:
data_x.shape

(10592, 7)

In [128]:
# split int train and test
from sklearn.model_selection import train_test_split

In [129]:
x_train,x_test,y_train,y_test = train_test_split(data_x,data_y,test_size=0.3,random_state = 88)

In [131]:
# KNN
from sklearn.neighbors import KNeighborsRegressor

In [132]:
knn = KNeighborsRegressor(n_neighbors=5,weights='distance',algorithm='auto').fit(x_train,y_train)

In [134]:
# train accuracy
print("Train Accuracy ",knn.score(x_train,y_train))

Train Accuracy  0.9998596333045258


In [135]:
# Test accuracy
print("Test Accuracy ",knn.score(x_test,y_test))

Test Accuracy  0.9589242539018782


In [136]:
# git y Predict
y_pred = knn.predict(x_test)

In [137]:
# Mean Absolute  Error
from sklearn.metrics import mean_absolute_error as me
print(me(y_test,y_pred))

0.0111904860088567
