# Homework_8 - KNeighborsRegressor for Salary Prediction

Created on Wed Oct  2 11:16:45 2024
@author: hanna.dunska


# Step 1. Setting up the libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_percentage_error
import category_encoders as ce
from sklearn.preprocessing import PowerTransformer
import warnings

# Step 2. Data Loading

In [2]:
# Directory that holds the course datasets. Kept in one place so the notebook
# can be pointed at another machine/location by editing a single line.
DATA_DIR = '/Users/hanna.dunska/Desktop/Machine Learning/MACHINE-LEARNING-NEO/datasets'

# Training split and the (small) validation split used for the final MAPE.
X_train = pd.read_csv(f'{DATA_DIR}/mod_04_hw_train_data.csv')
X_test = pd.read_csv(f'{DATA_DIR}/mod_04_hw_valid_data.csv')

# Column dtypes and non-null counts for both splits.
X_train.info()
X_test.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           249 non-null    object 
 1   Phone_Number   249 non-null    object 
 2   Experience     247 non-null    float64
 3   Qualification  248 non-null    object 
 4   University     249 non-null    object 
 5   Role           246 non-null    object 
 6   Cert           247 non-null    object 
 7   Date_Of_Birth  249 non-null    object 
 8   Salary         249 non-null    int64  
dtypes: float64(1), int64(1), object(7)
memory usage: 17.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Name           7 non-null      object
 1   Phone_Number   7 non-null      object
 2   Experience     7 non-null      int64 
 3   Qualification  7 non-null    

# Step 3. Defining the Target Data

In [3]:
# Detach the 'Salary' target from each feature frame (pop removes the column
# from the frame and returns it as a Series).
y_train, y_test = (frame.pop('Salary') for frame in (X_train, X_test))

# Only the last expression of a cell is rendered, so preview the validation target.
y_test.head()


0    109300
1     84800
2     98900
3    116500
4     75800
Name: Salary, dtype: int64

# Step 4. Exploratory Data Analysis (EDA)

In [4]:
# 4.1 Outliers
X_train['Experience'].describe()
# Decision_1: I did not perform outlier cleaning due to the absence of outliers

# 4.2 Shared feature-engineering helpers (applied identically to X_train and X_test)
bins = [-1, 17, 24, 34, 44, 54, float('inf')]
labels = ['Under 18 years old', '18-24 years old', '25-34 years old', '35-44 years old', '45-54 years old', '55 and older']


def parse_birth_dates(frame):
    """Return a copy of `frame` with 'Date_Of_Birth' parsed as datetime.

    Values that do not match the day/month/year format become NaT
    (errors='coerce') instead of raising.
    """
    out = frame.copy()
    out['Date_Of_Birth'] = pd.to_datetime(
        out['Date_Of_Birth'],
        format='%d/%m/%Y',
        errors='coerce'
    )
    return out


def add_age_group(frame, reference_year):
    """Return a copy of `frame` with 'Age' and a binned 'Age_Group' column added.

    Age is `reference_year` minus the birth year; the group is obtained by
    cutting Age with the notebook-level `bins` / `labels`.
    """
    out = frame.copy()
    out['Age'] = reference_year - out['Date_Of_Birth'].dt.year
    out['Age_Group'] = pd.cut(out['Age'], bins=bins, labels=labels).astype('object')
    return out


def clean_features(frame, target):
    """Drop unused columns, minors and incomplete rows; keep `target` aligned.

    Returns the cleaned feature frame together with the rows of `target`
    that share its index, so features and target can never drift apart.
    """
    out = frame.drop(columns=['Phone_Number', 'Name', 'Age', 'Date_Of_Birth'])
    out = out[out['Age_Group'] != 'Under 18 years old']
    out = out.dropna(how='any')
    return out, target.loc[out.index]


# 4.3 Feature engineering and NA handling in X_train
X_train = parse_birth_dates(X_train)

if X_train['Date_Of_Birth'].isna().sum() > 0:
    print("There are NaT values in 'Date_Of_Birth'.")

# The most recent birth year in the training data serves as the reference
# year for computing ages in both splits.
max_year = X_train['Date_Of_Birth'].dt.year.max()

X_train = add_age_group(X_train, max_year)
print(X_train[['Age', 'Age_Group']].head())

X_train, y_train = clean_features(X_train, y_train)
# Decision_2: I have made a decision to delete the 'Phone_Number', 'Name', 'Age', 'Date_Of_Birth' features as unnecessary ones
# Decision_3: I deleted the values of the 'Under 18 years old' group due to legal concerns.
# Decision_4: I have decided to remove NA values from the dataset and not to impute any data (such as average values or interpolated values)

# 4.4 Feature engineering and NA handling in X_test
# The same steps are applied, and y_test (not y_train) is re-aligned to the
# surviving validation rows.
X_test = add_age_group(parse_birth_dates(X_test), max_year)
X_test, y_test = clean_features(X_test, y_test)

# Features and targets must describe exactly the same rows before modelling.
assert X_train.index.equals(y_train.index), 'X_train / y_train are misaligned'
assert X_test.index.equals(y_test.index), 'X_test / y_test are misaligned'

X_train.info()
X_test.info()


   Age           Age_Group
0   49     45-54 years old
1    8  Under 18 years old
2   19     18-24 years old
3   51     45-54 years old
4   51     45-54 years old
<class 'pandas.core.frame.DataFrame'>
Index: 220 entries, 0 to 248
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Experience     220 non-null    float64
 1   Qualification  220 non-null    object 
 2   University     220 non-null    object 
 3   Role           220 non-null    object 
 4   Cert           220 non-null    object 
 5   Age_Group      220 non-null    object 
dtypes: float64(1), object(5)
memory usage: 12.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Experience     7 non-null      int64 
 1   Qualification  7 non-null      object
 2   University     7 non-null      object
 3   Role          

# Step 5. Discretizing Numerical Features

In [5]:
# Columns that are float64 in the training frame are the ones to discretize.
num_cols = X_train.select_dtypes(include='float64').columns

# Learn three equal-width bins from the training split only.
kbins = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
kbins.fit(X_train[num_cols])

# Replace the raw values with their integer bin index in both splits.
for frame in (X_train, X_test):
    frame[num_cols] = kbins.transform(frame[num_cols]).astype(int)

print(kbins.bin_edges_)

[array([1.        , 2.33333333, 3.66666667, 5.        ])]


# Step 6. Categorical Data Processing and Encoding

In [6]:
# Silence FutureWarning noise raised while the encoders run.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=FutureWarning)

    # Columns treated as ordered vs. unordered categories.
    ordinal_cols = ['Experience', 'Qualification']
    onehot_cols = ['University', 'Role', 'Cert', 'Age_Group']

    # NOTE(review): no explicit `mapping` is passed to the ordinal encoder, so
    # the integer codes may not follow the natural order of these categories
    # (and 'Experience' is already a bin index at this point) - confirm.
    ordinal_encoder = ce.OrdinalEncoder(cols=ordinal_cols)
    onehot_encoder = ce.OneHotEncoder(cols=onehot_cols, use_cat_names=True)

    # Both encoders are fitted on the training split and reused on validation.
    X_train = ordinal_encoder.fit_transform(X_train)
    X_test = ordinal_encoder.transform(X_test)

    X_train = onehot_encoder.fit_transform(X_train, y_train)
    X_test = onehot_encoder.transform(X_test)

# Step 7. Asymmetry Handling

In [7]:
# Configure the transformer to hand back DataFrames rather than bare arrays.
power_transform = PowerTransformer()
power_transform.set_output(transform='pandas')

# Parameters are estimated on the training split and reused for validation.
X_train = power_transform.fit_transform(X_train)
X_test = power_transform.transform(X_test)


# Step 8. Constructing the KNeighbors Regressor

In [8]:
# 10-nearest-neighbours regressor, using all available cores.
knn_reg_mod = KNeighborsRegressor(n_neighbors=10, n_jobs=-1)
knn_reg_mod.fit(X_train, y_train)

# Score the model on the validation split.
knn_reg_preds = knn_reg_mod.predict(X_test)
mape = mean_absolute_percentage_error(y_true=y_test, y_pred=knn_reg_preds)

print(f'Validation MAPE: {mape:.2%}')



Validation MAPE: 4.28%


# Step 9. Conclusion

In the process of EDA, I made the following decisions:

 - Decision 1: I did not perform outlier cleaning for the float data, particularly the "Experience" feature, due to the absence of outliers.
 - Decision 2: I decided to delete the 'Phone_Number', 'Name', 'Age', and 'Date_Of_Birth' features as unnecessary.
 - Decision 3: I deleted the values of the 'Under 18 years old' group due to legal concerns - minors cannot work, and thus there can be no talk of predicting salary.
 - Decision 4: I decided to remove NA values from the dataset and not to impute any data (such as average values or interpolated values) in order not to introduce noise into the data.

I carried out these steps for both the training and test datasets.

Next, I used KBinsDiscretizer(n_bins=3, strategy='uniform'). This discretization helped me generalize the "Experience" variable. Its values range from 1 to 5, so uniform binning divides it into three equal-width bins:

 - 1 to 2.33
 - 2.33 to 3.67
 - 3.67 to 5

Then, I used different category encoders:

 - Ordinal columns: ['Experience', 'Qualification']
 - One-hot columns: ['University', 'Role', 'Cert', 'Age_Group']

For ordinal categorical variables, I used OrdinalEncoder, while for categorical variables without an explicit order, I used OneHotEncoder.

By using PowerTransformer, I reduced the skewness of numerical variables since all variables became numerical after encoding.

Then, I found that in my case, the optimal number of neighbors is 10 for the KNeighborsRegressor.

The obtained validation MAPE of 4.28% meets the task conditions, which require a MAPE on the validation set in the range of 3-5%.