# Download The Required Libraries

In [None]:
!pip install xgboost
!pip install sklearn

# Import Libraries

In [None]:
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split , RandomizedSearchCV
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.metrics import mean_squared_error as MSE

# Load Dataset

In [None]:
# Raw CSV with the employee records, fetched over HTTPS from GitHub.
# (The closing quote now matches the opening one; the mismatched pair was a SyntaxError.)
url='https://raw.githubusercontent.com/Asalghaani/Employee_analytics/main/Data/employee_records.csv'
# The first CSV column is used as the row index instead of being kept as a feature.
dataset=pd.read_csv(url,index_col=0)
dataset.head()

Unnamed: 0_level_0,Employee_Name,Age,Country,Department,Position,Salary,Joining_Date
Employee_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Daniel Taylor,25,UK,HR,Analyst,142278.32,2023-06-04
2,Ethan Brown,44,India,Marketing,Executive,98549.2,2018-01-13
3,Sophia Martinez,51,Japan,Finance,Developer,85565.84,2015-04-30
4,Ethan Martinez,47,Germany,Support,Analyst,34513.67,2015-06-17
5,Mia Brown,32,Australia,Support,Consultant,45339.72,2019-02-22


# Preprocessing On Dataset

In [None]:
# Display the (rows, columns) tuple of the loaded frame as the cell output.
dataset.shape

(30000, 7)

## Encoding Data

In [None]:
# Remove the Employee_Name column; it is not among the columns selected as model inputs later.
dataset=dataset.drop(columns=['Employee_Name'])

# Text columns that are replaced by ordinal codes.
to_encoding_columns=['Country','Department','Position']

# Categories not seen during fit are mapped to -1 by transform() instead of raising.
encoder=OrdinalEncoder(unknown_value=-1,handle_unknown='use_encoded_value')
encoded_values=encoder.fit_transform(dataset[to_encoding_columns])
dataset[to_encoding_columns]=encoded_values
dataset.head()

Unnamed: 0_level_0,Age,Country,Department,Position,Salary,Joining_Date
Employee_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,25,8.0,2.0,0.0,142278.32,2023-06-04
2,44,5.0,3.0,4.0,98549.2,2018-01-13
3,51,6.0,1.0,3.0,85565.84,2015-04-30
4,47,4.0,5.0,0.0,34513.67,2015-06-17
5,32,0.0,5.0,2.0,45339.72,2019-02-22


## Scale Data

In [None]:
# Model inputs and regression target.
x=dataset[['Age','Country','Department','Position']]
y=dataset['Salary']

# One scaler per array: refitting a single StandardScaler on the target throws
# away the feature statistics, so new inputs could never be standardized the
# same way as the training features.
feature_scaler=StandardScaler()
x=feature_scaler.fit_transform(x)

# `scaler` ends up holding the Salary statistics, exactly as before, so code that
# calls scaler.inverse_transform on model outputs keeps working unchanged.
scaler=StandardScaler()
y=scaler.fit_transform(y.values.reshape(-1,1))

# NOTE(review): both scalers are fitted on the full dataset before the
# train/validation/test split, so held-out statistics leak into preprocessing.

## Split into training, testing and validation data

In [None]:
# Step 1: hold out 20% of all rows as the test set.
x_rest,x_test,y_rest,y_test=train_test_split(x,y,random_state=42,test_size=0.2)
# Step 2: reserve 20% of the remaining rows as the validation set.
x_train,x_val,y_train,y_val=train_test_split(x_rest,y_rest,random_state=42,test_size=0.2)

## Data Shape

In [None]:
def shape(parametr):
    """Return the ``shape`` attribute of ``parametr`` (e.g. a NumPy array or DataFrame)."""
    dimensions = parametr.shape
    return dimensions

In [None]:
# Print the dimensions of every split in train / validation / test order.
splits={
    "x_train":x_train,
    "y_train":y_train,
    "x_val":x_val,
    "y_val":y_val,
    "x_test":x_test,
    "y_test":y_test,
}
for split_name,split_array in splits.items():
    print(f"{split_name}: ",shape(split_array))

# Load & Train Model

In [None]:
# Base regressor; the hyperparameter search varies the remaining settings.
Model=xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    tree_method='hist',
    device='cuda',  # NOTE(review): assumes a CUDA-capable runtime is available — confirm
    early_stopping_rounds=10,  # relies on an eval_set being supplied when fit() is called
    random_state=42,
)

In [None]:
# Discrete candidate values for each hyperparameter explored by the search.
param_grid=dict(
    n_estimators=[100,150,200],
    learning_rate=[0.01,0.02,0.05],
    max_depth=[2,3],
    min_child_weight=[1,3,5],
    gamma=[0,0.1,0.2],
    subsample=[0.7,0.8,0.9],
    colsample_bytree=[0.7,0.8,0.9],
    reg_alpha=[2,3,5],
    reg_lambda=[0,0.1,0.5],
)

# Randomized search: 100 sampled configurations, each evaluated with 10-fold CV.
search_param=RandomizedSearchCV(
    Model,
    param_grid,
    n_iter=100,
    cv=10,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

# Extra fit() arguments are forwarded to the regressor: the validation split is
# the evaluation set, and progress is logged every 10 boosting rounds.
fit_options={'eval_set':[(x_val,y_val)],'verbose':10}
search_param.fit(x_train,y_train,**fit_options)

# No `scoring` was passed, so best_score_ is the estimator's default score
# (R^2 for regressors), not the RMSE used as eval_metric.
print(f"best hyperparameters: {search_param.best_params_}")
print(f"best score: {search_param.best_score_}")

# Evaluate the refitted best estimator on the test split.
# The target was standardized earlier, so this MSE is in standardized units.
best_model=search_param.best_estimator_
predict=best_model.predict(x_test)
loss=MSE(y_test,predict)

print(f"loss: {loss}")

Fitting 10 folds for each of 100 candidates, totalling 1000 fits
[0]	validation_0-rmse:1.00550
[10]	validation_0-rmse:1.00543
[20]	validation_0-rmse:1.00542
[30]	validation_0-rmse:1.00542
[31]	validation_0-rmse:1.00541
best hyperparameters: {'subsample': 0.8, 'reg_lambda': 0, 'reg_alpha': 5, 'n_estimators': 150, 'min_child_weight': 1, 'max_depth': 3, 'learning_rate': 0.02, 'gamma': 0, 'colsample_bytree': 0.7}
best score: 0.0001524380841979145
loss: 1.0084304016417998


In [None]:
# Interactive prediction: read one employee profile, encode and standardize it
# the same way as the training data, then report the salary in original units.
print("""Help-Text:
Be careful when entering information.
The age should be in numbers without decimals.
The first letter of the information must be capitalized, and if you use abbreviations, all letters should be capitalized.""")


age=int(input("Please Enter Your Age: "))
# Strip stray whitespace so " India" is not treated as an unknown category.
country=input("Please Enter Your Country: ").strip()
department=input("Please Enter Your Department: ").strip()
position=input("Please Enter Your Position: ").strip()

user_input=pd.DataFrame([[country,department,position]],columns=to_encoding_columns)
encoded_input=encoder.transform(user_input)

# The encoder maps categories it did not see during fitting to -1; tell the
# user rather than silently predicting from a meaningless code.
unknown_columns=[column for column,code in zip(to_encoding_columns,encoded_input[0]) if code==-1]
if unknown_columns:
    print(f"Warning: unrecognised value for {', '.join(unknown_columns)}; the prediction may be unreliable.")

feature_columns=['Age','Country','Department','Position']
raw_features=pd.DataFrame([[age,*encoded_input[0]]],columns=feature_columns)

# The model was trained on standardized features, so the input must be
# standardized too. `scaler` holds the Salary statistics (it is what
# inverse_transform below relies on), so the feature statistics are rebuilt
# from the encoded dataset the training features were taken from.
input_scaler=StandardScaler().fit(dataset[feature_columns])
# transform() returns a 2-D (1, n_features) array, the layout predict() expects.
data_to_array=input_scaler.transform(raw_features)

pred_salary=best_model.predict(data_to_array)
# Convert the standardized prediction back to salary units.
pred_salary=scaler.inverse_transform(pred_salary.reshape(-1,1))
print(f"Predicted Salary: {pred_salary[0][0]}")

Help-Text:
Be careful when entering information.
The age should be in numbers without decimals.
The first letter of the information must be capitalized, and if you use abbreviations, all letters should be capitalized.
Please Enter Your Age: 41
Please Enter Your Country: India
Please Enter Your Department: Marketing
Please Enter Your Position: Executive
Predicted Salary: 91128.078125
