In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Reproducibility and split sizes.
# Seeding NumPy's global RNG makes every np.random draw below repeatable.
np.random.seed(42)
number_of_training, number_of_test = 500, 100

# Categorical vocabularies that the synthetic records are sampled from
job_roles = [
    'Software Engineer',
    'Accountant',
    'HR Manager',
    'Sales Executive',
    'Data Analyst',
]
education_levels = ['BSc', 'HND', 'MSc', 'PhD']
locations = [
    'Lagos',
    'Abuja',
    'Kano',
    'Port Harcourt',
    'Ibadan',
]
genders = ['Male', 'Female']

# Base salary (in NGN) per job role; experience and education bonuses are
# added on top of this figure when salaries are generated.
jobrole_base_salary = {
    'Software Engineer': 350_000,
    'Accountant': 250_000,
    'HR Manager': 300_000,
    'Sales Executive': 200_000,
    'Data Analyst': 320_000,
}

# Function to generate dataset
def generate_dataset(n):
    """Return a synthetic payroll DataFrame with ``n`` rows.

    Every feature column is sampled independently from NumPy's global RNG
    (``np.random``), so results are reproducible only when the caller has
    seeded it. The columns are:

    * ``'Job Role'``, ``'Education Level'``, ``'Location'``, ``'Gender'`` --
      drawn from the module-level ``job_roles``, ``education_levels``,
      ``locations`` and ``genders`` lists.
    * ``'Experience Years'`` -- integer in ``[0, 20]``.
    * ``'Age'`` -- integer in ``[20, 59]``.
    * ``'Salary (₦)'`` -- deterministic function of the row:
      ``jobrole_base_salary[role] + 10000 * experience + education bonus``.

    Args:
        n: Number of rows to generate.

    Returns:
        A ``pandas.DataFrame`` with the six feature columns followed by the
        integer ``'Salary (₦)'`` column.

    Raises:
        KeyError: If a sampled job role or education level has no entry in
            the corresponding salary table.
    """
    # The draw order below is part of the behaviour: with a fixed seed,
    # reordering these calls would change every generated value.
    df = pd.DataFrame({
        'Job Role': np.random.choice(job_roles, n),
        'Experience Years': np.random.randint(0, 21, n),
        'Education Level': np.random.choice(education_levels, n),
        'Location': np.random.choice(locations, n),
        'Age': np.random.randint(20, 60, n),
        'Gender': np.random.choice(genders, n),
    })

    # Flat bonus added per education level; built once rather than per row.
    edu_bonus = {'HND': 10000, 'BSc': 20000, 'MSc': 40000, 'PhD': 60000}

    # Column-wise lookups replace the former row-by-row iterrows() loop.
    role_base = df['Job Role'].map(jobrole_base_salary)
    edu_extra = df['Education Level'].map(edu_bonus)

    # Series.map yields NaN for keys absent from the dict; fail loudly with
    # KeyError, as a plain dict lookup would, instead of emitting NaN salaries.
    unmapped = role_base.isna() | edu_extra.isna()
    if unmapped.any():
        offenders = (
            df.loc[unmapped, ['Job Role', 'Education Level']]
            .drop_duplicates()
            .values.tolist()
        )
        raise KeyError(f"no salary rule for (job role, education level): {offenders}")

    # Cast keeps the column integer-typed even when n == 0.
    df['Salary (₦)'] = (
        role_base + df['Experience Years'] * 10000 + edu_extra
    ).astype('int64')
    return df

# Build the two splits. Training is generated first so the seeded random
# draws are consumed in the same order on every run.
train_df = generate_dataset(number_of_training)
test_df = generate_dataset(number_of_test)

# Write both splits to CSV so they can be inspected by hand
for frame, csv_path in ((train_df, "train_payroll.csv"), (test_df, "test_payroll.csv")):
    frame.to_csv(csv_path, index=False)

# Model inputs and the column to predict
features = [
    'Job Role',
    'Experience Years',
    'Education Level',
    'Location',
    'Age',
    'Gender',
]
target = 'Salary (₦)'

# Stack train on top of test so a single encoder fit yields the same
# one-hot columns for both splits.
combined_df = pd.concat([train_df[features], test_df[features]], axis=0)
categorical_part = combined_df.select_dtypes(include='object')

# One-hot encode the categorical columns; drop='first' omits the first
# category of each feature and sparse_output=False returns a dense array.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = encoder.fit_transform(categorical_part)
encoded_feature_names = encoder.get_feature_names_out(categorical_part.columns)

# Assemble the design matrix: numeric columns first, then the one-hot block.
# The index is reset so both pieces align row-for-row on 0..N-1.
numerical_features = combined_df.select_dtypes(exclude='object').reset_index(drop=True)
X_all = pd.concat(
    [numerical_features, pd.DataFrame(encoded_features, columns=encoded_feature_names)],
    axis=1,
)

# Undo the stacking: the first number_of_training rows are the training split
X_train = X_all.iloc[:number_of_training]
X_test = X_all.iloc[number_of_training:]
y_train, y_test = train_df[target], test_df[target]

# Fit the random forest on the training split
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict salaries for the held-out rows
y_pred = model.predict(X_test)

# Report the error in the same unit as the target (NGN)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): ₦{rmse:.2f}")

# Show the first test rows with actual and predicted salary side by side;
# predictions are truncated to whole numbers for display.
preview = test_df.assign(**{'Predicted Salary (₦)': y_pred.astype(int)})
print(preview[features + [target, 'Predicted Salary (₦)']].head())

Root Mean Squared Error (RMSE): ₦11455.46
            Job Role  Experience Years Education Level       Location  Age  \
0         Accountant                 1             PhD           Kano   56   
1       Data Analyst                 4             PhD          Abuja   38   
2       Data Analyst                 2             BSc  Port Harcourt   47   
3         Accountant                 8             MSc         Ibadan   59   
4  Software Engineer                19             HND         Ibadan   39   

   Gender  Salary (₦)  Predicted Salary (₦)  
0    Male      320000                309000  
1  Female      420000                419000  
2  Female      360000                358000  
3    Male      370000                368600  
4  Female      550000                557200  
