In [None]:
# This notebook runs inference with the trained Random Forest model on new, unseen data.
# The data is supplied in its raw format, exactly as it was before preprocessing (not scaled or encoded).
# The model itself was saved by the training notebook (`P1M2_abyan.ipynb`).

import pandas as pd
import joblib

In [3]:
# Restore the Random Forest model that the training notebook wrote to disk.
# NOTE: joblib artifacts are pickle-based, so only load files from a trusted source.
try:
    best_model = joblib.load('best_model_rf.pkl')
except FileNotFoundError:
    # The artifact is missing; point the reader at the notebook that produces it.
    print("Error: 'best_model_rf.pkl' not found. Ensure the model was saved in the training notebook.")
    raise
except Exception as load_error:
    # Any other failure is reported and then re-raised so the notebook stops here.
    print(f"Error loading model: {load_error}")
    raise
else:
    print("Random Forest model loaded successfully")

Random Forest model loaded successfully


In [4]:
# Assemble the unseen employees in raw form (no scaling or encoding applied).
# Each dict is one employee; the key order of the records sets the column order.
new_data = pd.DataFrame([
    {
        'Age': 35, 'BusinessTravel': 'Travel_Rarely', 'DailyRate': 800,
        'Department': 'Sales', 'DistanceFromHome': 5, 'Education': 3,
        'EducationField': 'Marketing', 'EmployeeCount': 1, 'EmployeeNumber': 2001,
        'EnvironmentSatisfaction': 3, 'Gender': 'Male', 'HourlyRate': 70,
        'JobInvolvement': 3, 'JobLevel': 2, 'JobRole': 'Sales Executive',
        'JobSatisfaction': 4, 'MaritalStatus': 'Married', 'MonthlyIncome': 5000,
        'MonthlyRate': 15000, 'NumCompaniesWorked': 2, 'Over18': 'Y',
        'OverTime': 'No', 'PercentSalaryHike': 15, 'PerformanceRating': 3,
        'RelationshipSatisfaction': 3, 'StandardHours': 80, 'StockOptionLevel': 1,
        'TotalWorkingYears': 10, 'TrainingTimesLastYear': 3, 'WorkLifeBalance': 3,
        'YearsAtCompany': 5, 'YearsInCurrentRole': 3, 'YearsSinceLastPromotion': 1,
        'YearsWithCurrManager': 3,
    },
    {
        'Age': 28, 'BusinessTravel': 'Travel_Frequently', 'DailyRate': 600,
        'Department': 'Research & Development', 'DistanceFromHome': 20, 'Education': 2,
        'EducationField': 'Life Sciences', 'EmployeeCount': 1, 'EmployeeNumber': 2002,
        'EnvironmentSatisfaction': 2, 'Gender': 'Female', 'HourlyRate': 50,
        'JobInvolvement': 2, 'JobLevel': 1, 'JobRole': 'Research Scientist',
        'JobSatisfaction': 3, 'MaritalStatus': 'Single', 'MonthlyIncome': 3000,
        'MonthlyRate': 12000, 'NumCompaniesWorked': 1, 'Over18': 'Y',
        'OverTime': 'Yes', 'PercentSalaryHike': 12, 'PerformanceRating': 3,
        'RelationshipSatisfaction': 2, 'StandardHours': 80, 'StockOptionLevel': 0,
        'TotalWorkingYears': 5, 'TrainingTimesLastYear': 2, 'WorkLifeBalance': 2,
        'YearsAtCompany': 3, 'YearsInCurrentRole': 2, 'YearsSinceLastPromotion': 0,
        'YearsWithCurrManager': 2,
    },
    {
        'Age': 45, 'BusinessTravel': 'Non-Travel', 'DailyRate': 1200,
        'Department': 'Human Resources', 'DistanceFromHome': 2, 'Education': 4,
        'EducationField': 'Human Resources', 'EmployeeCount': 1, 'EmployeeNumber': 2003,
        'EnvironmentSatisfaction': 4, 'Gender': 'Male', 'HourlyRate': 90,
        'JobInvolvement': 3, 'JobLevel': 3, 'JobRole': 'Manager',
        'JobSatisfaction': 4, 'MaritalStatus': 'Divorced', 'MonthlyIncome': 10000,
        'MonthlyRate': 20000, 'NumCompaniesWorked': 5, 'Over18': 'Y',
        'OverTime': 'No', 'PercentSalaryHike': 20, 'PerformanceRating': 4,
        'RelationshipSatisfaction': 4, 'StandardHours': 80, 'StockOptionLevel': 2,
        'TotalWorkingYears': 20, 'TrainingTimesLastYear': 3, 'WorkLifeBalance': 3,
        'YearsAtCompany': 15, 'YearsInCurrentRole': 10, 'YearsSinceLastPromotion': 5,
        'YearsWithCurrManager': 8,
    },
])

# Show the heading and the frame; sep="\n" puts the frame on its own line.
print("New Data for Inference:", new_data, sep="\n")

New Data for Inference:
   Age     BusinessTravel  DailyRate              Department  \
0   35      Travel_Rarely        800                   Sales   
1   28  Travel_Frequently        600  Research & Development   
2   45         Non-Travel       1200         Human Resources   

   DistanceFromHome  Education   EducationField  EmployeeCount  \
0                 5          3        Marketing              1   
1                20          2    Life Sciences              1   
2                 2          4  Human Resources              1   

   EmployeeNumber  EnvironmentSatisfaction  ... RelationshipSatisfaction  \
0            2001                        3  ...                        3   
1            2002                        2  ...                        2   
2            2003                        4  ...                        4   

   StandardHours  StockOptionLevel  TotalWorkingYears TrainingTimesLastYear  \
0             80                 1                 10                 

In [5]:
# Predict attrition for the new data.
# The raw (unscaled, unencoded) frame is handed straight to the model, so the
# loaded object presumably bundles its own preprocessing — TODO confirm against
# the training notebook.
try:
    predictions = best_model.predict(new_data)
except Exception as e:
    print(f"Error during prediction: {e}")
    raise

# Map predictions back to human-readable labels.
# The target encoding used at training time is not visible in this notebook, so
# both plausible encodings are accepted: numeric (0 = 'No', 1 = 'Yes') and the
# raw string classes ('No' / 'Yes'). The previous `pred == 1` check reported
# every other value, including a string 'Yes', as 'No'.
# Any unexpected value is shown verbatim rather than silently reported as 'No'.
attrition_label_map = {0: 'No', 1: 'Yes', 'No': 'No', 'Yes': 'Yes'}
predictions_labels = [attrition_label_map.get(pred, str(pred)) for pred in predictions]

# Display results keyed by employee so each prediction can be traced to its input row.
results = pd.DataFrame({
    'EmployeeNumber': new_data['EmployeeNumber'],
    'Predicted_Attrition': predictions_labels
})
print("\nInference Results:")
print(results)


Inference Results:
   EmployeeNumber Predicted_Attrition
0            2001                  No
1            2002                  No
2            2003                  No
