In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
from google.colab import files

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Load the dataset
df = pd.read_csv("gdrive/My Drive/Colab Notebooks/New folder/employee_dataset(9).csv")

# Display the first few rows
print(df.head())


   EmployeeID  Age Department  YearsOfService EducationLevel  \
0           1   50         IT               6         Master   
1           2   36  Marketing              11       Bachelor   
2           3   29         HR              11       Bachelor   
3           4   42         IT              10         Master   
4           5   40         IT              12      Associate   

   NumberOfLeavesTaken  AttendancePercentage           ContractStartDate  \
0                   12                    93  2015-05-27 19:27:50.852538   
1                    7                    90  2022-03-19 19:27:50.852557   
2                    8                    76  2023-08-19 19:27:50.852562   
3                   19                    75  2015-11-19 19:27:50.852566   
4                    1                    77  2019-11-04 19:27:50.852570   

   PerformanceRating                  JobTitle  DaysSinceStart  \
0                  5                 Architech            3584   
1                  3      

In [4]:
# Check for null values
print(df.isnull().sum())

EmployeeID              0
Age                     0
Department              0
YearsOfService          0
EducationLevel          0
NumberOfLeavesTaken     0
AttendancePercentage    0
ContractStartDate       0
PerformanceRating       0
JobTitle                0
DaysSinceStart          0
LeavePercentage         0
PromotionEligible       0
Turnover                0
dtype: int64
0


In [5]:
# Check for duplicates
print(df.duplicated().sum())

0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   EmployeeID            10000 non-null  int64  
 1   Age                   10000 non-null  int64  
 2   Department            10000 non-null  object 
 3   YearsOfService        10000 non-null  int64  
 4   EducationLevel        10000 non-null  object 
 5   NumberOfLeavesTaken   10000 non-null  int64  
 6   AttendancePercentage  10000 non-null  int64  
 7   ContractStartDate     10000 non-null  object 
 8   PerformanceRating     10000 non-null  int64  
 9   JobTitle              10000 non-null  object 
 10  DaysSinceStart        10000 non-null  int64  
 11  LeavePercentage       10000 non-null  float64
 12  PromotionEligible     10000 non-null  bool   
 13  Turnover              10000 non-null  bool   
dtypes: bool(2), float64(1), int64(7), object(4)
memory usage: 957.2+ KB


In [None]:
# Define categorical and numerical features
categorical_features = ['Department', 'EducationLevel', 'JobTitle', 'DaysSinceStart']
numerical_features = ['Age', 'YearsOfService', 'NumberOfLeavesTaken', 'AttendancePercentage', 'PerformanceRating', 'LeavePercentage', 'DaysSinceStart']

# Initialize OneHotEncoder
one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Fit and transform categorical features
encoded_categorical_data = one_hot_encoder.fit_transform(df[categorical_features])

In [None]:
# drop unwanted column
df = df.drop(columns=['EmployeeID','ContractStartDate'])


In [None]:
# Scale numerical features
scaler = StandardScaler()
scaled_numerical_data = scaler.fit_transform(df[numerical_features])

# Combine encoded categorical and scaled numerical data
X = np.hstack([encoded_categorical_data, scaled_numerical_data])

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   10000 non-null  int64  
 1   Department            10000 non-null  object 
 2   YearsOfService        10000 non-null  int64  
 3   EducationLevel        10000 non-null  object 
 4   NumberOfLeavesTaken   10000 non-null  int64  
 5   AttendancePercentage  10000 non-null  int64  
 6   PerformanceRating     10000 non-null  int64  
 7   JobTitle              10000 non-null  object 
 8   DaysSinceStart        10000 non-null  int64  
 9   LeavePercentage       10000 non-null  float64
 10  PromotionEligible     10000 non-null  bool   
 11  Turnover              10000 non-null  bool   
dtypes: bool(2), float64(1), int64(6), object(3)
memory usage: 800.9+ KB


In [None]:
# Define target variables
y_turnover = df['Turnover']
y_promotion = df['PromotionEligible']

# Split the data into training and testing sets
X_train, X_test, y_train_turnover, y_test_turnover = train_test_split(X, y_turnover, test_size=0.2, random_state=42)
X_train, X_test, y_train_promotion, y_test_promotion = train_test_split(X, y_promotion, test_size=0.2, random_state=42)

In [None]:
# Train the Turnover Model
turnover_model = RandomForestClassifier(n_estimators=100, random_state=42)
turnover_model.fit(X_train, y_train_turnover)

# Predictions for Turnover
y_pred_turnover = turnover_model.predict(X_test)

# Evaluation for Turnover
print("Turnover Prediction Accuracy:", accuracy_score(y_test_turnover, y_pred_turnover))
print(classification_report(y_test_turnover, y_pred_turnover))

Turnover Prediction Accuracy: 0.9445
              precision    recall  f1-score   support

       False       0.91      1.00      0.95      1068
        True       1.00      0.88      0.94       932

    accuracy                           0.94      2000
   macro avg       0.95      0.94      0.94      2000
weighted avg       0.95      0.94      0.94      2000



In [None]:
# Train the Promotion Model
promotion_model = RandomForestClassifier(n_estimators=100, random_state=42)
promotion_model.fit(X_train, y_train_promotion)

# Predictions for Promotion
y_pred_promotion = promotion_model.predict(X_test)

# Evaluation for Promotion
print("Promotion Prediction Accuracy:", accuracy_score(y_test_promotion, y_pred_promotion))
print(classification_report(y_test_promotion, y_pred_promotion))

Promotion Prediction Accuracy: 0.9055
              precision    recall  f1-score   support

       False       0.91      0.95      0.93      1330
        True       0.89      0.82      0.85       670

    accuracy                           0.91      2000
   macro avg       0.90      0.88      0.89      2000
weighted avg       0.90      0.91      0.90      2000



In [None]:
# Save the models and preprocessing objects
joblib.dump(turnover_model, 'turnover_model.pkl')
files.download('turnover_model.pkl')
joblib.dump(promotion_model, 'promotion_model.pkl')
files.download('promotion_model.pkl')
joblib.dump(scaler, 'scaler.pkl')
files.download('scaler.pkl')
joblib.dump(one_hot_encoder, 'one_hot_encoder.pkl')
files.download('one_hot_encoder.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>