In [19]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

# Load the training data (path is relative to the notebook's working directory).
train_data = pd.read_csv('../data/train.csv')

# Drop exact duplicate rows before splitting, so the same row cannot end up
# in both the training and the validation set.
train_data = train_data.drop_duplicates()

# Encode salary levels manually as ordered integers.
salary_mapping = {"low": 0, "medium": 1, "high": 2}

# Series.map() turns every label that is not a key of the mapping into NaN,
# and the mean imputer applied later in the pipeline would then hide the
# problem. Fail fast on unexpected labels instead; values that are already
# missing are left alone and stay NaN after the mapping.
unexpected_salary = train_data.loc[
    train_data["salary"].notna()
    & ~train_data["salary"].isin(list(salary_mapping)),
    "salary",
].unique()
if len(unexpected_salary) > 0:
    raise ValueError(
        "Unexpected salary values not covered by salary_mapping: "
        f"{sorted(map(str, unexpected_salary))}"
    )
train_data["salary"] = train_data["salary"].map(salary_mapping)

# Separate the features from the target column "left".
X = train_data.drop("left", axis=1)
y = train_data["left"].copy()

# Hold out 20% for validation; stratify on the target so both splits keep the
# same class proportions, and fix the seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Split the training columns by dtype: numeric columns feed the numerical
# pipeline, object-dtype columns feed the categorical pipeline.
numeric_part = X_train.select_dtypes(include=[np.number])
object_part = X_train.select_dtypes(include=["object"])
num_features = numeric_part.columns
cat_features = object_part.columns

# Show which columns ended up in each group.
print(f"Numerical Features: {list(num_features)}")
print(f"Categorical Features: {list(cat_features)}")

# Persist the four splits as CSV files (without the index column) under ./data.
# NOTE(review): the input is read from '../data' while the splits are written
# to './data' relative to the working directory — confirm this is intended.
os.makedirs("data", exist_ok=True)
split_outputs = {
    "data/X_train.csv": X_train,
    "data/X_val.csv": X_val,
    "data/y_train.csv": y_train,
    "data/y_val.csv": y_val,
}
for csv_path, split in split_outputs.items():
    split.to_csv(csv_path, index=False)

Numerical Features: ['satisfaction_level', 'last_evaluation', 'number_project', 'average_montly_hours', 'time_spend_company', 'Work_accident', 'promotion_last_5years', 'Departments', 'salary', 'work_hours_per_project']
Categorical Features: []


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

# Numerical columns: fill missing values with the column mean, then standardise.
num_steps = [
    ("num_imputer", SimpleImputer(strategy="mean")),
    ("std_scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)

# Categorical columns: fill missing values with the most frequent value, then
# encode each category as an integer.
cat_steps = [
    ("cat_imputer", SimpleImputer(strategy="most_frequent")),
    ("ordinal_encoder", OrdinalEncoder()),
]
cat_pipeline = Pipeline(steps=cat_steps)

# Route each column group to its own pipeline. Columns that appear in neither
# list are not passed through (ColumnTransformer's default remainder is "drop").
pre_processing_pipeline = ColumnTransformer(
    transformers=[
        ("num_pipe", num_pipeline, num_features),
        ("cat_pipe", cat_pipeline, cat_features),
    ]
)

print(pre_processing_pipeline)

ColumnTransformer(transformers=[('num_pipe',
                                 Pipeline(steps=[('num_imputer',
                                                  SimpleImputer()),
                                                 ('std_scaler',
                                                  StandardScaler())]),
                                 Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'Departments', 'salary',
       'work_hours_per_project'],
      dtype='object')),
                                ('cat_pipe',
                                 Pipeline(steps=[('cat_imputer',
                                                  SimpleImputer(strategy='most_frequent')),
                                                 ('ordinal_encoder',
                                                  OrdinalEncoder())]),
                                 Index([], dtype='object'))])

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Chain preprocessing and the classifier into one estimator, so the exact same
# transformations are applied at fit time and at predict time.
classifier = RandomForestClassifier(n_estimators=120, random_state=42)
model_pipeline = Pipeline(
    steps=[
        ("pre_processing", pre_processing_pipeline),
        ("model", classifier),
    ]
)

print(model_pipeline)

# Fit on the training split; `model` refers to the same fitted pipeline object.
model_pipeline.fit(X_train, y_train)
model = model_pipeline

Pipeline(steps=[('pre_processing',
                 ColumnTransformer(transformers=[('num_pipe',
                                                  Pipeline(steps=[('num_imputer',
                                                                   SimpleImputer()),
                                                                  ('std_scaler',
                                                                   StandardScaler())]),
                                                  Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'Work_accident',
       'promotion_last_5years', 'Departments', 'salary',
       'work_hours_per_project'],
      dtype='object')),
                                                 ('cat_pipe',
                                                  Pipeline(steps=[('cat_imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
             



In [23]:
from sklearn.metrics import accuracy_score

# Run the fitted pipeline on the held-out validation features.
y_pred = model.predict(X_val)

# Report the fraction of validation rows that were classified correctly.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.9833




In [24]:
import joblib

# Serialise the fitted pipeline (preprocessing + classifier) to disk.
# `os` comes from the import in the first cell of the notebook.
model_dir = "models"
model_path = "models/hr_model_pipeline.pkl"
os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, model_path)

print("Model saved successfully!")

Model saved successfully!
