In [8]:
from IPython import get_ipython
from IPython.display import display
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import pickle
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import RandomizedSearchCV

class TestModel:
    def __init__(self):
        with open('best_model.pkl', 'rb') as f:
            self.model = pickle.load(f)

    def predict(self, X):
        return self.model.predict(X)

    def predict_proba(self, X):
        return self.model.predict_proba(X)

def production(X_path, y_path):
    model = TestModel()
    df_X = pd.read_csv(X_path)
    df_y = pd.read_csv(y_path)['Left']
    distance_counts = df_X['Distance'].value_counts(dropna=False)
    df_X['Distance_Missing'] = df_X['Distance'].isnull().astype(int)
    df_X['Distance'] = (
        df_X['Distance']
        .fillna('Unknown')
        .replace({'<5mile': '<5miles'})
    )
    distance_map = {
        '<5miles': 1,
        '~10miles': 2,
        '~15miles': 3,
        '~20miles': 4,
        '>30miles': 5,
        'Unknown': 0
    }
    df_X['Distance_Num'] = df_X['Distance'].map(distance_map)
    print(df_X['Distance_Num'].value_counts().sort_index())
    verification = (
        df_X[['Distance', 'Distance_Num']]
        .drop_duplicates()
        .sort_values('Distance_Num')
    )
    def clean_salary(sal):
        if isinstance(sal, str):
            sal = sal.upper().replace('K', '')
        return int(sal) * 1000
    df_X['PreviousSalary'] = df_X['PreviousSalary'].apply(clean_salary)
    df_X['Salary'] = df_X['Salary'].apply(clean_salary)
    numerical_cols = ['YearsWorked', 'TrainingHours', 'NumOfProjects', 'TeamSize', 
                      'WorkSatisfactionScore', 'JobEngagementScore', 'PhysicalActivityScore',
                      'MentalWellbeingScore', 'SelfReview', 'SupervisorReview']
    for col in numerical_cols:
        df_X[col] = pd.to_numeric(df_X[col], errors='coerce')
    def engineer_features(df):
        df['SalaryGrowthPerYear'] = np.where(
            df['YearsWorked'] > 0,
            (df['Salary'] - df['PreviousSalary']) / df['YearsWorked'],
            0
        )
        df['ReviewDiscrepancy'] = abs(df['SelfReview'] - df['SupervisorReview'])
        df['WorkloadScore'] = df['NumOfProjects'] * (5 - df['WorkLifeBalance'])
        df['TenureToProjects'] = np.where(
            df['NumOfProjects'] > 0,
            df['YearsWorked'] / df['NumOfProjects'],
            0
        )
        df['AvgTrainingPerYear'] = df['TrainingHours'] / (df['YearsWorked'] + 1)
        df['Distance'] = df['Distance'].map({
            '<5miles': 1,
            '~5miles': 2,
            '~10miles': 3,
            '~15miles': 4,
            '~20miles': 5,
            '>30miles': 6
        })
        return df
    df_X = engineer_features(df_X)
    from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import classification_report
    X = df_X
    y = df_y
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)
    rf_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42))
    ])
    cv_scores = cross_val_score(rf_pipeline, X_train, y_train, cv=5, scoring='accuracy')
    print("Random Forest CV Accuracy:", cv_scores.mean())
    param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [5, 10, None],
        'classifier__min_samples_split': [2, 5]
    }
    grid_search = GridSearchCV(
        rf_pipeline, param_grid, cv=3, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_rfmodel = grid_search.best_estimator_
    print("Best Parameters:", grid_search.best_params_)
    y_pred = best_rfmodel.predict(X_test)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

production(
    X_path='https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_X_prod.csv',
    y_path='https://raw.githubusercontent.com/msaricaumbc/DS_data/master/ds602/final/employee_departure_dataset_y_prod.csv'
)


Distance_Num
0       90
1     9975
2    29854
3    29983
4    20055
5    10043
Name: count, dtype: int64
Random Forest CV Accuracy: 0.855575
Best Parameters: {'classifier__max_depth': 10, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 200}

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.83      0.88     12809
           1       0.75      0.89      0.81      7191

    accuracy                           0.85     20000
   macro avg       0.84      0.86      0.85     20000
weighted avg       0.86      0.85      0.85     20000

