In [1]:
# imports
import pandas as pd
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [2]:
# model imports
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import recall_score, precision_score, f1_score, auc, roc_auc_score, root_mean_squared_error

In [3]:
data = pd.read_csv('Data/combined_dataset.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,Job Title,Department,Age,Gender,Marital Status,Years of Service,Salary,Exited
0,0,HR Specialist,Human Resources,25,Male,Married,2,537.25,0
1,1,Project Manager,Sales & Marketing,53,Male,Single,8,154.13,0
2,2,Billing Specialist,IT & Software,44,Female,Married,8,368.54,0
3,3,Marketing Analyst,Data Analytics,37,Female,Married,7,269.92,0
4,4,Product Manager,IT & Software,30,Male,Single,4,131.17,0


In [5]:
data.shape

(4320, 9)

In [6]:
data.columns

Index(['Unnamed: 0', 'Job Title', 'Department', 'Age', 'Gender',
       'Marital Status', 'Years of Service', 'Salary', 'Exited'],
      dtype='object')

In [7]:
# drop unwanted column
data.drop(columns=['Unnamed: 0'], inplace=True)

In [8]:
data.head()

Unnamed: 0,Job Title,Department,Age,Gender,Marital Status,Years of Service,Salary,Exited
0,HR Specialist,Human Resources,25,Male,Married,2,537.25,0
1,Project Manager,Sales & Marketing,53,Male,Single,8,154.13,0
2,Billing Specialist,IT & Software,44,Female,Married,8,368.54,0
3,Marketing Analyst,Data Analytics,37,Female,Married,7,269.92,0
4,Product Manager,IT & Software,30,Male,Single,4,131.17,0


In [9]:
# feature segregation
x_data = data.drop('Exited', axis=1)
y_data = data['Exited']

In [10]:
# Create transformers for numerical and categorical features
categorical_features = ['Job Title', 'Department', 'Gender', 'Marital Status']
numerical_features = ['Age', 'Years of Service', 'Salary']

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Apply preprocessing and SMOTE (oversampling technique)
smote = SMOTE(random_state=42)
X_transformed = preprocessor.fit_transform(x_data)
X_resampled, y_resampled = smote.fit_resample(X_transformed, y_data)

In [11]:
# split data
train_x, test_x, train_y, test_y = train_test_split(X_resampled, y_resampled, test_size=0.3, random_state=42)

In [12]:
# model training
# we'll use different models and determine the best one
# based on evaluation metrics

lr_model = LogisticRegression(max_iter=200)
lSVC_model = LinearSVC(max_iter=200)
svc_model = SVC(max_iter=200, kernel='sigmoid')
knn_model = KNeighborsClassifier(n_neighbors=4)

In [13]:
# train models
def train_all_models(train_x, train_y, models=[
    lr_model, svc_model, knn_model, lSVC_model
]):
    for model in models:
        model.fit(train_x, train_y)

train_all_models(train_x, train_y)



In [14]:
# evaluate models on specific metrics
def evaluate_models(test_x, test_y, models_dict={'Logistic Regression': lr_model, 
                                                'Linear SVC': lSVC_model,
                                                'SVC': svc_model, 
                                                'KNN': knn_model}):
    model_names = []
    rmse_scores = []
    recall_scores = []
    precision_scores = []
    f1_scores = []
    roc_auc_scores = []
    
    for name, model in models_dict.items():
        y_pred = model.predict(test_x)
        
        model_names.append(name)
        rmse_scores.append(root_mean_squared_error(test_y, y_pred))
        recall_scores.append(recall_score(test_y, y_pred))
        precision_scores.append(precision_score(test_y, y_pred))
        f1_scores.append(f1_score(test_y, y_pred))
        roc_auc_scores.append(roc_auc_score(test_y, y_pred))
    
    results_df = pd.DataFrame({
        'Model': model_names,
        'RMSE': rmse_scores,
        'Recall': recall_scores,
        'Precision': precision_scores,
        'F1 Score': f1_scores,
        'ROC AUC': roc_auc_scores
    })
    
    return results_df.sort_values('F1 Score', ascending=False)

# run evaluation
results = evaluate_models(test_x, test_y)
print(results)

                 Model      RMSE    Recall  Precision  F1 Score   ROC AUC
3                  KNN  0.439155  0.915604   0.745208  0.821664  0.810253
0  Logistic Regression  0.596418  0.647694   0.629771  0.638607  0.644383
1           Linear SVC  0.598411  0.646712   0.627022  0.636715  0.642043
2                  SVC  0.632456  0.721295   0.569326  0.636364  0.603478


In [15]:
# from the analysis, KNN is the best option to use as our predictive model.
knn_model.predict_proba(test_x[10])

array([[0.75, 0.25]])

In [16]:
knn_model.predict(test_x[10])

array([0])

In [17]:
test_y.values[10]

np.int64(0)

In [18]:
def predict_prob(data: any):
    """Function only takes single entries"""
    res = knn_model.predict_proba(data)
    return f"Probility of exiting: {res[0][1]*100}%"

In [19]:
predict_prob(test_x[20])

'Probility of exiting: 25.0%'

In [20]:
test_y.values[20]

np.int64(0)

In [None]:
# run preprocessor.fit('your csv file for prediction here')
# before passing it as parameters for the functions below
# testData = preprocessor.fit_transform('your loaded csv file')

In [21]:
def predict(data:any):
    """Function only takes single entries"""
    res = knn_model.predict(data)
    if res == 0: return 'Not-Exiting'
    else: return 'Exiting'

In [22]:
predict(test_x[20])

'Not-Exiting'

In [23]:
def exit_count(data:any):
    """Returns the total number of predicted exits."""
    total = 0
    res = knn_model.predict(data)
    for val in res:
        if val == 1:
            total += 1
    return total

In [24]:
exit_count(test_x[11: 30])

7

In [25]:
test_y.values[11: 30]

array([1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0])