### Code by Denis Loechel as part of a master's thesis on synthetic data generation

# Importing libraries

In [15]:
import pandas as pd
import numpy as np
from tabulate import tabulate
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score
from datetime import datetime
import warnings

In [16]:
# Ignoring warnings 
warnings.filterwarnings('ignore')

# Importing dataset and evaluating the dataset

### Model Explanation

#### In the model selection below, we use the training portion of our HR dataset to classify whether or not a future employee will be terminated (`Termd`), based on the other variables in the dataset such as salary, year of termination, special projects count, etc.

# 1. Original Dataset

In [36]:
# Load the original HR dataset from a CSV file (relative path)
data = pd.read_csv(filepath_or_buffer="20230308_Updated_HR_dataset.csv")

In [37]:
# Display the column labels of the loaded dataset
data.columns

Index(['EmpID', 'MarriedID', 'GenderID', 'DeptID', 'Salary', 'Termd',
       'PositionID', 'CitizenDesc', 'ManagerID', 'EmpSatisfaction',
       'SpecialProjectsCount', 'DaysLateLast30', 'Absences', 'BirthYear',
       'YearofTermination', 'YearofHire', 'YearofPerformanceReview',
       'YearsinCompany'],
      dtype='object')

In [38]:
# Take the label of the column at position 5 as the target name, then echo it
col_name = data.columns[5]
col_name

'Termd'

## Model: Train_test_split and scaling

In [39]:
# Separate the target series from the remaining feature columns.
# NOTE(review): every other column stays in X, including EmpID and
# YearofTermination - verify that these do not leak the target.
y = data[col_name]                 # target label
X = data.drop(columns=[col_name])  # features

In [40]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [98]:
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

## Combined Classification Model Test

In [99]:
# Classifiers to compare, all with default hyperparameters.
# DT and RF are randomised estimators, so they are seeded to make the scores
# repeatable across kernel restarts (the CV splitter below is already seeded).
model_dict = {
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# Evaluate each model using KFold cross-validation
results = []  # one [accuracy, micro-F1, weighted-F1] row per model
names = []    # model keys, in the same order as `results`
# Note: for single-label classification, micro-averaged F1 equals accuracy,
# so the "F1-Score" column mirrors the accuracy column.
scoring = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_micro',
    'weighted_f1': 'f1_weighted'
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Only the training split is cross-validated; X_test / y_test are not used here.
for name, model in model_dict.items():
    # NOTE(review): the pipeline wraps the bare estimator - no scaler step is
    # included. Confirm that unscaled features are intended for LR and SVM.
    pipeline = make_pipeline(model)
    cv_results = cross_validate(pipeline, X_train, y_train, cv=kfold, scoring=scoring)
    accuracy = cv_results["test_accuracy"].mean()
    # Stored as `f1_micro` (not `f1_score`) so the `f1_score` function imported
    # from sklearn.metrics is not overwritten by a float.
    f1_micro = cv_results["test_f1_score"].mean()
    f1_weighted = cv_results["test_weighted_f1"].mean()

    results.append([accuracy, f1_micro, f1_weighted])
    names.append(name)
    print(f"{name}: Accuracy: {accuracy:.2%}, F1-Score: {f1_micro:.3f}, Weighted F1-Score: {f1_weighted:.3f}")

LR: Accuracy: 100.00%, F1-Score: 1.000, Weighted F1-Score: 1.000
DT: Accuracy: 99.60%, F1-Score: 0.996, Weighted F1-Score: 0.996
RF: Accuracy: 100.00%, F1-Score: 1.000, Weighted F1-Score: 1.000
SVM: Accuracy: 66.93%, F1-Score: 0.669, Weighted F1-Score: 0.539


# 2. CTGAN Dataset (500 epochs)

In [100]:
# Load the CTGAN synthetic dataset (500 epochs) from a CSV file (relative path)
data = pd.read_csv(filepath_or_buffer="synthetic_dataset_CTGAN_epochs500.csv")

In [101]:
# Take the label of the column at position 5 as the target name, then echo it
col_name = data.columns[5]
col_name

'Termd'

## Model: Train_test_split and scaling

In [102]:
# Separate the target series from the remaining feature columns
y = data[col_name]                 # target label
X = data.drop(columns=[col_name])  # features

In [103]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [104]:
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

## Combined Classification Model Test

In [105]:
# Classifiers to compare, all with default hyperparameters.
# DT and RF are randomised estimators, so they are seeded to make the scores
# repeatable across kernel restarts (the CV splitter below is already seeded).
model_dict = {
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# Evaluate each model using KFold cross-validation
results = []  # one [accuracy, micro-F1, weighted-F1] row per model
names = []    # model keys, in the same order as `results`
# Note: for single-label classification, micro-averaged F1 equals accuracy,
# so the "F1-Score" column mirrors the accuracy column.
scoring = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_micro',
    'weighted_f1': 'f1_weighted'
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Only the training split is cross-validated; X_test / y_test are not used here.
for name, model in model_dict.items():
    # NOTE(review): the pipeline wraps the bare estimator - no scaler step is
    # included. Confirm that unscaled features are intended for LR and SVM.
    pipeline = make_pipeline(model)
    cv_results = cross_validate(pipeline, X_train, y_train, cv=kfold, scoring=scoring)
    accuracy = cv_results["test_accuracy"].mean()
    # Stored as `f1_micro` (not `f1_score`) so the `f1_score` function imported
    # from sklearn.metrics is not overwritten by a float.
    f1_micro = cv_results["test_f1_score"].mean()
    f1_weighted = cv_results["test_weighted_f1"].mean()

    results.append([accuracy, f1_micro, f1_weighted])
    names.append(name)
    print(f"{name}: Accuracy: {accuracy:.2%}, F1-Score: {f1_micro:.3f}, Weighted F1-Score: {f1_weighted:.3f}")

LR: Accuracy: 91.33%, F1-Score: 0.913, Weighted F1-Score: 0.872
DT: Accuracy: 82.04%, F1-Score: 0.820, Weighted F1-Score: 0.831
RF: Accuracy: 91.33%, F1-Score: 0.913, Weighted F1-Score: 0.872
SVM: Accuracy: 91.33%, F1-Score: 0.913, Weighted F1-Score: 0.872


# 3. CTGAN Dataset (1000 epochs)

In [142]:
# Load the CTGAN synthetic dataset (1000 epochs) from a CSV file (relative path)
data = pd.read_csv(filepath_or_buffer="synthetic_dataset_CTGAN_epochs1000.csv")

In [143]:
# Take the label of the column at position 5 as the target name, then echo it
col_name = data.columns[5]
col_name

'Termd'

## Model: Train_test_split and scaling

In [144]:
# Separate the target series from the remaining feature columns
y = data[col_name]                 # target label
X = data.drop(columns=[col_name])  # features

In [145]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [146]:
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

## Combined Classification Model Test

In [147]:
# Classifiers to compare, all with default hyperparameters.
# DT and RF are randomised estimators, so they are seeded to make the scores
# repeatable across kernel restarts (the CV splitter below is already seeded).
model_dict = {
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# Evaluate each model using KFold cross-validation
results = []  # one [accuracy, micro-F1, weighted-F1] row per model
names = []    # model keys, in the same order as `results`
# Note: for single-label classification, micro-averaged F1 equals accuracy,
# so the "F1-Score" column mirrors the accuracy column.
scoring = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_micro',
    'weighted_f1': 'f1_weighted'
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Only the training split is cross-validated; X_test / y_test are not used here.
for name, model in model_dict.items():
    # NOTE(review): the pipeline wraps the bare estimator - no scaler step is
    # included. Confirm that unscaled features are intended for LR and SVM.
    pipeline = make_pipeline(model)
    cv_results = cross_validate(pipeline, X_train, y_train, cv=kfold, scoring=scoring)
    accuracy = cv_results["test_accuracy"].mean()
    # Stored as `f1_micro` (not `f1_score`) so the `f1_score` function imported
    # from sklearn.metrics is not overwritten by a float.
    f1_micro = cv_results["test_f1_score"].mean()
    f1_weighted = cv_results["test_weighted_f1"].mean()

    results.append([accuracy, f1_micro, f1_weighted])
    names.append(name)
    print(f"{name}: Accuracy: {accuracy:.2%}, F1-Score: {f1_micro:.3f}, Weighted F1-Score: {f1_weighted:.3f}")

LR: Accuracy: 66.43%, F1-Score: 0.664, Weighted F1-Score: 0.530
DT: Accuracy: 55.55%, F1-Score: 0.556, Weighted F1-Score: 0.558
RF: Accuracy: 65.64%, F1-Score: 0.656, Weighted F1-Score: 0.542
SVM: Accuracy: 66.38%, F1-Score: 0.664, Weighted F1-Score: 0.530


# 4. CTGAN Dataset (1500 epochs)

In [112]:
# Load the CTGAN synthetic dataset (1500 epochs) from a CSV file (relative path)
data = pd.read_csv(filepath_or_buffer="synthetic_dataset_CTGAN_epochs1500.csv")

In [113]:
# Take the label of the column at position 5 as the target name, then echo it
col_name = data.columns[5]
col_name

'Termd'

## Model: Train_test_split and scaling

In [114]:
# Separate the target series from the remaining feature columns
y = data[col_name]                 # target label
X = data.drop(columns=[col_name])  # features

In [115]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [116]:
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

## Combined Classification Model Test

In [117]:
# Classifiers to compare, all with default hyperparameters.
# DT and RF are randomised estimators, so they are seeded to make the scores
# repeatable across kernel restarts (the CV splitter below is already seeded).
model_dict = {
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# Evaluate each model using KFold cross-validation
results = []  # one [accuracy, micro-F1, weighted-F1] row per model
names = []    # model keys, in the same order as `results`
# Note: for single-label classification, micro-averaged F1 equals accuracy,
# so the "F1-Score" column mirrors the accuracy column.
scoring = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_micro',
    'weighted_f1': 'f1_weighted'
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Only the training split is cross-validated; X_test / y_test are not used here.
for name, model in model_dict.items():
    # NOTE(review): the pipeline wraps the bare estimator - no scaler step is
    # included. Confirm that unscaled features are intended for LR and SVM.
    pipeline = make_pipeline(model)
    cv_results = cross_validate(pipeline, X_train, y_train, cv=kfold, scoring=scoring)
    accuracy = cv_results["test_accuracy"].mean()
    # Stored as `f1_micro` (not `f1_score`) so the `f1_score` function imported
    # from sklearn.metrics is not overwritten by a float.
    f1_micro = cv_results["test_f1_score"].mean()
    f1_weighted = cv_results["test_weighted_f1"].mean()

    results.append([accuracy, f1_micro, f1_weighted])
    names.append(name)
    print(f"{name}: Accuracy: {accuracy:.2%}, F1-Score: {f1_micro:.3f}, Weighted F1-Score: {f1_weighted:.3f}")

LR: Accuracy: 85.50%, F1-Score: 0.855, Weighted F1-Score: 0.788
DT: Accuracy: 73.02%, F1-Score: 0.730, Weighted F1-Score: 0.741
RF: Accuracy: 85.50%, F1-Score: 0.855, Weighted F1-Score: 0.788
SVM: Accuracy: 85.50%, F1-Score: 0.855, Weighted F1-Score: 0.788


# 5. CTGAN Dataset (2000 epochs)

In [118]:
# Load the CTGAN synthetic dataset (2000 epochs) from a CSV file (relative path)
data = pd.read_csv(filepath_or_buffer="synthetic_dataset_CTGAN_epochs2000.csv")

In [119]:
# Take the label of the column at position 5 as the target name, then echo it
col_name = data.columns[5]
col_name

'Termd'

## Model: Train_test_split and scaling

In [120]:
# Separate the target series from the remaining feature columns
y = data[col_name]                 # target label
X = data.drop(columns=[col_name])  # features

In [121]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [122]:
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

## Combined Classification Model Test

In [123]:
# Classifiers to compare, all with default hyperparameters.
# DT and RF are randomised estimators, so they are seeded to make the scores
# repeatable across kernel restarts (the CV splitter below is already seeded).
model_dict = {
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# Evaluate each model using KFold cross-validation
results = []  # one [accuracy, micro-F1, weighted-F1] row per model
names = []    # model keys, in the same order as `results`
# Note: for single-label classification, micro-averaged F1 equals accuracy,
# so the "F1-Score" column mirrors the accuracy column.
scoring = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_micro',
    'weighted_f1': 'f1_weighted'
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Only the training split is cross-validated; X_test / y_test are not used here.
for name, model in model_dict.items():
    # NOTE(review): the pipeline wraps the bare estimator - no scaler step is
    # included. Confirm that unscaled features are intended for LR and SVM.
    pipeline = make_pipeline(model)
    cv_results = cross_validate(pipeline, X_train, y_train, cv=kfold, scoring=scoring)
    accuracy = cv_results["test_accuracy"].mean()
    # Stored as `f1_micro` (not `f1_score`) so the `f1_score` function imported
    # from sklearn.metrics is not overwritten by a float.
    f1_micro = cv_results["test_f1_score"].mean()
    f1_weighted = cv_results["test_weighted_f1"].mean()

    results.append([accuracy, f1_micro, f1_weighted])
    names.append(name)
    print(f"{name}: Accuracy: {accuracy:.2%}, F1-Score: {f1_micro:.3f}, Weighted F1-Score: {f1_weighted:.3f}")

LR: Accuracy: 75.37%, F1-Score: 0.754, Weighted F1-Score: 0.648
DT: Accuracy: 61.57%, F1-Score: 0.616, Weighted F1-Score: 0.621
RF: Accuracy: 75.32%, F1-Score: 0.753, Weighted F1-Score: 0.649
SVM: Accuracy: 75.37%, F1-Score: 0.754, Weighted F1-Score: 0.648


# 6. CopulaGAN (500 epochs)

In [184]:
# Load the CopulaGAN synthetic dataset (500 epochs) from a CSV file (relative path)
data = pd.read_csv(filepath_or_buffer="synthetic_dataset_CopulaGANepochs500.csv")

In [185]:
# Take the label of the column at position 5 as the target name, then echo it
col_name = data.columns[5]
col_name

'Termd'

## Model: Train_test_split and scaling

In [186]:
# Separate the target series from the remaining feature columns
y = data[col_name]                 # target label
X = data.drop(columns=[col_name])  # features

In [187]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [188]:
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

## Combined Classification Model Test

In [189]:
# Classifiers to compare, all with default hyperparameters.
# DT and RF are randomised estimators, so they are seeded to make the scores
# repeatable across kernel restarts (the CV splitter below is already seeded).
model_dict = {
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# Evaluate each model using KFold cross-validation
results = []  # one [accuracy, micro-F1, weighted-F1] row per model
names = []    # model keys, in the same order as `results`
# Note: for single-label classification, micro-averaged F1 equals accuracy,
# so the "F1-Score" column mirrors the accuracy column.
scoring = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_micro',
    'weighted_f1': 'f1_weighted'
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Only the training split is cross-validated; X_test / y_test are not used here.
for name, model in model_dict.items():
    # NOTE(review): the pipeline wraps the bare estimator - no scaler step is
    # included. Confirm that unscaled features are intended for LR and SVM.
    pipeline = make_pipeline(model)
    cv_results = cross_validate(pipeline, X_train, y_train, cv=kfold, scoring=scoring)
    accuracy = cv_results["test_accuracy"].mean()
    # Stored as `f1_micro` (not `f1_score`) so the `f1_score` function imported
    # from sklearn.metrics is not overwritten by a float.
    f1_micro = cv_results["test_f1_score"].mean()
    f1_weighted = cv_results["test_weighted_f1"].mean()

    results.append([accuracy, f1_micro, f1_weighted])
    names.append(name)
    print(f"{name}: Accuracy: {accuracy:.2%}, F1-Score: {f1_micro:.3f}, Weighted F1-Score: {f1_weighted:.3f}")

LR: Accuracy: 82.58%, F1-Score: 0.826, Weighted F1-Score: 0.747
DT: Accuracy: 69.38%, F1-Score: 0.694, Weighted F1-Score: 0.704
RF: Accuracy: 82.58%, F1-Score: 0.826, Weighted F1-Score: 0.747
SVM: Accuracy: 82.58%, F1-Score: 0.826, Weighted F1-Score: 0.747


# 7. CopulaGAN (1000 epochs)

In [22]:
# Load the CopulaGAN synthetic dataset (1000 epochs) from a CSV file (relative path)
data = pd.read_csv(filepath_or_buffer="synthetic_dataset_CopulaGANepochs1000.csv")

In [23]:
# Take the label of the column at position 5 as the target name, then echo it
col_name = data.columns[5]
col_name

'Termd'

## Model: Train_test_split and scaling

In [24]:
# Separate the target series from the remaining feature columns
y = data[col_name]                 # target label
X = data.drop(columns=[col_name])  # features

In [25]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [26]:
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

## Combined Classification Model Test

In [195]:
# Classifiers to compare, all with default hyperparameters.
# DT and RF are randomised estimators, so they are seeded to make the scores
# repeatable across kernel restarts (the CV splitter below is already seeded).
model_dict = {
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# Evaluate each model using KFold cross-validation
results = []  # one [accuracy, micro-F1, weighted-F1] row per model
names = []    # model keys, in the same order as `results`
# Note: for single-label classification, micro-averaged F1 equals accuracy,
# so the "F1-Score" column mirrors the accuracy column.
scoring = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_micro',
    'weighted_f1': 'f1_weighted'
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Only the training split is cross-validated; X_test / y_test are not used here.
for name, model in model_dict.items():
    # NOTE(review): the pipeline wraps the bare estimator - no scaler step is
    # included. Confirm that unscaled features are intended for LR and SVM.
    pipeline = make_pipeline(model)
    cv_results = cross_validate(pipeline, X_train, y_train, cv=kfold, scoring=scoring)
    accuracy = cv_results["test_accuracy"].mean()
    # Stored as `f1_micro` (not `f1_score`) so the `f1_score` function imported
    # from sklearn.metrics is not overwritten by a float.
    f1_micro = cv_results["test_f1_score"].mean()
    f1_weighted = cv_results["test_weighted_f1"].mean()

    results.append([accuracy, f1_micro, f1_weighted])
    names.append(name)
    print(f"{name}: Accuracy: {accuracy:.2%}, F1-Score: {f1_micro:.3f}, Weighted F1-Score: {f1_weighted:.3f}")

LR: Accuracy: 51.23%, F1-Score: 0.512, Weighted F1-Score: 0.352
DT: Accuracy: 49.94%, F1-Score: 0.499, Weighted F1-Score: 0.500
RF: Accuracy: 49.71%, F1-Score: 0.497, Weighted F1-Score: 0.496
SVM: Accuracy: 51.17%, F1-Score: 0.512, Weighted F1-Score: 0.349


# 8. CopulaGAN (1500 epochs)

In [196]:
# Load the CopulaGAN synthetic dataset (1500 epochs) from a CSV file (relative path)
data = pd.read_csv(filepath_or_buffer="synthetic_dataset_CopulaGANepochs1500.csv")

In [197]:
# Take the label of the column at position 5 as the target name, then echo it
col_name = data.columns[5]
col_name

'Termd'

## Model: Train_test_split and scaling

In [198]:
# Separate the target series from the remaining feature columns
y = data[col_name]                 # target label
X = data.drop(columns=[col_name])  # features

In [199]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [200]:
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

## Combined Classification Model Test

In [201]:
# Classifiers to compare, all with default hyperparameters.
# DT and RF are randomised estimators, so they are seeded to make the scores
# repeatable across kernel restarts (the CV splitter below is already seeded).
model_dict = {
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# Evaluate each model using KFold cross-validation
results = []  # one [accuracy, micro-F1, weighted-F1] row per model
names = []    # model keys, in the same order as `results`
# Note: for single-label classification, micro-averaged F1 equals accuracy,
# so the "F1-Score" column mirrors the accuracy column.
scoring = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_micro',
    'weighted_f1': 'f1_weighted'
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Only the training split is cross-validated; X_test / y_test are not used here.
for name, model in model_dict.items():
    # NOTE(review): the pipeline wraps the bare estimator - no scaler step is
    # included. Confirm that unscaled features are intended for LR and SVM.
    pipeline = make_pipeline(model)
    cv_results = cross_validate(pipeline, X_train, y_train, cv=kfold, scoring=scoring)
    accuracy = cv_results["test_accuracy"].mean()
    # Stored as `f1_micro` (not `f1_score`) so the `f1_score` function imported
    # from sklearn.metrics is not overwritten by a float.
    f1_micro = cv_results["test_f1_score"].mean()
    f1_weighted = cv_results["test_weighted_f1"].mean()

    results.append([accuracy, f1_micro, f1_weighted])
    names.append(name)
    print(f"{name}: Accuracy: {accuracy:.2%}, F1-Score: {f1_micro:.3f}, Weighted F1-Score: {f1_weighted:.3f}")

LR: Accuracy: 83.01%, F1-Score: 0.830, Weighted F1-Score: 0.753
DT: Accuracy: 70.12%, F1-Score: 0.701, Weighted F1-Score: 0.710
RF: Accuracy: 83.01%, F1-Score: 0.830, Weighted F1-Score: 0.753
SVM: Accuracy: 83.01%, F1-Score: 0.830, Weighted F1-Score: 0.753


# 9. CopulaGAN (2000 epochs)

In [28]:
# Load the CopulaGAN synthetic dataset (2000 epochs) from a CSV file (relative path)
data = pd.read_csv(filepath_or_buffer="synthetic_dataset_CopulaGANepochs2000.csv")

In [29]:
# Take the label of the column at position 5 as the target name, then echo it
col_name = data.columns[5]
col_name

'Termd'

## Model: Train_test_split and scaling

In [30]:
# Separate the target series from the remaining feature columns
y = data[col_name]                 # target label
X = data.drop(columns=[col_name])  # features

In [31]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [32]:
#scaler = StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

## Combined Classification Model Test

In [33]:
# Classifiers to compare, all with default hyperparameters.
# DT and RF are randomised estimators, so they are seeded to make the scores
# repeatable across kernel restarts (the CV splitter below is already seeded).
model_dict = {
    "LR": LogisticRegression(),
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(random_state=42),
    "SVM": SVC()
}

# Evaluate each model using KFold cross-validation
results = []  # one [accuracy, micro-F1, weighted-F1] row per model
names = []    # model keys, in the same order as `results`
# Note: for single-label classification, micro-averaged F1 equals accuracy,
# so the "F1-Score" column mirrors the accuracy column.
scoring = {
    'accuracy': 'accuracy',
    'f1_score': 'f1_micro',
    'weighted_f1': 'f1_weighted'
}

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

# Only the training split is cross-validated; X_test / y_test are not used here.
for name, model in model_dict.items():
    # NOTE(review): the pipeline wraps the bare estimator - no scaler step is
    # included. Confirm that unscaled features are intended for LR and SVM.
    pipeline = make_pipeline(model)
    cv_results = cross_validate(pipeline, X_train, y_train, cv=kfold, scoring=scoring)
    accuracy = cv_results["test_accuracy"].mean()
    # Stored as `f1_micro` (not `f1_score`) so the `f1_score` function imported
    # from sklearn.metrics is not overwritten by a float.
    f1_micro = cv_results["test_f1_score"].mean()
    f1_weighted = cv_results["test_weighted_f1"].mean()

    results.append([accuracy, f1_micro, f1_weighted])
    names.append(name)
    print(f"{name}: Accuracy: {accuracy:.2%}, F1-Score: {f1_micro:.3f}, Weighted F1-Score: {f1_weighted:.3f}")

LR: Accuracy: 70.39%, F1-Score: 0.704, Weighted F1-Score: 0.582
DT: Accuracy: 56.51%, F1-Score: 0.565, Weighted F1-Score: 0.569
RF: Accuracy: 70.21%, F1-Score: 0.702, Weighted F1-Score: 0.583
SVM: Accuracy: 70.39%, F1-Score: 0.704, Weighted F1-Score: 0.582


# Documentation Sources for Reference

Classification models: https://scikit-learn.org/stable/supervised_learning.html
F-1 Score: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
KFold cross validation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
Pipeline: https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html