In [1]:
import random

import math
import numpy as np
import pandas as pd
from numpy import log, exp

import matplotlib.pyplot as plt

In [2]:
class EmployeeRetention:
    """One employee record: four predictor fields plus the quit outcome.

    The constructor stores each argument unchanged on the attribute of the
    same name. Other cells compare ``did_quit`` against ``1`` to pick out
    employees who quit.
    """

    def __init__(self, sex, age, promotions, years_employed, did_quit):
        (
            self.sex,
            self.age,
            self.promotions,
            self.years_employed,
            self.did_quit,
        ) = (sex, age, promotions, years_employed, did_quit)


# Build one EmployeeRetention per CSV row from the first five columns, in
# file order: sex, age, promotions, years_employed, did_quit.
# itertuples(index=False, name=None) yields plain positional tuples, which
# avoids integer-position lookups on a label-indexed Series (deprecated in
# pandas) and keeps each column's own dtype instead of upcasting per row.
employee_data = [
    EmployeeRetention(*row[:5])
    for row in pd.read_csv("data/employee_logreg_data.txt").itertuples(index=False, name=None)
]

  employee_data = [(EmployeeRetention(row[0], row[1], row[2], row[3], row[4])) for index, row in


In [None]:
# Per-employee columns; `did_quit` is also read by the age scatter plot cell.
sexes = [emp.sex for emp in employee_data]
did_quit = [emp.did_quit for emp in employee_data]

# For every distinct sex value: quitters in that group / group size, as a percentage.
sex_quit_percentage = {
    sex: len([emp for emp in employee_data if emp.sex == sex and emp.did_quit == 1])
    / len([emp for emp in employee_data if emp.sex == sex]) * 100
    for sex in set(sexes)
}

# Bar chart of the quit percentage per sex value.
plt.bar(list(sex_quit_percentage), list(sex_quit_percentage.values()), alpha=0.5)
plt.xlabel('Sex')
plt.ylabel('Quit Percentage')
plt.title('Employee Quit Percentage by Sex')
plt.xticks(ticks=[0, 1], labels=['0', '1'])
plt.show()

In [None]:
ages = [emp.age for emp in employee_data]

# Scatter of the raw outcome against age (`did_quit` is defined in an earlier cell).
plt.scatter(ages, did_quit, alpha=0.5)
plt.xlabel('Age')
plt.ylabel('Did Quit')
plt.title('Employee Age vs Did Quit')
plt.show()

# Map each distinct age to a (quit percentage, head count) pair.
ages_quit_percentage = {
    age: (
        len([emp for emp in employee_data if emp.age == age and emp.did_quit == 1])
        / len([emp for emp in employee_data if emp.age == age]) * 100,
        len([emp for emp in employee_data if emp.age == age]),
    )
    for age in set(ages)
}

print(f"Age: Quit Percentage, Number of Employees with age")
print(ages_quit_percentage)

# Bar chart of the percentage component only; the head count is not plotted.
plt.bar(
    list(ages_quit_percentage),
    np.array([percentage for percentage, _ in ages_quit_percentage.values()]),
    alpha=0.5,
)
plt.xlabel('Age')
plt.ylabel('Quit Percentage')
plt.title('Employee Quit Percentage by Age')
plt.show()

In [None]:
promotions = [emp.promotions for emp in employee_data]

# Map each distinct promotion count to a (quit percentage, head count) pair.
promo_quit_percentage = {
    promo: (
        len([emp for emp in employee_data if emp.promotions == promo and emp.did_quit == 1])
        / len([emp for emp in employee_data if emp.promotions == promo]) * 100,
        len([emp for emp in employee_data if emp.promotions == promo]),
    )
    for promo in set(promotions)
}

print(f"Num promotions: Quit Percentage, Number of Employees with amount of promotions")
print(promo_quit_percentage)

# Bar chart of the percentage component, with one tick per observed promotion count.
plt.bar(
    list(promo_quit_percentage),
    np.array([percentage for percentage, _ in promo_quit_percentage.values()]),
    alpha=0.5,
)
plt.xlabel('Promotions')
plt.ylabel('Quit Percentage')
plt.title('Employee Quit Percentage by Promotions')
plt.xticks(ticks=list(promo_quit_percentage), labels=list(promo_quit_percentage))
plt.show()

In [None]:
years_employed = [emp.years_employed for emp in employee_data]

# Map each distinct tenure value to a (quit percentage, head count) pair.
years_employed_quit_percentage = {
    years: (
        len([emp for emp in employee_data if emp.years_employed == years and emp.did_quit == 1])
        / len([emp for emp in employee_data if emp.years_employed == years]) * 100,
        len([emp for emp in employee_data if emp.years_employed == years]),
    )
    for years in set(years_employed)
}

print(f"Years employed: Quit Percentage, Number of Employees with amount of years employed")
print(years_employed_quit_percentage)

# Bar chart of the percentage component, with one tick per observed tenure value.
plt.bar(
    list(years_employed_quit_percentage),
    np.array([percentage for percentage, _ in years_employed_quit_percentage.values()]),
    alpha=0.5,
)
plt.xlabel('Years Employed')
plt.ylabel('Quit Percentage')
plt.title('Employee Quit Percentage by Years Employed')
plt.xticks(ticks=list(years_employed_quit_percentage), labels=list(years_employed_quit_percentage))
plt.show()

In [None]:
# Initial state for the random hill-climbing search that follows.

# The search draws from both `random` and `np.random`; seeding both here makes
# a Restart & Run All produce the same coefficients every time.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Very low starting log-likelihood that the search is expected to improve on.
best_likelihood = -100_000_000_000.0

# Starting coefficients of the logistic model.
b0 = 1.0  # constant
b1 = 1.0  # sex beta
b2 = 1.0  # age beta
b3 = 1.0  # promotions beta
b4 = 1.0  # years employed beta

# History of accepted coefficient tuples and their log-likelihoods,
# beginning with the starting point.
b_values = [(b0, b1, b2, b3, b4)]
likelihoods = [best_likelihood]

# Number of random adjustments to try.
iterations = 100_000

# calculate maximum likelihood

def predict_probability(sex, age, promotions, years_employed, b0, b1, b2, b3, b4):
    """Return the logistic-model probability for one set of predictor values.

    The linear score is ``b0 + b1*sex + b2*age + b3*promotions +
    b4*years_employed`` and the result is ``1 / (1 + exp(-score))``.
    Callers in this notebook treat the result as the probability that the
    employee quits.

    The sigmoid is evaluated as ``exp(-log(1 + exp(-score)))`` using
    ``np.logaddexp``, so a very negative score returns 0.0 without the
    overflow warning that a direct ``exp(-score)`` produces. Scalars and
    NumPy arrays are both accepted.
    """
    x = b0 + (b1 * sex) + (b2 * age) + (b3 * promotions) + (b4 * years_employed)
    # logaddexp(0, -x) == log(1 + exp(-x)), computed without overflowing.
    return np.exp(-np.logaddexp(0.0, -x))

# Random hill-climbing search for the coefficients that maximise the
# log-likelihood of the observed quit outcomes.

# Probabilities are clipped to [floor, 1 - floor] before taking logs so that
# log(0) can never occur and no artificial offset has to be added to the
# non-quit term.
PROBABILITY_FLOOR = 1e-12

# Build the predictor columns and the outcome mask once; every iteration then
# scores all employees with a single vectorised call.
feature_columns = np.array(
    [[emp.sex, emp.age, emp.promotions, emp.years_employed] for emp in employee_data],
    dtype=float,
).reshape(-1, 4).T
quit_mask = np.array([emp.did_quit == 1 for emp in employee_data], dtype=bool)

# Working copy of the coefficients, indexed 0..4 for b0..b4.
betas = [b0, b1, b2, b3, b4]

for i in range(iterations):

    # Select b0, b1, b2, b3, or b4 randomly, and adjust it by a random amount
    random_b = random.randrange(len(betas))
    random_adjust = np.random.standard_normal()

    # Remember the exact previous value so a rejected step can be restored
    # without floating-point drift.
    previous_value = betas[random_b]
    betas[random_b] = previous_value + random_adjust

    # calculate new likelihood
    # Use logarithmic addition to avoid multiplication and decimal underflow
    probabilities = np.clip(
        predict_probability(*feature_columns, *betas),
        PROBABILITY_FLOOR,
        1.0 - PROBABILITY_FLOOR,
    )
    # Employees who quit contribute log(p); the others contribute log(1 - p).
    new_likelihood = float(np.sum(log(np.where(quit_mask, probabilities, 1.0 - probabilities))))

    # If solution improves, keep it and make it new best likelihood. Otherwise undo the adjustment
    if best_likelihood < new_likelihood:
        best_likelihood = new_likelihood
        b0, b1, b2, b3, b4 = betas
        b_values.append((b0, b1, b2, b3, b4))
        likelihoods.append(new_likelihood)

        print(f"{i}/{iterations} | #adjustments: {len(b_values)}")
        print("1.0 / (1 + exp(-({0} + {1}*sex + {2}*age + {3}*promo + {4}*years))".format(b0, b1, b2, b3, b4))
        print("BEST LIKELIHOOD: {0} | log-likelihood: {1}\n".format(math.exp(best_likelihood), best_likelihood))
    else:
        betas[random_b] = previous_value

# Rejected steps were restored, so `betas` now holds the best coefficients found.
b0, b1, b2, b3, b4 = betas

# Print best result
print("1.0 / (1 + exp(-({0} + {1}*sex + {2}*age + {3}*promo + {4}*years))".format(b0, b1, b2, b3, b4))
print("BEST LIKELIHOOD: {0} | log-likelihood: {1}\n".format(math.exp(best_likelihood), best_likelihood))
print(f"#adjustments: {len(b_values)}")

In [None]:
# Interact and test with new employee data
def predict_employee_will_stay(sex, age, promotions, years_employed, b0, b1, b2, b3, b4):
    """Describe whether an employee is predicted to leave or stay.

    Calls ``predict_probability`` with the given predictors and coefficients
    and treats the result as the probability of leaving. A probability of
    0.5 or more yields a "WILL LEAVE" message, anything lower a "WILL STAY"
    message; both include the probability as a percentage rounded to two
    decimals.
    """
    leave_probability = predict_probability(sex, age, promotions, years_employed, b0, b1, b2, b3, b4)
    leave_percent = round(leave_probability * 100.0, 2)
    template = (
        "WILL LEAVE, {0}% chance of leaving"
        if leave_probability >= .5
        else "WILL STAY, {0}% chance of leaving"
    )
    return template.format(leave_percent)


# Prompt for employees until a blank line is entered, so execution can move
# on to the cells below. Each line must contain exactly the four integer
# fields named in the prompt; the coefficients b0..b4 come from the notebook
# namespace rather than from the typed input.
print("Enter a blank line to stop.")
while True:
    n = input("Predict employee will stay or leave {sex},{age},{promotions},{years employed}: ")
    if not n.strip():
        break
    try:
        # Distinct names keep the exploratory lists (`promotions`,
        # `years_employed`) from being overwritten by a single typed value.
        in_sex, in_age, in_promotions, in_years_employed = (int(field) for field in n.split(","))
    except ValueError:
        print("Expected four comma-separated integers: sex,age,promotions,years employed")
        continue
    print(predict_employee_will_stay(in_sex, in_age, in_promotions, in_years_employed, b0, b1, b2, b3, b4))

## sklearn

In [96]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Predictor matrix (sex, age, promotions, years employed) and outcome vector.
X = np.array([(emp.sex, emp.age, emp.promotions, emp.years_employed) for emp in employee_data])
y = np.array([emp.did_quit for emp in employee_data])

# Fit scikit-learn's logistic regression with its default settings.
reg_model = LogisticRegression()
clf = reg_model.fit(X, y)

# Pull the fitted intercept and the four coefficients into the b0..b4 names
# that predict_probability expects.
b0 = clf.intercept_[0]
b1, b2, b3, b4 = clf.coef_[0]

# Log-likelihood of the data under the fitted coefficients: log(p) summed
# over employees who quit, plus log(1 - p) summed over those who did not.
quit_probabilities = [
    predict_probability(emp.sex, emp.age, emp.promotions, emp.years_employed, b0, b1, b2, b3, b4)
    for emp in employee_data
]
quit_terms = [
    math.log(p) for emp, p in zip(employee_data, quit_probabilities) if emp.did_quit == 1.0
]
stay_terms = [
    math.log(1.0 - p) for emp, p in zip(employee_data, quit_probabilities) if emp.did_quit == 0.0
]
log_likelihood = sum(quit_terms) + sum(stay_terms)

print(f"b0: {b0}, b1: {b1}, b2: {b2}, b3: {b3}, b4: {b4}")
print(f"Log-Likelihood: {log_likelihood}")
print(f"Likelihood: {np.exp(log_likelihood)}")
# Predicted class for one hand-written example row.
print(clf.predict([[0, 50, 4, 10]]))
