In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Data Loading and Preprocessing

In [None]:
# Read the raw turnover data from a path relative to the notebook's working directory.
hr_df= pd.read_csv('data/turnover.csv')

In [None]:
# Tidy the raw column labels: correct the "montly" misspelling, expose the
# `sales` column under the name `role`, and lower-case `Work_accident`.
hr_df = hr_df.rename(
    columns={
        'average_montly_hours': 'average_monthly_hours',
        'sales': 'role',
        'Work_accident': 'work_accident',
    }
)
hr_df.head()

#### Splitting the dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on `left` keeps its class
# proportions equal in both partitions; the fixed seed makes the split repeatable.
train_set, test_set = train_test_split(
    hr_df,
    stratify=hr_df['left'],
    test_size=0.2,
    random_state=289,
)

#### Outlier removal

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
class OutlierRemoval(BaseEstimator, TransformerMixin):
    """Drop rows whose values lie outside the 1.5 * IQR fences of selected columns.

    Parameters
    ----------
    columns : list of str
        Labels of the columns to screen, processed in the order given.

    Notes
    -----
    The transformer keeps no fitted state: the quartiles are computed from the
    frame handed to ``transform`` itself, and each column is screened on the rows
    that survived the previous column's filter.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``.

        ``y`` is accepted and ignored so that ``fit`` / ``fit_transform`` can be
        called with a target, as scikit-learn does whenever one is supplied.
        """
        return self

    def transform(self, X):
        """Return the rows of ``X`` that fall inside the IQR fences of every column.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every label listed in ``columns``.

        Returns
        -------
        pandas.DataFrame
            Filtered frame; the input object itself is not modified.
        """
        for column in self.columns:
            q25, q75 = np.percentile(X[column], [25, 75])
            iqr = q75 - q25
            lower, upper = q25 - 1.5 * iqr, q75 + 1.5 * iqr

            # Boolean indexing builds a new frame, so the caller's X is left intact.
            X = X[(X[column] >= lower) & (X[column] <= upper)]

        return X
    
# Build an outlier-filtered copy of the training rows, screening on evaluation
# score and tenure; `train_set` itself is kept as-is for the basic pipeline.
outlier_removal = OutlierRemoval(
    columns=['last_evaluation', 'time_spend_company'],
)
train_set_adv = outlier_removal.fit_transform(train_set)

In [None]:
# Separate the `left` target from the remaining columns for each partition.

# Full training partition.
X = train_set.drop(columns='left').copy()
y = train_set.left.copy()

# Outlier-filtered training partition.
X_adv = train_set_adv.drop(columns='left').copy()
y_adv = train_set_adv.left.copy()

# Held-out test partition.
X_test = test_set.drop(columns='left').copy()
y_test = test_set.left.copy()

In [None]:
# Preview the first rows of the training features (target column already removed).
X.head()

#### Preprocessing 

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

class CustomOrdinalEncoder(BaseEstimator, TransformerMixin):
    """Replace category labels with caller-supplied ordinal codes.

    Parameters
    ----------
    order : dict, optional
        Maps each original value to its replacement, e.g.
        ``{'low': 1, 'medium': 2, 'high': 3}``. When ``None`` or empty the
        transformer returns its input unchanged.

    Notes
    -----
    The replacement is applied to every column of the frame; columns that do
    not contain any of the mapped values come out with the same values.
    """

    def __init__(self, order=None):
        # None instead of a mutable `{}` default, which would be shared by every
        # instance created without an explicit mapping.
        self.order = order

    def fit(self, X, y=None):
        """Do nothing and return ``self``; ``y`` is accepted and ignored."""
        return self

    def transform(self, X):
        """Return ``X`` with the mapped values replaced by their codes.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to encode.

        Returns
        -------
        pandas.DataFrame
            ``X`` itself when no mapping is configured, otherwise an encoded copy.
        """
        if not self.order:
            return X

        labels = list(self.order.keys())
        codes = list(self.order.values())

        # Work on a copy so the caller's frame is not silently rewritten.
        X = X.copy()
        for column in X.columns:
            X[column] = X[column].replace(labels, codes)

        return X

# Continuous columns that the advanced pipeline discretises into ordinal bins.
binning_features = ['satisfaction_level', 'last_evaluation', 'average_monthly_hours']


# Basic feature set: one-hot encode `role`, pass every other column through.
# sparse_threshold=0 forces a dense result, because OneHotEncoder can emit a
# sparse matrix and the MinMaxScaler that follows does not accept sparse input.
basic_featuring = ColumnTransformer([
    ('role_basic', OneHotEncoder(), ['role'])
], remainder='passthrough', sparse_threshold=0)


# CustomOrdinalEncoder takes only the value -> code mapping and applies it to
# every column; presumably only `salary` holds these labels — verify against the data.
basic_feature_pipeline = Pipeline([
    ('basic_salary', CustomOrdinalEncoder({'low': 1, 'medium': 2, 'high': 3})),
    ('basic_role', basic_featuring),
    ('basic_scaling', MinMaxScaler())
])


# Advanced feature set: additionally bin the continuous columns into 7 ordinal bins.
advanced_featuring = ColumnTransformer([
    ('binning', KBinsDiscretizer(n_bins=7, encode='ordinal'), binning_features),
    ('role_advanced', OneHotEncoder(), ['role'])
], remainder='passthrough', sparse_threshold=0)

# Transformers only: this pipeline is used through fit_transform()/transform(),
# so it must not end in a predictor.
advanced_feature_pipeline = Pipeline([
    ('advanced_salary', CustomOrdinalEncoder({'low': 1, 'medium': 2, 'high': 3})),
    ('advanced_featuring', advanced_featuring),
    ('advanced_scaling', MinMaxScaler())
])


In [None]:
# Fit the basic pipeline on the training features only, then apply the fitted
# steps to the test features.
X_prepared = basic_feature_pipeline.fit_transform(X)
X_test_prepared = basic_feature_pipeline.transform(X_test)

In [None]:
# Fit the advanced pipeline on the outlier-filtered training features, then
# apply it to the same (unfiltered) test features used for the basic pipeline.
X_adv_prepared = advanced_feature_pipeline.fit_transform(X_adv)
X_test_adv_prepared = advanced_feature_pipeline.transform(X_test)

#### Feature Engineering - nonregularized logit, all features

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, matthews_corrcoef

# Baseline: unpenalised logistic regression on the basic feature matrix.
# NOTE(review): scikit-learn >= 1.2 spells this `penalty=None`; the string
# 'none' is kept for compatibility with the version this notebook was written for.
initial_logreg = LogisticRegression(penalty='none', solver='newton-cg')
initial_logreg.fit(X_prepared, y)

# Score on the test rows after they went through the same fitted pipeline;
# the raw X_test still contains string columns the model cannot consume.
initial_y_pred = initial_logreg.predict(X_test_prepared)

print(classification_report(y_test, initial_y_pred))
print(matthews_corrcoef(y_test, initial_y_pred))


In [None]:
# Rebuild the feature names in the column order the basic ColumnTransformer
# emits: the one-hot `role` levels first, then the passthrough columns in their
# original order. The pipeline output is a plain array with no column labels.
initial_role_levels = (
    basic_feature_pipeline.named_steps['basic_role']
    .named_transformers_['role_basic']
    .categories_[0]
)
initial_feature_names = (
    [f'role_{level}' for level in initial_role_levels]
    + list(train_set.columns.drop(['left', 'role']))
)

# Rank the features by absolute coefficient, largest first.
sorted(zip(initial_feature_names, initial_logreg.coef_[0]), key=lambda pair: abs(pair[1]), reverse=True)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, matthews_corrcoef

# Unpenalised logistic regression on the advanced feature matrix
# (outlier-filtered rows, binned continuous columns).
# NOTE(review): the advanced features are assumed to be the intended input for
# this second model — confirm.
# NOTE(review): scikit-learn >= 1.2 spells this `penalty=None`; the string
# 'none' is kept for compatibility with the version this notebook was written for.
logreg = LogisticRegression(penalty='none', solver='newton-cg')
logreg.fit(X_adv_prepared, y_adv)

# X, y, X_test and y_test are deliberately not reassigned here, so the other
# cells that read them keep seeing the original train/test partitions.
y_pred = logreg.predict(X_test_adv_prepared)

print(classification_report(y_test, y_pred))
print(matthews_corrcoef(y_test, y_pred))


In [None]:
# Rebuild the feature names in the column order the advanced ColumnTransformer
# emits: binned columns, then the one-hot `role` levels, then the passthrough
# columns in their original order.
# NOTE(review): this labelling matches `logreg` only if it was fitted on the
# output of advanced_feature_pipeline — confirm.
adv_role_levels = (
    advanced_feature_pipeline.named_steps['advanced_featuring']
    .named_transformers_['role_advanced']
    .categories_[0]
)
adv_feature_names = (
    list(binning_features)
    + [f'role_{level}' for level in adv_role_levels]
    + list(train_set.columns.drop(['left', 'role'] + list(binning_features)))
)

# Rank the features by absolute coefficient, largest first.
sorted(zip(adv_feature_names, logreg.coef_[0]), key=lambda pair: abs(pair[1]), reverse=True)

#### Feature Engineering - regularized logit, all features

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, matthews_corrcoef

# L1-penalised logistic regression on the same basic feature matrix as the
# unpenalised baseline, so the two sets of scores are directly comparable.
initial_logreg_l1 = LogisticRegression(penalty='l1', solver='liblinear')
initial_logreg_l1.fit(X_prepared, y)

initial_y_pred_l1 = initial_logreg_l1.predict(X_test_prepared)

print(classification_report(y_test, initial_y_pred_l1))
print(matthews_corrcoef(y_test, initial_y_pred_l1))

In [None]:
# Rebuild the feature names in the column order the basic ColumnTransformer
# emits: the one-hot `role` levels first, then the passthrough columns in their
# original order. The raw X_test columns do not line up with the coefficients.
l1_role_levels = (
    basic_feature_pipeline.named_steps['basic_role']
    .named_transformers_['role_basic']
    .categories_[0]
)
l1_feature_names = (
    [f'role_{level}' for level in l1_role_levels]
    + list(train_set.columns.drop(['left', 'role']))
)

# Rank the features by absolute coefficient, largest first.
sorted(zip(l1_feature_names, initial_logreg_l1.coef_[0]), key=lambda pair: abs(pair[1]), reverse=True)