In [88]:
import numpy as np
import pandas as pd
import matplotlib as plt
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,precision_recall_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV


In [89]:
! ls

pipe.ipynb   turnover.csv


In [95]:
data = pd.read_csv('turnover.csv', encoding = 'ISO-8859-1')

In [96]:
data

Unnamed: 0,stag,event,gender,age,industry,profession,traffic,coach,head_gender,greywage,way,extraversion,independ,selfcontrol,anxiety,novator
0,7.030801,1,m,35.0,Banks,HR,rabrecNErab,no,f,white,bus,6.2,4.1,5.7,7.1,8.3
1,22.965092,1,m,33.0,Banks,HR,empjs,no,m,white,bus,6.2,4.1,5.7,7.1,8.3
2,15.934292,1,f,35.0,PowerGeneration,HR,rabrecNErab,no,m,white,bus,6.2,6.2,2.6,4.8,8.3
3,15.934292,1,f,35.0,PowerGeneration,HR,rabrecNErab,no,m,white,bus,5.4,7.6,4.9,2.5,6.7
4,8.410678,1,m,32.0,Retail,Commercial,youjs,yes,f,white,bus,3.0,4.1,8.0,7.1,3.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1124,10.611910,0,f,41.0,Banks,HR,rabrecNErab,my head,m,white,bus,8.6,3.4,2.6,4.8,8.3
1125,10.611910,0,f,41.0,Banks,HR,rabrecNErab,my head,m,white,bus,8.6,3.4,2.6,4.8,8.3
1126,118.800821,0,f,34.0,Telecom,Accounting,KA,no,f,white,bus,4.6,5.5,7.2,6.3,3.7
1127,49.412731,0,f,51.0,Consult,HR,empjs,no,m,grey,bus,3.8,7.6,5.7,6.3,5.2


In [100]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('event', axis=1),data['event'], test_size=0.3)

In [101]:
class preprocessing:
    def __init__(self, cat, num, is_linear, fill_func = np.mean):
        

        
        self.cat = cat
        self.num = num
        self.is_linear = is_linear
        self.to_binarize = None
        
        self.fill_func = fill_func
        self.fill_values = None
        if is_linear:
            self.column_transformer = ColumnTransformer([
                ('ohe', OneHotEncoder(handle_unknown="ignore"), self.cat),
                ('scaling', StandardScaler(), self.num)
            ])
        else: 
            self.column_transformer = ColumnTransformer([
                ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value',
                                           unknown_value = -1), self.cat),
                ('chill', 'passthrough', self.num)
                
            ])
                    
    def fit_transform(self, df):
        
        self.to_binarize = np.array(df.columns)[(df.isna().sum() / df.shape[0] > 0.5).values]
        df[self.to_binarize] = df[self.to_binarize].isna().astype(int)
        
        if self.is_linear:
            self.fill_values = dict(self.fill_func(df[self.num]))
            df[self.num] = df[self.num].fillna(value = self.fill_values)
        else: 
            print
            df[self.num] = df[self.num].fillna(value = -1000)

        df[self.cat] = df[self.cat].fillna(-1000)
        df = self.column_transformer.fit_transform(df)
        return df
    
    def transform(self, df):
        df[self.to_binarize] = df[self.to_binarize].isna().astype(int)
        df[self.cat] = df[self.cat].fillna(-1)
        df[self.num] = df[self.num].fillna(value = self.fill_values)
        df = self.column_transformer.transform(df)
        return df
        

In [102]:
class pipe():
    def __init__(self,
                 model,
                 is_linear : bool):
        self.model = model
        self.final_model = None
        self.prep = None
        self.is_linear = is_linear
    
    def fit(self, X_train,
                  y_train,
                  params : dict,
                  cat : list,
                  num : list,
                  cv : int = 5):
        

        self.prep = preprocessing(cat=cat, num=num, is_linear= self.is_linear)
        X_train = self.prep.fit_transform(X_train)
        self.final_model = GridSearchCV(self.model,
                                        params,
                                        cv=cv,
                                        scoring='roc_auc',
                                        return_train_score=False,
                                        verbose = 1,
                                        n_jobs = -1)
        self.final_model.fit(X_train, y_train)
        print('-----------------------------------------------')
        print(f'best_params is {self.final_model.best_params_}')
        print(f'best_score is {self.final_model.best_score_}')
        print()
        return
    

In [103]:
categories = ['gender', 'industry', 'profession',
              'traffic', 'coach', 'head_gender', 'greywage', 'way']
numeric = list(set(X_train.columns) - set(categories))

In [104]:
lr = pipe(LogisticRegression(),
          is_linear=True)
params = {'C' : [1.0, 0.5, 0.2], 'max_iter' : [100,200,500]}
lr.fit(X_train, y_train, params, categories, numeric)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
-----------------------------------------------
best_params is {'C': 0.5, 'max_iter': 100}
best_score is 0.6505227373571789



In [105]:
lgbm = pipe(LGBMClassifier(),
            is_linear=False)
params = {'learning_rate' : [0.1, 0.5, 1], 'n_estimators' : [100, 500, 1000], 'max_depth' : [5,10,15]}
lgbm.fit(X_train, y_train, params, categories, numeric)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
-----------------------------------------------
best_params is {'learning_rate': 0.5, 'max_depth': 10, 'n_estimators': 500}
best_score is 0.7125735212552229

