In [153]:
import numpy as np
import pandas as pd
import xam

import warnings
# Silence every warning for the rest of the notebook. Note that this blanket
# filter also hides deprecation / future-behaviour warnings from libraries.
warnings.filterwarnings("ignore")

In [154]:
# Read the training and test CSV files (in that order) into DataFrames.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [155]:
# Strip the EmployeeNo identifier from both frames. The test identifiers are
# kept in `test_id` because the submission file needs them later.
test_id = test.pop('EmployeeNo')
del train['EmployeeNo']

In [156]:
# Partition the training columns by dtype: object-typed columns are treated as
# categorical, every other column as numerical.
cat_cols = train.select_dtypes(include='object').columns.tolist()
num_cols = train.select_dtypes(exclude='object').columns.tolist()

In [158]:
# Frequency of each Qualification level. value_counts() leaves NaN out by
# default, so the counts shown add up to fewer rows than the frame holds —
# the difference is the number of missing values.
train['Qualification'].value_counts()

First Degree or HND         25578
MSc, MBA and PhD            10469
Non-University Education      586
Name: Qualification, dtype: int64

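Fill the missing values in Qualification. Note: the code below fills them with 'Non-University Education' rather than the mode ('First Degree or HND'), and the extra column indicating which values were missing is currently commented out.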

In [159]:
# Impute missing Qualification values with 'Non-University Education' in both
# frames. The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form acts
# on an intermediate object and does not update the frame when pandas
# Copy-on-Write is active.
# The disabled lines would add a 0/1 indicator of which rows were missing.
# train['Missing_quaifications'] =  train['Qualification'].isna().astype('int64')
train['Qualification'] = train['Qualification'].fillna('Non-University Education')

# test['Missing_quaifications'] =  test['Qualification'].isna().astype('int64')
test['Qualification'] = test['Qualification'].fillna('Non-University Education')

### Exploring numerical features

In [160]:
# Summary statistics of the numeric columns. The target Promoted_or_Not is
# included; its mean is the share of promoted employees in the training data.
train.describe()

Unnamed: 0,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,Promoted_or_Not
count,38312.0,38312.0,38312.0,38312.0,38312.0,38312.0,38312.0,38312.0
mean,2.25368,1986.209334,7.698959,2013.139695,0.352996,0.023152,55.366465,0.084595
std,0.609443,7.646047,3.744135,4.261451,0.477908,0.150388,13.362741,0.278282
min,2.0,1950.0,0.0,1982.0,0.0,0.0,31.0,0.0
25%,2.0,1982.0,5.0,2012.0,0.0,0.0,43.0,0.0
50%,2.0,1988.0,7.5,2014.0,0.0,0.0,52.0,0.0
75%,2.0,1992.0,10.0,2016.0,1.0,0.0,68.0,0.0
max,11.0,2001.0,12.5,2018.0,1.0,1.0,91.0,1.0


### Exploring categorical features

In [161]:
# Peek at the object-typed columns. No_of_previous_employers appears here
# because it was read with object dtype, even though its values look numeric.
train[cat_cols].head()

Unnamed: 0,Division,Qualification,Gender,Channel_of_Recruitment,State_Of_Origin,Foreign_schooled,Marital_Status,Past_Disciplinary_Action,Previous_IntraDepartmental_Movement,No_of_previous_employers
0,Commercial Sales and Marketing,"MSc, MBA and PhD",Female,Direct Internal process,ANAMBRA,No,Married,No,No,0
1,Customer Support and Field Operations,First Degree or HND,Male,Agency and others,ANAMBRA,Yes,Married,No,No,0
2,Commercial Sales and Marketing,First Degree or HND,Male,Direct Internal process,KATSINA,Yes,Married,No,No,0
3,Commercial Sales and Marketing,First Degree or HND,Male,Agency and others,NIGER,Yes,Single,No,No,1
4,Information and Strategy,First Degree or HND,Male,Direct Internal process,AKWA IBOM,Yes,Married,No,No,1


In [162]:
# Print the number of distinct values found in each categorical column.
for col, n_levels in train[cat_cols].nunique().items():
    print("{}: {}".format(col, n_levels))

Division: 9
Qualification: 3
Gender: 2
Channel_of_Recruitment: 3
State_Of_Origin: 37
Foreign_schooled: 2
Marital_Status: 3
Past_Disciplinary_Action: 2
Previous_IntraDepartmental_Movement: 2
No_of_previous_employers: 7


In [164]:
# Integer-encode the two highest-cardinality columns (Division, State_Of_Origin).
from sklearn.preprocessing import LabelEncoder

to_enc = ['Division', 'State_Of_Origin']

# One encoder object is reused: for each column it learns the classes from the
# training frame and then applies that same mapping to the test frame.
lb = LabelEncoder()
for col in to_enc:
    train[col] = lb.fit_transform(train[col])
    test[col] = lb.transform(test[col])

In [166]:
# mean encoding
# Target ("mean") encoding of every column listed in cat_cols using xam's
# BayesianTargetEncoder. cat_cols was computed before the label-encoding step,
# so it still contains Division and State_Of_Origin.
# NOTE(review): the encoder is fitted on the whole training frame, i.e. before
# the hold-out split made later with train_test_split, so hold-out targets
# contribute to their own encodings and hold-out scores may be optimistic.

mean_enc = xam.feature_extraction.BayesianTargetEncoder(columns=cat_cols)

# Separate the target from the features; drop() without inplace returns a new frame.
y = train['Promoted_or_Not']
train_enc = train.drop('Promoted_or_Not', axis=1)
mean_enc.fit(train_enc, y)

# NOTE(review): the frame displayed further down shows '<column>_mean' columns
# after this step. Whether transform returns a copy or the very object it was
# given is not visible here; later cells use `test` where `test_enc` would be
# expected, which suggests the two names may refer to the same frame — confirm.
train_enc = mean_enc.transform(train_enc)
test_enc = mean_enc.transform(test)

In [167]:
# Discard the original categorical columns from both encoded frames, in place,
# so that only the numeric and '*_mean' columns remain.
for encoded_frame in (train_enc, test_enc):
    encoded_frame.drop(columns=cat_cols, inplace=True)

In [168]:
# Inspect the encoded training features after the raw categoricals were dropped.
train_enc.head()

Unnamed: 0,Trainings_Attended,Year_of_birth,Last_performance_score,Year_of_recruitment,Targets_met,Previous_Award,Training_score_average,Division_mean,Qualification_mean,Gender_mean,Channel_of_Recruitment_mean,State_Of_Origin_mean,Foreign_schooled_mean,Marital_Status_mean,Past_Disciplinary_Action_mean,Previous_IntraDepartmental_Movement_mean,No_of_previous_employers_mean
0,2,1986,12.5,2011,1,0,41,0.072019,0.096552,0.08901,0.083188,0.095281,0.081785,0.084971,0.084511,0.085336,0.082371
1,2,1991,12.5,2015,0,0,52,0.088252,0.081917,0.082708,0.084281,0.095281,0.084868,0.084971,0.084511,0.085336,0.082371
2,2,1987,7.5,2012,0,0,42,0.072019,0.081917,0.082708,0.083188,0.078964,0.084868,0.084971,0.084511,0.085336,0.082371
3,3,1982,2.5,2009,0,0,42,0.072019,0.081917,0.082708,0.084281,0.07258,0.084868,0.08175,0.084511,0.085336,0.085541
4,3,1990,7.5,2012,0,0,77,0.09329,0.081917,0.082708,0.083188,0.084682,0.084868,0.084971,0.084511,0.085336,0.085541


In [169]:
# # One hot encode
# train = pd.get_dummies(train_enc)
# test = pd.get_dummies(test_enc)

In [170]:
# (rows, columns) of the encoded training features.
train_enc.shape

(38312, 17)

In [171]:
# (rows, columns) of the encoded test features; the column count must match
# train_enc. The encoded frame is inspected explicitly rather than `test`,
# since test_enc is the frame that corresponds to train_enc.
test_enc.shape

(16496, 17)

In [172]:
# get columns for feature importance
# Keep the feature names of the encoded training frame.
# train_cols is not referenced again in the cells visible here.
train_cols = train_enc.columns

# target = train['Promoted_or_Not']
# train.drop('Promoted_or_Not', axis=1, inplace=True)

In [173]:
# split data
from sklearn.model_selection import train_test_split

# Keep 30% of the encoded training rows aside as a hold-out set; the fixed
# random_state makes the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_enc,
    y,
    random_state=1,
    test_size=0.3,
)

In [174]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: the scaler learns its statistics from the training
# split only and the same transformation is then applied to the hold-out split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


### MODELING

In [175]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.ensemble import BaggingClassifier, VotingClassifier

import lightgbm as lgb
# import xgboost as xgb
import catboost as cb


In [176]:
def train_and_score(model, X, y, test, test_y):
    """Fit ``model`` and print its F1 score on a held-out set.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place.
    X, y
        Training features and training labels, passed to ``model.fit``.
    test
        Features to predict on, passed to ``model.predict``.
    test_y
        True labels corresponding to ``test``.

    Returns
    -------
    None
        The score is printed, not returned.
    """
    model.fit(X, y)
    pred = model.predict(test)
    # f1_score takes the true labels first and the predictions second.
    print("F1_score is {}".format(f1_score(test_y, pred)))

In [177]:
def max_train_and_score(models, X, y, test, test_y, full=False):
    """Fit several models and combine their class predictions by majority vote.

    Every model is fitted on ``(X, y)`` and asked to predict ``test``. The
    per-row mean of the predictions is rounded to the nearest integer to give
    the ensemble label (an exact 0.5 tie rounds to 0, because the rounding is
    round-half-to-even).

    Parameters
    ----------
    models : iterable
        Estimators exposing ``fit(X, y)`` and ``predict(X)``; fitted in place.
    X, y
        Training features and labels.
    test
        Features to predict on.
    test_y
        True labels for ``test``; only read when ``full`` is false.
    full : bool, default False
        When true, the F1 score is not computed or printed (``test_y`` is
        ignored), e.g. when no labels exist for ``test``.

    Returns
    -------
    pandas.Series
        The voted predictions as ``int64``.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    models = list(models)
    if not models:
        raise ValueError("max_train_and_score needs at least one model")

    pred_array = pd.DataFrame()
    for i, model in enumerate(models):
        model.fit(X, y)
        pred_array['model{}'.format(i)] = model.predict(test)

    # Vote once, after every model has contributed its column.
    max_pred = pred_array.mean(axis=1).round().astype('int64')

    if not full:
        # f1_score takes the true labels first and the predictions second.
        print("F1_score is {}".format(f1_score(test_y, max_pred)))

    return max_pred

In [178]:
# CatBoost baseline: 700 iterations, fixed seed, training log suppressed.
# Fitted on the training split and scored on the hold-out split.
cb_model = cb.CatBoostClassifier(
    random_seed=1,
    silent=True,
    iterations=700,
)
train_and_score(cb_model, X_train, y_train, X_test, y_test)

F1_score is 0.5053929121725731


In [179]:
# LightGBM model: 1200 shallow trees (depth 4) with a fixed seed.
# Fitted on the training split and scored on the hold-out split.
best_lgb = lgb.LGBMClassifier(
    max_depth=4,
    n_estimators=1200,
    random_state=1,
)
train_and_score(best_lgb, X_train, y_train, X_test, y_test)

F1_score is 0.5047688921496699


In [180]:
# rf_model = RandomForestClassifier(n_estimators=1000, random_state=2)
# train_and_score(rf_model, X_train, y_train, X_test, y_test)

In [181]:
# xgb_model = xgb.XGBClassifier(n_estimators=1000, random_state=2)
# train_and_score(xgb_model, X_train, y_train, X_test, y_test)

In [182]:
# ests = [cb_model, xgb_model,best_lgb, rf_model]
# # pred = max_train_and_score(ests, X_train, y_train, X_test, y_test)

In [183]:
# Load the sample submission to reuse as a template; both of its columns are
# overwritten further down before the file is saved.
sample = pd.read_csv('sample_submission2.csv')

In [184]:
# Template layout: an EmployeeNo column and a Promoted_or_Not column.
sample.head()

Unnamed: 0,EmployeeNo,Promoted_or_Not
0,YAK/S/34385,1
1,YAK/S/27825,1
2,YAK/S/23870,1
3,YAK/S/54784,1
4,YAK/S/25058,1


In [185]:

# Final model: refit on every labelled row and predict the competition test set.
# A fresh scaler is fitted on the complete encoded training frame and applied
# to the *encoded* test frame (test_enc), so the columns match train_enc.
# The scaled arrays get their own names instead of overwriting `train` and
# `test`; the cell can therefore be re-run without scaling already-scaled data.
sca = StandardScaler()
train_scaled = sca.fit_transform(train_enc)
test_scaled = sca.transform(test_enc)

best_lgb.fit(train_scaled, y)
final_pred = best_lgb.predict(test_scaled)

In [186]:
# Overwrite the template columns with the test employee IDs and the final
# predictions (cast to int64), then preview the result.
sample = sample.assign(
    EmployeeNo=test_id,
    Promoted_or_Not=final_pred.astype('int64'),
)
sample.head()

Unnamed: 0,EmployeeNo,Promoted_or_Not
0,YAK/S/00005,0
1,YAK/S/00011,0
2,YAK/S/00015,0
3,YAK/S/00016,0
4,YAK/S/00017,0


In [187]:
# Class balance of the submitted predictions (count of 0s and 1s).
sample['Promoted_or_Not'].value_counts()

0    15895
1      601
Name: Promoted_or_Not, dtype: int64

In [188]:
# Write the submission without the index column.
# NOTE(review): the 'cb' in the file name suggests CatBoost, but final_pred is
# produced by best_lgb (LightGBM) — confirm the intended file name.
sample.to_csv('cb_mean1.csv', index=False)