In [60]:
import os
import re
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.pylab import rcParams

In [61]:
# Notebook-wide settings.
# NOTE: this silences every warning category, including deprecation notices from pandas/seaborn.
warnings.filterwarnings(action="ignore")
# Default figure size (width, height) in inches.
rcParams['figure.figsize'] = (12, 8)

In [62]:
def show_all(df):
    """Display ``df`` with the pandas row and column display limits raised to 1000.

    The limits are raised only for the duration of the call; the previous
    display options are restored when the context manager exits.
    """
    wide_view = pd.option_context(
        "display.max_rows", 1000,
        "display.max_columns", 1000,
    )
    with wide_view:
        display(df)
        
def data_viz(df, cols, target = None, plot_type = 'bar'):
    """Draw one subplot per column of ``cols`` on a two-column grid.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot. It is not modified.
    cols : sequence of str
        Column names; each gets its own subplot. An empty sequence draws nothing.
    target : str, optional
        Name of the target column. Required for ``'bar_target'`` and ``'box'``.
    plot_type : {'bar', 'bar_target', 'box', 'hist'}
        ``'bar'``        - count plot of each column.
        ``'bar_target'`` - stacked bars of each column cross-tabulated with ``target``.
        ``'box'``        - box plot of ``target`` per level, levels ordered by mean ``target``.
        ``'hist'``       - distribution plot, missing values replaced by the column mean.

    Raises
    ------
    ValueError
        If ``plot_type`` is not supported, or ``target`` is missing for a
        plot type that needs it.
    """
    supported = ('bar', 'bar_target', 'box', 'hist')
    if plot_type not in supported:
        raise ValueError(f"plot_type must be one of {supported}, got {plot_type!r}")
    if plot_type in ('bar_target', 'box') and target is None:
        raise ValueError(f"plot_type={plot_type!r} requires a target column")

    cols = list(cols)
    if not cols:
        return

    # Ceiling division. round() rounds halves to the nearest even number, so
    # round(1/2) == 0 (empty grid) and round(5/2) == 2 (fifth column dropped).
    n_rows = (len(cols) + 1) // 2
    row_height = 6 if plot_type == 'hist' else 7
    fig, ax = plt.subplots(n_rows, 2, figsize=(20, row_height * n_rows))
    axes = ax.flatten()

    # Tick-label rotation used by each plot type.
    rotation = {'bar_target': 15, 'bar': 30, 'box': 30, 'hist': 15}[plot_type]

    for variable, subplot in zip(cols, axes):
        if plot_type == 'bar_target':
            ct = pd.crosstab(df[variable], df[target])
            ct.plot.bar(stacked=True, ax=subplot)
        elif plot_type == 'bar':
            # Keyword `x=`: newer seaborn treats the first positional argument as `data`.
            sns.countplot(x=df[variable], ax=subplot)
        elif plot_type == 'box':
            sorted_vals = df.groupby([variable])[target].mean().sort_values()
            sns.boxplot(x=variable, y=target, data=df, ax=subplot, order=list(sorted_vals.index))
        else:  # 'hist'
            # Fill on a local copy so the caller's DataFrame is left untouched.
            filled = df[variable].fillna(df[variable].mean())
            # NOTE: distplot is deprecated in recent seaborn releases; kept to preserve the plot.
            sns.distplot(filled, ax=subplot)

        for label in subplot.get_xticklabels():
            label.set_rotation(rotation)

    # With an odd number of columns the last grid cell is unused; hide it.
    for spare in axes[len(cols):]:
        spare.set_visible(False)

In [63]:
%%time
# Load the labelled training set, the unlabelled test set and the sample submission file.
# Paths are relative to the notebook's working directory.
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
submission = pd.read_csv('input/sample_submission.csv')

CPU times: user 51.8 ms, sys: 12.9 ms, total: 64.7 ms
Wall time: 64.2 ms


In [64]:
# (rows, columns) of each loaded frame.
train.shape, test.shape, submission.shape

((18359, 14), (15021, 13), (15021, 2))

In [65]:
# First training rows, with every column visible.
show_all(train.head())

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,23798,city_149,0.689,Male,Has relevent experience,no_enrollment,Graduate,STEM,3,100-500,Pvt Ltd,1,106,0
1,29166,city_83,0.923,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,<10,Funded Startup,1,69,0
2,46,city_16,0.91,,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Public Sector,2,4,0
3,18527,city_64,0.666,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,50-99,Pvt Ltd,1,26,0
4,21751,city_100,0.887,,No relevent experience,no_enrollment,Masters,STEM,8,,,2,88,1


In [66]:
# First test rows, with every column visible.
show_all(test.head())

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,16548,city_33,0.448,,No relevent experience,Full time course,Graduate,STEM,<1,1000-4999,Public Sector,,15
1,12036,city_28,0.939,Male,No relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,1.0,94
2,11061,city_103,0.92,Male,No relevent experience,Full time course,Graduate,STEM,3,,,1.0,17
3,5032,city_104,0.924,Male,No relevent experience,no_enrollment,Phd,STEM,>20,50-99,Pvt Ltd,2.0,76
4,17599,city_77,0.83,Male,Has relevent experience,no_enrollment,Graduate,STEM,6,<10,Pvt Ltd,2.0,65


In [67]:
# First rows of the sample submission (expected output format).
show_all(submission.head())

Unnamed: 0,enrollee_id,target
0,16548,0
1,12036,0
2,11061,0
3,5032,0
4,17599,0


In [68]:
# Column dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18359 entries, 0 to 18358
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             18359 non-null  int64  
 1   city                    18359 non-null  object 
 2   city_development_index  18359 non-null  float64
 3   gender                  14261 non-null  object 
 4   relevent_experience     18359 non-null  object 
 5   enrolled_university     18017 non-null  object 
 6   education_level         17902 non-null  object 
 7   major_discipline        15521 non-null  object 
 8   experience              18300 non-null  object 
 9   company_size            13580 non-null  object 
 10  company_type            13320 non-null  object 
 11  last_new_job            17992 non-null  object 
 12  training_hours          18359 non-null  int64  
 13  target                  18359 non-null  int64  
dtypes: float64(1), int64(3), object(10)
me

In [69]:
# Summary statistics of the numeric columns; iloc[:, 1:] skips the first column (enrollee_id).
show_all(train.iloc[:,1:].describe())

Unnamed: 0,city_development_index,training_hours,target
count,18359.0,18359.0,18359.0
mean,0.84714,65.899014,0.132088
std,0.110189,60.8853,0.338595
min,0.448,1.0,0.0
25%,0.796,23.0,0.0
50%,0.91,47.0,0.0
75%,0.92,89.0,0.0
max,0.949,336.0,1.0


In [70]:
# Number of missing values per column, for train and then test;
# only columns that actually contain missing values are printed.
for df in (train, test):
    missing_stats = df.isna().sum()
    has_missing = missing_stats != 0
    print(missing_stats[has_missing])

gender                 4098
enrolled_university     342
education_level         457
major_discipline       2838
experience               59
company_size           4779
company_type           5039
last_new_job            367
dtype: int64
gender                 3388
enrolled_university     279
education_level         395
major_discipline       2393
experience               44
company_size           4051
company_type           4330
last_new_job            304
dtype: int64


In [71]:
# Sentinel target for test rows, so they can be separated again after train and test are stacked.
test['target'] = -1

In [72]:
# Stack train and test so features can be computed over both at once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat stacks the
# same rows and, like append, keeps each frame's original row labels.
full_data = pd.concat([train, test])
full_data.shape

(33380, 14)

## Exploratory Analysis

In [73]:
# Column names of the training frame.
train.columns

Index(['enrollee_id', 'city', 'city_development_index', 'gender',
       'relevent_experience', 'enrolled_university', 'education_level',
       'major_discipline', 'experience', 'company_size', 'company_type',
       'last_new_job', 'training_hours', 'target'],
      dtype='object')

In [74]:
# Column roles used by the rest of the notebook.
target_col = 'target'
id_col = ['enrollee_id']
# Every object-dtype (string) column of the training frame is treated as categorical.
cat_cols = list(train.select_dtypes(include='object').columns)

In [75]:
# Number of distinct non-missing values in each column of the stacked train + test frame.
for col, n_unique in full_data.nunique().items():
    print(f"{col}: {n_unique}")

enrollee_id: 33380
city: 123
city_development_index: 93
gender: 3
relevent_experience: 2
enrolled_university: 3
education_level: 5
major_discipline: 6
experience: 22
company_size: 8
company_type: 6
last_new_job: 6
training_hours: 241
target: 3


In [76]:
# Share of each target class in the training set.
train[target_col].value_counts(normalize=True)

0    0.867912
1    0.132088
Name: target, dtype: float64

In [77]:
# Frequency of each level of every categorical column in the training set.
for col in cat_cols:
    level_counts = train[col].value_counts()
    print(f"{col}:\n{level_counts}\n")

city:
city_103    4358
city_21     1672
city_16     1654
city_114    1472
city_160     827
            ... 
city_111       3
city_121       3
city_129       3
city_171       2
city_140       1
Name: city, Length: 123, dtype: int64

gender:
Male      12884
Female     1188
Other       189
Name: gender, dtype: int64

relevent_experience:
Has relevent experience    13596
No relevent experience      4763
Name: relevent_experience, dtype: int64

enrolled_university:
no_enrollment       13659
Full time course     3187
Part time course     1171
Name: enrolled_university, dtype: int64

education_level:
Graduate          10769
Masters            4319
High School        2032
Phd                 459
Primary School      323
Name: education_level, dtype: int64

major_discipline:
STEM               13738
Humanities           688
Other                343
Business Degree      307
Arts                 239
No Major             206
Name: major_discipline, dtype: int64

experience:
>20    3437
5      1309


In [78]:
# data_viz(train, cols = cat_cols, target='target', plot_type = 'bar_target')

## Feature Engineering

In [79]:
# First rows of the stacked train + test frame before features are added.
show_all(full_data.head(5))

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,23798,city_149,0.689,Male,Has relevent experience,no_enrollment,Graduate,STEM,3,100-500,Pvt Ltd,1,106,0
1,29166,city_83,0.923,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,<10,Funded Startup,1,69,0
2,46,city_16,0.91,,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Public Sector,2,4,0
3,18527,city_64,0.666,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,50-99,Pvt Ltd,1,26,0
4,21751,city_100,0.887,,No relevent experience,no_enrollment,Masters,STEM,8,,,2,88,1


In [80]:
# For every (categorical, numeric) pair, add the standard deviation of the numeric column
# within each category level as a new column named '<categorical>_<numeric>_std'.
# Statistics are computed on full_data, i.e. over the stacked train and test rows.
numeric_features = ['training_hours', 'city_development_index']
for cat_col in cat_cols:
    for num_col in numeric_features:
        within_level_std = full_data.groupby([cat_col])[num_col].transform('std')
        full_data[f'{cat_col}_{num_col}_std'] = within_level_std

In [81]:
# Add '<categorical>_cnt': the number of rows in full_data sharing the row's category level.
for col in cat_cols:
    rows_per_level = full_data.groupby([col])['enrollee_id'].transform('size')
    full_data[f'{col}_cnt'] = rows_per_level

## Modeling

In [82]:
import sys
sys.path.append('ml_modules/')

In [83]:
from custom_estimator import Estimator
from encoding import FreqeuncyEncoding
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import category_encoders as enc
from sklearn.metrics import roc_auc_score
from sklearn import model_selection

In [84]:
# Undo the stacking: test rows are the ones carrying the -1 sentinel target.
is_test_row = full_data[target_col] == -1
train_data = full_data[~is_test_row]
test_data = full_data[is_test_row]

In [85]:
# Alias: the modelling cells refer to the same list of categorical columns (same list object).
categorical_cols = cat_cols

In [86]:
# Frequency-encode the categorical columns: the encoder is fitted on the training rows only
# and the same fitted encoder is then applied to the test rows.
# NOTE(review): FreqeuncyEncoding comes from the local ml_modules/encoding module; how it
# treats missing or unseen levels is not visible here — confirm in that module.
fE = FreqeuncyEncoding(categorical_columns=categorical_cols, return_df=True)
train_data = fE.fit_transform(train_data)
test_data = fE.transform(test_data)

In [87]:
# Target Encoding
# Out-of-fold scheme: each training row is encoded by an encoder fitted on the other
# four folds, so a row's own target never contributes to its encoded value.
X = train_data[categorical_cols]
y = train_data[target_col].values

enc_train = np.zeros(X.shape)
smoothing = 0.3

Folds = model_selection.KFold(n_splits=5, random_state=2020, shuffle=True)

for train_idx, valid_idx in Folds.split(X):
    encoder = enc.TargetEncoder(cols=categorical_cols, smoothing=smoothing)

    encoder.fit(X.iloc[train_idx], y[train_idx])
    # Held-out rows are transformed WITHOUT their target: passing y to transform()
    # tells category_encoders the rows are training data, which is not the case here.
    enc_train[valid_idx, :] = encoder.transform(X.iloc[valid_idx]).values

# The test set is encoded by a fresh encoder fitted on all training rows.
encoder = enc.TargetEncoder(cols=categorical_cols, smoothing=smoothing)
encoder.fit(X, y)
enc_test = encoder.transform(test_data[categorical_cols]).values

# Store the encodings next to the original columns under a 'tE_' prefix.
for idx, col in enumerate(categorical_cols):
    col = 'tE_' + col
    train_data[col] = enc_train[:, idx]
    test_data[col] = enc_test[:, idx]

In [88]:
# Keep the labels aside, then remove the target and id columns from the model inputs.
drop_cols = [target_col] + id_col
y = train_data[target_col]

# `columns=` already names the axis, so `axis=1` is not needed; rebinding instead of
# inplace=True avoids mutating a frame that may share data with full_data.
train_data = train_data.drop(columns=drop_cols)
test_data = test_data.drop(columns=drop_cols)

In [89]:
# Turn +/- infinity into missing values, then replace every missing value with -999.
non_finite = [np.inf, -np.inf]
train_data = train_data.replace(non_finite, np.nan).fillna(-999)
test_data = test_data.replace(non_finite, np.nan).fillna(-999)

In [90]:
# Sanity check: share of missing values per column in the model inputs.
# Both printed Series are empty when nothing is missing.
for df in (train_data, test_data):
    missing_stats = df.isna().mean()
    has_missing = missing_stats != 0
    print(missing_stats[has_missing])

Series([], dtype: float64)
Series([], dtype: float64)


In [91]:
# Train and test model inputs must have the same number of columns.
train_data.shape, test_data.shape

((18359, 52), (15021, 52))

In [92]:
# First rows of the final training matrix (encoded categoricals plus engineered features).
show_all(train_data.head())

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,city_training_hours_std,city_city_development_index_std,gender_training_hours_std,gender_city_development_index_std,relevent_experience_training_hours_std,relevent_experience_city_development_index_std,enrolled_university_training_hours_std,enrolled_university_city_development_index_std,education_level_training_hours_std,education_level_city_development_index_std,major_discipline_training_hours_std,major_discipline_city_development_index_std,experience_training_hours_std,experience_city_development_index_std,company_size_training_hours_std,company_size_city_development_index_std,company_type_training_hours_std,company_type_city_development_index_std,last_new_job_training_hours_std,last_new_job_city_development_index_std,city_cnt,gender_cnt,relevent_experience_cnt,enrolled_university_cnt,education_level_cnt,major_discipline_cnt,experience_cnt,company_size_cnt,company_type_cnt,last_new_job_cnt,tE_city,tE_gender,tE_relevent_experience,tE_enrolled_university,tE_education_level,tE_major_discipline,tE_experience,tE_company_size,tE_company_type,tE_last_new_job
0,0.005719,0.689,0.903443,0.740563,0.758117,0.601553,0.885123,0.063333,0.198675,0.75458,0.420576,106,57.222887,0.0,60.288179,0.103698,60.491383,0.106611,60.056315,0.102845,60.499645,0.114978,60.545172,0.112573,61.449816,0.129321,60.529165,0.104624,60.341926,0.106892,60.814609,0.113584,194,23462.0,24698,24887.0,19512.0,24855.0,2166.0,4845.0,18114.0,13813.0,0.123596,0.121852,0.117198,0.116487,0.140341,0.135904,0.170758,0.099074,0.108219,0.13394
1,0.007735,0.923,0.903443,0.740563,0.758117,0.601553,0.885123,0.032896,0.100147,0.077928,0.420576,69,69.236521,0.0,60.288179,0.103698,60.491383,0.106611,60.056315,0.102845,60.499645,0.114978,60.545172,0.112573,63.212236,0.087054,58.780816,0.109552,59.472551,0.099289,60.814609,0.113584,243,23462.0,24698,24887.0,19512.0,24855.0,1080.0,2464.0,1880.0,13813.0,0.098214,0.121828,0.117155,0.116138,0.141664,0.136079,0.128151,0.10987,0.095588,0.133543
2,0.090092,0.91,0.0,0.740563,0.758117,0.601553,0.885123,0.061475,0.22975,0.074775,0.15757,4,62.901798,0.0,-999.0,-999.0,60.491383,0.106611,60.056315,0.102845,60.499645,0.114978,60.545172,0.112573,60.777722,0.119216,60.381607,0.109168,59.255864,0.091811,60.881834,0.106099,2928,-999.0,24698,24887.0,19512.0,24855.0,2039.0,5697.0,1792.0,5133.0,0.089025,0.155434,0.117155,0.116138,0.141664,0.136079,0.137363,0.09956,0.151515,0.136998
3,0.006482,0.666,0.903443,0.740563,0.758117,0.601553,0.885123,0.032896,0.22975,0.75458,0.420576,26,64.401106,0.0,60.288179,0.103698,60.491383,0.106611,60.056315,0.102845,60.499645,0.114978,60.545172,0.112573,63.212236,0.087054,60.381607,0.109168,60.341926,0.106892,60.814609,0.113584,234,23462.0,24698,24887.0,19512.0,24855.0,1080.0,5697.0,18114.0,13813.0,0.093264,0.121828,0.117155,0.116138,0.141664,0.136079,0.128151,0.09956,0.110131,0.133543
4,0.014761,0.887,0.0,0.259437,0.758117,0.241258,0.885123,0.041257,0.0,0.0,0.15757,88,56.009653,0.0,-999.0,-999.0,59.997954,0.118254,60.056315,0.102845,59.93469,0.101628,60.545172,0.112573,59.172541,0.112722,-999.0,-999.0,-999.0,-999.0,60.881834,0.106099,507,-999.0,8682,24887.0,7797.0,24855.0,1423.0,-999.0,-999.0,5133.0,0.164319,0.155095,0.175333,0.117233,0.116413,0.138087,0.109272,0.190701,0.182494,0.135253


In [93]:
# LightGBM hyper-parameters shared by the LightGBM runs in this notebook.
# n_estimators is only an upper bound: training is cut short by early stopping.
# NOTE(review): 'metric': 'custom' presumably leaves metric selection to the Estimator
# wrapper's eval_metric argument — confirm in custom_estimator.
lgb_params = dict(
    n_estimators=20000,
    learning_rate=0.01,
    boosting_type='gbdt',
    colsample_bytree=0.70,
    min_child_weight=9.0,
    num_leaves=64,
    objective='binary',
    subsample=0.70,
    subsample_freq=5,
    metric='custom',
)

# LightGBM run with random_state=100, wrapped in the project's Estimator helper.
lgb_100 = Estimator(
    LGBMClassifier(**lgb_params),
    random_state=100,
    eval_metric='AUC',
    scoring_metric=roc_auc_score,
    early_stopping_rounds=100,
    verbose=200,
)

In [94]:
# Train through the Estimator wrapper (the log below shows five early-stopped fits).
# NOTE(review): the result is presumably the out-of-fold predictions — confirm in custom_estimator.
lgb_oof_100 = lgb_100.fit_transform(train_data, y.values)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[33]	train's auc: 0.762276	valid's auc: 0.643163
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[95]	train's auc: 0.783206	valid's auc: 0.670799
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[68]	train's auc: 0.768744	valid's auc: 0.679487
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[78]	train's auc: 0.770275	valid's auc: 0.682559
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[41]	train's auc: 0.761783	valid's auc: 0.670495


In [95]:
# LightGBM run with the same hyper-parameters (lgb_params) but random_state=200.
lgb_200 = Estimator(
    LGBMClassifier(**lgb_params),
    random_state=200,
    eval_metric='AUC',
    scoring_metric=roc_auc_score,
    early_stopping_rounds=100,
    verbose=200,
)

In [96]:
# Train the random_state=200 LightGBM run through the Estimator wrapper.
# NOTE(review): the result is presumably the out-of-fold predictions — confirm in custom_estimator.
lgb_oof_200 = lgb_200.fit_transform(train_data, y.values)

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[27]	train's auc: 0.751325	valid's auc: 0.665219
Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.818324	valid's auc: 0.668564
Early stopping, best iteration is:
[138]	train's auc: 0.796342	valid's auc: 0.669918
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[39]	train's auc: 0.759946	valid's auc: 0.666222
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[32]	train's auc: 0.759066	valid's auc: 0.683627
Training until validation scores don't improve for 100 rounds
[200]	train's auc: 0.817166	valid's auc: 0.650374
Early stopping, best iteration is:
[274]	train's auc: 0.84145	valid's auc: 0.652512


In [97]:
# Average of the two LightGBM runs' avg_cv_score values.
np.mean([lgb_100.avg_cv_score, lgb_200.avg_cv_score]) 

0.6684000826936809

In [98]:
# Ten highest-ranked features reported by the random_state=100 LightGBM run.
lgb_100.feature_importances(columns=train_data.columns).head(10)

Unnamed: 0,column,feature_importance,rank
11,training_hours,0.086953,1
42,tE_city,0.073253,2
48,tE_experience,0.071015,3
25,experience_city_development_index_std,0.046604,4
0,city,0.043367,5
49,tE_company_size,0.041476,6
51,tE_last_new_job,0.040542,7
1,city_development_index,0.039063,8
7,experience,0.035246,9
12,city_training_hours_std,0.033587,10


In [99]:
# Apply both fitted LightGBM runs to the test matrix.
# NOTE(review): transform() presumably returns one prediction per test row — confirm in custom_estimator.
lgb_pred_100 = lgb_100.transform(test_data)
lgb_pred_200 = lgb_200.transform(test_data)

In [100]:
# CatBoost hyper-parameters shared by the CatBoost runs in this notebook.
# n_estimators is only an upper bound: training is cut short by early stopping.
# NOTE(review): with 'eval_metric' commented out, the training log tracks a decreasing
# loss value for early stopping rather than AUC — confirm this is intended.
ctb_params = dict(
    n_estimators=20000,
    learning_rate=0.01,
    random_strength=0.7,
    use_best_model=True,
    # 'eval_metric': 'AUC',
    depth=6,
)

# CatBoost run with random_state=100, wrapped in the project's Estimator helper.
ctb_100 = Estimator(
    CatBoostClassifier(**ctb_params),
    random_state=100,
    eval_metric='AUC',
    scoring_metric=roc_auc_score,
    early_stopping_rounds=100,
    verbose=200,
)

In [101]:
# Train the random_state=100 CatBoost run through the Estimator wrapper.
# NOTE(review): the result is presumably the out-of-fold predictions — confirm in custom_estimator.
ctb_oof_100 = ctb_100.fit_transform(train_data, y.values)

0:	learn: 0.6859502	test: 0.6860616	best: 0.6860616 (0)	total: 19.3ms	remaining: 6m 25s
200:	learn: 0.3667362	test: 0.3784344	best: 0.3784344 (200)	total: 2.11s	remaining: 3m 27s
400:	learn: 0.3514175	test: 0.3739678	best: 0.3739377 (398)	total: 4.52s	remaining: 3m 40s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3738009336
bestIteration = 439

Shrink model to first 440 iterations.
0:	learn: 0.6860061	test: 0.6859848	best: 0.6859848 (0)	total: 13.1ms	remaining: 4m 21s
200:	learn: 0.3681991	test: 0.3745699	best: 0.3745699 (200)	total: 1.86s	remaining: 3m 3s
400:	learn: 0.3533492	test: 0.3685265	best: 0.3685265 (400)	total: 3.77s	remaining: 3m 4s
600:	learn: 0.3448203	test: 0.3681790	best: 0.3681692 (569)	total: 5.77s	remaining: 3m 6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3681691947
bestIteration = 569

Shrink model to first 570 iterations.
0:	learn: 0.6859302	test: 0.6859967	best: 0.6859967 (0)	total: 13.3ms	remaining: 4m 26s
200:	l

In [102]:
# CatBoost run with the same hyper-parameters (ctb_params) but random_state=200.
ctb_200 = Estimator(
    CatBoostClassifier(**ctb_params),
    random_state=200,
    eval_metric='AUC',
    scoring_metric=roc_auc_score,
    early_stopping_rounds=100,
    verbose=200,
)

In [56]:
# Train the random_state=200 CatBoost run through the Estimator wrapper.
# NOTE(review): the result is presumably the out-of-fold predictions — confirm in custom_estimator.
ctb_oof_200 = ctb_200.fit_transform(train_data, y.values)

0:	learn: 0.6859821	test: 0.6859834	best: 0.6859834 (0)	total: 29.3ms	remaining: 9m 46s
200:	learn: 0.3671652	test: 0.3752777	best: 0.3752777 (200)	total: 2.15s	remaining: 3m 31s
400:	learn: 0.3525215	test: 0.3694392	best: 0.3694242 (395)	total: 4.43s	remaining: 3m 36s
600:	learn: 0.3444663	test: 0.3695038	best: 0.3693543 (506)	total: 6.67s	remaining: 3m 35s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3693542688
bestIteration = 506

Shrink model to first 507 iterations.
0:	learn: 0.6858968	test: 0.6859147	best: 0.6859147 (0)	total: 12.3ms	remaining: 4m 6s
200:	learn: 0.3677559	test: 0.3733452	best: 0.3733452 (200)	total: 2.4s	remaining: 3m 56s
400:	learn: 0.3532402	test: 0.3671577	best: 0.3671526 (399)	total: 4.54s	remaining: 3m 42s
600:	learn: 0.3451176	test: 0.3664449	best: 0.3663997 (582)	total: 6.79s	remaining: 3m 39s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3662101277
bestIteration = 694

Shrink model to first 695 iterations.
0:

In [57]:
# Average of the two CatBoost runs' avg_cv_score values.
np.mean([ctb_100.avg_cv_score, ctb_200.avg_cv_score])

0.6706150247176094

In [58]:
# Apply both fitted CatBoost runs to the test matrix.
# NOTE(review): transform() presumably returns one prediction per test row — confirm in custom_estimator.
ctb_pred_100 = ctb_100.transform(test_data)
ctb_pred_200 = ctb_200.transform(test_data)

In [59]:
# Ten highest-ranked features reported by the random_state=100 CatBoost run.
ctb_100.feature_importances(columns=train_data.columns).head(10)

Unnamed: 0,column,feature_importance,rank
42,tE_city,0.114315,1
1,city_development_index,0.06777,2
25,experience_city_development_index_std,0.037442,3
48,tE_experience,0.031398,4
49,tE_company_size,0.02944,5
0,city,0.029097,6
32,city_cnt,0.027533,7
27,company_size_city_development_index_std,0.026158,8
11,training_hours,0.024865,9
26,company_size_training_hours_std,0.023689,10


In [48]:
# Work on a copy so the loaded sample submission stays unchanged.
final_sub = submission.copy()

In [49]:
# Blend: element-wise, unweighted average of the four sets of test predictions.
# NOTE(review): assumes the predictions are in the same row order as the submission file — confirm.
test_predictions = [
    lgb_pred_100, lgb_pred_200,
    ctb_pred_100, ctb_pred_200,
]
final_sub[target_col] = np.mean(test_predictions, axis=0)

In [50]:
# First rows of the blended submission.
final_sub.head()

Unnamed: 0,enrollee_id,target
0,16548,0.331971
1,12036,0.193964
2,11061,0.341028
3,5032,0.200075
4,17599,0.219373


In [51]:
# Parts of the output file name: a model tag and the current local time (MMDDYYYY_HHMM).
model_name = 'LGBM_CTB'
run_time = datetime.now()
file_timestamp = run_time.strftime('%m%d%Y_%H%M')

In [52]:
# to_csv does not create missing folders, so make sure the destination exists first.
os.makedirs("output", exist_ok=True)
final_sub.to_csv(f"output/{model_name}_{file_timestamp}.csv", index=False)