In [7]:
import pandas as pd
import numpy as np
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

from feature_engine.imputation import (
    CategoricalImputer,
)

from feature_engine.transformation import (
    YeoJohnsonTransformer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
    OneHotEncoder,
    CountFrequencyEncoder
)

import joblib

import preprocess as pp

## Read Data

In [21]:
data = pd.read_csv('../src/data/train.csv')
print(data.shape)
data.head()

(19158, 14)


Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


## Train-Test split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['enrollee_id', 'target'], axis=1),
    data['target'],
    test_size=0.2,
    random_state=0,
)

## Config

In [11]:
CAT_VARS_REPLACE_NA_WITH_STRING_MISSING = ['gender', 'major_discipline', 'company_size', 'company_type']

CAT_VARS_REPLACE_NA_WITH_FREQUENT = ['enrolled_university', 'education_level', 'experience', 'last_new_job']

NUM_VARS = ['city_development_index', 'training_hours']

NUM_VARS_YEO_JOHNSON = ['training_hours']

CAT_VARS_ORDINAL = ['relevent_experience', 'enrolled_university', 'education_level', 'major_discipline']
CAT_VARS_ORDINAL_ARBITRARY = ['city']
CAT_VARS_ONEHOT = ['gender']
CAT_VARS_COUNT_FREQUENCY = ['company_type']

EXPERIENCE_VAR = ['experience']

EXPERIENCE_MAP = {
    '<1': 0,
    '1': 1, 
    '2': 2, 
    '3': 3, 
    '4': 4, 
    '5': 5,
    '6': 6,
    '7': 7,
    '8': 8, 
    '9': 9, 
    '10': 10, 
    '11': 11,
    '12': 12,
    '13': 13, 
    '14': 14, 
    '15': 15, 
    '16': 16,
    '17': 17,
    '18': 18,
    '19': 19, 
    '20': 20, 
    '>20': 21
} 
LAST_NEW_JOB_VAR = ['last_new_job']

LAST_NEW_JOB_MAP = {
    'never': 0,
    '1': 1, 
    '2': 2, 
    '3': 3, 
    '4': 4, 
    '>4': 5
}

COMPANY_SIZE_VAR = ['company_size']

COMPANY_SIZE_MAP = {
    'Missing': 0,
    '<10': 1,
    '10/49': 2, 
    '100-500': 3, 
    '1000-4999': 4, 
    '10000+': 5, 
    '50-99': 6, 
    '500-999': 7, 
    '5000-9999': 8
}

## Feature Engineering Pipeline

In [23]:
fe_pipe = Pipeline([
    ('cat_imputer_missing', CategoricalImputer(imputation_method='missing', variables=CAT_VARS_REPLACE_NA_WITH_STRING_MISSING)),
    ('cat_imputer_frequent', CategoricalImputer(imputation_method='frequent', variables=CAT_VARS_REPLACE_NA_WITH_FREQUENT)),
    ('num_transformer_yeo_johnson', YeoJohnsonTransformer(variables=NUM_VARS_YEO_JOHNSON)),
    ('ordinal_encoder', OrdinalEncoder(encoding_method='ordered', variables=CAT_VARS_ORDINAL)),
    ('ordinal_encoder_arbitrary', OrdinalEncoder(encoding_method='arbitrary', variables=CAT_VARS_ORDINAL_ARBITRARY)),
    ('count_frequency_encoder', CountFrequencyEncoder(encoding_method='frequency', variables=CAT_VARS_COUNT_FREQUENCY)),
    ('onehot_encoder', OneHotEncoder(variables=CAT_VARS_ONEHOT)),
    ('experience_map', pp.Mapper(variables=EXPERIENCE_VAR, mappings=EXPERIENCE_MAP)),
    ('last_new_job_map', pp.Mapper(variables=LAST_NEW_JOB_VAR, mappings=LAST_NEW_JOB_MAP)),
    ('company_size_map', pp.Mapper(variables=COMPANY_SIZE_VAR, mappings=COMPANY_SIZE_MAP)),
    # ('min_max_scaler', MinMaxScaler())
])

In [24]:
X_train.head()

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
19147,city_21,0.624,Male,No relevent experience,Full time course,Graduate,STEM,1,100-500,Pvt Ltd,1,52
8464,city_21,0.624,,Has relevent experience,Full time course,Graduate,STEM,<1,<10,Pvt Ltd,,92
8869,city_16,0.91,Male,Has relevent experience,no_enrollment,Masters,STEM,9,,Pvt Ltd,1,36
11645,city_118,0.722,,Has relevent experience,Part time course,Masters,STEM,10,1000-4999,Pvt Ltd,3,19
7743,city_103,0.92,,No relevent experience,no_enrollment,Primary School,,2,,,never,69


In [25]:
fe_pipe.fit(X_train, y_train)

Pipeline(steps=[('cat_imputer_missing',
                 CategoricalImputer(variables=['gender', 'major_discipline',
                                               'company_size',
                                               'company_type'])),
                ('cat_imputer_frequent',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['enrolled_university',
                                               'education_level', 'experience',
                                               'last_new_job'])),
                ('num_transformer_yeo_johnson',
                 YeoJohnsonTransformer(variables=['t...
                                  '20': 20, '3': 3, '4': 4, '5': 5, '6': 6,
                                  '7': 7, '8': 8, '9': 9, '<1': 0, '>20': 21},
                        variables=['experience'])),
                ('last_new_job_map',
                 Mapper(mappings={'1': 1, '2': 2, '3': 3, '4': 4, '>4': 5,
        

In [26]:
X_train = fe_pipe.transform(X_train)
X_test = fe_pipe.transform(X_test)

In [27]:
X_train

Unnamed: 0,city,city_development_index,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,gender_Male,gender_Missing,gender_Other,gender_Female
19147,0,0.624,1,2,4,4,1,3,0.514746,1,5.371921,1,0,0,0
8464,0,0.624,0,2,4,4,0,1,0.514746,1,6.415291,0,1,0,0
8869,1,0.910,0,0,3,4,9,0,0.514746,1,4.748399,1,0,0,0
11645,2,0.722,0,1,3,4,10,4,0.514746,3,3.753794,0,1,0,0
7743,3,0.920,1,0,1,0,2,0,0.319522,0,5.877477,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9225,31,0.479,1,2,4,4,10,0,0.319522,5,4.702184,1,0,0,0
13123,0,0.624,0,0,4,4,6,6,0.514746,1,3.829470,1,0,0,0
9845,3,0.920,0,0,4,4,6,6,0.514746,0,5.232979,0,0,1,0
10799,0,0.624,1,0,4,4,2,2,0.319522,1,6.030879,0,1,0,0
