In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline

from feature_engine.imputation import (
    CategoricalImputer,
)

from feature_engine.transformation import (
    YeoJohnsonTransformer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
    OneHotEncoder,
    CountFrequencyEncoder
)

import joblib

## Read Data

In [2]:
data = pd.read_csv('../src/data/train.csv')
print(data.shape)
data.head()

(19158, 14)


Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


## Train-Test split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['enrollee_id', 'target'], axis=1),
    data['target'],
    test_size=0.2,
    random_state=0,
)

## Config

In [4]:
CAT_VARS_REPLACE_NA_WITH_STRING_MISSING = ['gender', 'major_discipline', 'company_size', 'company_type']

CAT_VARS_REPLACE_NA_WITH_FREQUENT = ['enrolled_university', 'education_level', 'experience', 'last_new_job']

NUM_VARS = ['city_development_index', 'training_hours']

NUM_VARS_YEO_JOHNSON = ['training_hours']

CAT_VARS_ORDINAL = ['relevent_experience', 'enrolled_university', 'education_level', 'major_discipline']
CAT_VARS_ORDINAL_ARBITRARY = ['city']
CAT_VARS_ONEHOT = ['gender']
CAT_VARS_COUNT_FREQUENCY = ['company_type']

EXPERIENCE_MAP = {
    '<1': 0,
    '1': 1, 
    '2': 2, 
    '3': 3, 
    '4': 4, 
    '5': 5,
    '6': 6,
    '7': 7,
    '8': 8, 
    '9': 9, 
    '10': 10, 
    '11': 11,
    '12': 12,
    '13': 13, 
    '14': 14, 
    '15': 15, 
    '16': 16,
    '17': 17,
    '18': 18,
    '19': 19, 
    '20': 20, 
    '>20': 21
} 

LAST_NEW_JOB_MAP = {
    'never': 0,
    '1': 1, 
    '2': 2, 
    '3': 3, 
    '4': 4, 
    '>4': 5
}

COMPANY_SIZE_MAP = {
    'Missing': 0,
    '<10': 1,
    '10/49': 2, 
    '100-500': 3, 
    '1000-4999': 4, 
    '10000+': 5, 
    '50-99': 6, 
    '500-999': 7, 
    '5000-9999': 8
}

## Feature Engineering Pipeline

In [5]:
fe_pipe = Pipeline([
    ('cat_imputer_missing', CategoricalImputer(imputation_method='missing', variables=CAT_VARS_REPLACE_NA_WITH_STRING_MISSING)),
    ('cat_imputer_frequent', CategoricalImputer(imputation_method='frequent', variables=CAT_VARS_REPLACE_NA_WITH_FREQUENT)),
    ('num_transformer_yeo_johnson', YeoJohnsonTransformer(variables=NUM_VARS_YEO_JOHNSON)),
    ('ordinal_encoder', OrdinalEncoder(encoding_method='ordered', variables=CAT_VARS_ORDINAL)),
    ('ordinal_encoder_arbitrary', OrdinalEncoder(encoding_method='arbitrary', variables=CAT_VARS_ORDINAL_ARBITRARY)),
    ('count_frequency_encoder', CountFrequencyEncoder(encoding_method='frequency', variables=CAT_VARS_COUNT_FREQUENCY)),
    ('onehot_encoder', OneHotEncoder(variables=cat_vars_onehot=CAT_VARS_ONEHOT)),
    
    
])

ValueError: not enough values to unpack (expected 2, got 0)

In [None]:
da