In [9]:
import pandas as pd
import numpy as np
import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from feature_engine.imputation import (
    CategoricalImputer,
)

from feature_engine.transformation import (
    YeoJohnsonTransformer,
)

from feature_engine.encoding import (
    RareLabelEncoder,
    OrdinalEncoder,
    OneHotEncoder,
    CountFrequencyEncoder
)

import joblib

import preprocess as pp

## Read Data

In [2]:
data = pd.read_csv('../src/data/train.csv')
print(data.shape)
data.head()

(19158, 14)


Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


## Train-Test Split

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(['enrollee_id', 'target'], axis=1),
    data['target'],
    test_size=0.2,
    random_state=0,
)

## Config

In [4]:
CAT_VARS_REPLACE_NA_WITH_STRING_MISSING = ['gender', 'major_discipline', 'company_size', 'company_type']

CAT_VARS_REPLACE_NA_WITH_FREQUENT = ['enrolled_university', 'education_level', 'experience', 'last_new_job']

NUM_VARS = ['city_development_index', 'training_hours']

NUM_VARS_YEO_JOHNSON = ['training_hours']

CAT_VARS_ORDINAL = ['relevent_experience', 'enrolled_university', 'education_level', 'major_discipline']
CAT_VARS_ORDINAL_ARBITRARY = ['city']
CAT_VARS_ONEHOT = ['gender']
CAT_VARS_COUNT_FREQUENCY = ['company_type']

EXPERIENCE_VAR = ['experience']

EXPERIENCE_MAP = {
    '<1': 0,
    '1': 1, 
    '2': 2, 
    '3': 3, 
    '4': 4, 
    '5': 5,
    '6': 6,
    '7': 7,
    '8': 8, 
    '9': 9, 
    '10': 10, 
    '11': 11,
    '12': 12,
    '13': 13, 
    '14': 14, 
    '15': 15, 
    '16': 16,
    '17': 17,
    '18': 18,
    '19': 19, 
    '20': 20, 
    '>20': 21
} 
LAST_NEW_JOB_VAR = ['last_new_job']

LAST_NEW_JOB_MAP = {
    'never': 0,
    '1': 1, 
    '2': 2, 
    '3': 3, 
    '4': 4, 
    '>4': 5
}

COMPANY_SIZE_VAR = ['company_size']

COMPANY_SIZE_MAP = {
    'Missing': 0,
    '<10': 1,
    '10/49': 2, 
    '100-500': 3, 
    '1000-4999': 4, 
    '10000+': 5, 
    '50-99': 6, 
    '500-999': 7, 
    '5000-9999': 8
}

## Pipeline

In [7]:
pipe = Pipeline([
    ('cat_imputer_missing', CategoricalImputer(imputation_method='missing', variables=CAT_VARS_REPLACE_NA_WITH_STRING_MISSING)),
    ('cat_imputer_frequent', CategoricalImputer(imputation_method='frequent', variables=CAT_VARS_REPLACE_NA_WITH_FREQUENT)),
    ('num_transformer_yeo_johnson', YeoJohnsonTransformer(variables=NUM_VARS_YEO_JOHNSON)),
    ('ordinal_encoder', OrdinalEncoder(encoding_method='ordered', variables=CAT_VARS_ORDINAL)),
    ('ordinal_encoder_arbitrary', OrdinalEncoder(encoding_method='arbitrary', variables=CAT_VARS_ORDINAL_ARBITRARY)),
    ('count_frequency_encoder', CountFrequencyEncoder(encoding_method='frequency', variables=CAT_VARS_COUNT_FREQUENCY)),
    ('onehot_encoder', OneHotEncoder(variables=CAT_VARS_ONEHOT)),
    ('experience_map', pp.Mapper(variables=EXPERIENCE_VAR, mappings=EXPERIENCE_MAP)),
    ('last_new_job_map', pp.Mapper(variables=LAST_NEW_JOB_VAR, mappings=LAST_NEW_JOB_MAP)),
    ('company_size_map', pp.Mapper(variables=COMPANY_SIZE_VAR, mappings=COMPANY_SIZE_MAP)),
    ('min_max_scaler', MinMaxScaler()),
    
    ('logistic_regression', LogisticRegression(random_state=0))
])

In [8]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('cat_imputer_missing',
                 CategoricalImputer(variables=['gender', 'major_discipline',
                                               'company_size',
                                               'company_type'])),
                ('cat_imputer_frequent',
                 CategoricalImputer(imputation_method='frequent',
                                    variables=['enrolled_university',
                                               'education_level', 'experience',
                                               'last_new_job'])),
                ('num_transformer_yeo_johnson',
                 YeoJohnsonTransformer(variables=['t...
                 Mapper(mappings={'1': 1, '2': 2, '3': 3, '4': 4, '>4': 5,
                                  'never': 0},
                        variables=['last_new_job'])),
                ('company_size_map',
                 Mapper(mappings={'10/49': 2, '100-500': 3, '1000-4999': 4,
                                  '100

In [10]:
preds = pipe.predict(X_test)

In [11]:
accuracy_score(y_test, preds)

0.7713987473903967

## Save the pipe

In [12]:
joblib.dump(pipe, 'pipe.joblib') 

['pipe.joblib']

## Score new data

In [16]:
new_data = pd.read_csv('../src/data/test.csv')

In [18]:
new_data = new_data.drop(['enrollee_id'], axis=1)
new_data

Unnamed: 0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours
0,city_41,0.827,Male,Has relevent experience,Full time course,Graduate,STEM,9,<10,,1,21
1,city_103,0.920,Female,Has relevent experience,no_enrollment,Graduate,STEM,5,,Pvt Ltd,1,98
2,city_21,0.624,Male,No relevent experience,no_enrollment,High School,,<1,,Pvt Ltd,never,15
3,city_13,0.827,Male,Has relevent experience,no_enrollment,Masters,STEM,11,10/49,Pvt Ltd,1,39
4,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,10000+,Pvt Ltd,>4,72
...,...,...,...,...,...,...,...,...,...,...,...,...
2124,city_103,0.920,Male,No relevent experience,no_enrollment,Graduate,Humanities,16,,Public Sector,4,15
2125,city_136,0.897,Male,Has relevent experience,no_enrollment,Masters,STEM,18,,,2,30
2126,city_100,0.887,Male,No relevent experience,no_enrollment,Primary School,,3,,Pvt Ltd,never,18
2127,city_102,0.804,Male,Has relevent experience,Full time course,High School,,7,100-500,Public Sector,1,84


In [20]:
new_preds = pipe.predict(new_data)

In [21]:
new_preds

array([0., 0., 1., ..., 0., 0., 0.])

TODO:
    
- hyperparameter tuning
- multiple algorithms
- sklearn similar projects
- optuna