## Create Sample XGB Model with Sample Dataset

In [1]:
# Import libraries:
import pandas as pd
import numpy as np
from math import log
import pickle as pk

# Let pandas render up to 200 columns when a frame is displayed:
pd.options.display.max_columns = 200

In [2]:
# Load the raw training sample from the Excel workbook:
training_file = 'sample_training_data.xlsx'
data0 = pd.read_excel(training_file)

### Feature engineering process

In [3]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

In [4]:
# Work on a copy so the raw frame (data0) is left untouched:
data = data0.copy()

# Parse both date columns, then keep the day component of their difference:
for date_col in ['date1', 'date2']:
    data[date_col] = pd.to_datetime(data[date_col])
data['date_diff'] = (data['date2'] - data['date1']).dt.days


def first_char(value):
    """Return the first character of the value's string form."""
    return str(value)[0]


# cat3_1D is the leading character of cat3 (simulates SIC-1D); missing values stay missing:
data['cat3_1D'] = data['cat3'].map(first_char, na_action = 'ignore')

# dummy_rating_cat: 1 when the rating contains 'A', -1 otherwise, NaN when the rating is missing:
rating = data['dummy_rating']
rating_flag = np.where(rating.str.contains('A'), 1, -1)
data['dummy_rating_cat'] = np.where(rating.isnull(), np.nan, rating_flag)


def log_plus_one(value):
    """Return log(value + 1) for positive values and 0 for everything else."""
    if value > 0:
        return log(value + 1)
    return 0


# num1 is re-based by subtracting 1900; num5 is log-transformed (missing values are skipped):
data['num1'] = data['num1'].sub(1900)
data['num5'] = data['num5'].map(log_plus_one, na_action = 'ignore')

In [5]:
# One-hot encode cat1, cat2 and cat3_1D, pickling each fitted encoder along the way.
# NOTE(review): `sparse=` and `get_feature_names` are the older scikit-learn spellings
# (newer releases use `sparse_output=` / `get_feature_names_out`) - confirm the pinned
# scikit-learn version before upgrading.
for col in ['cat1', 'cat2', 'cat3_1D']:
    # Stringify present values; missing ones become the literal category 'null':
    labels = data[col].map(str, na_action = 'ignore').fillna('null')
    label_matrix = labels.values.reshape(len(data), 1)

    # Fit the encoder and encode the column in a single step:
    encoder = OneHotEncoder(sparse = False, categories = 'auto')
    encoded = encoder.fit_transform(label_matrix)

    # Persist the fitted encoder:
    with open('Sample XGB Model/ohe_{}.pkl'.format(col), 'wb') as pickle_file:
        pk.dump(encoder, pickle_file)

    # Swap the raw column for its indicator columns:
    remaining = data.drop(col, axis = 1).reset_index(drop = True)
    indicator_df = pd.DataFrame(encoded, columns = encoder.get_feature_names([col]))
    data = pd.concat([remaining, indicator_df], axis = 1)

In [6]:
# PCA: reduce the ten pca* input columns to `pca_features` components.
# Each fitted transformer (imputer, scaler, PCA) is pickled after it is applied.
pca_features = 4
pca_fields = ['pca1', 'pca2', 'pca3', 'pca4', 'pca5', 'pca6', 'pca7', 'pca8', 'pca9', 'pca10']
ph_cols = data[pca_fields]

# Missing values imputation (most frequent value of each column):
missing_imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')
missing_imputer.fit(ph_cols)
ph_cols = missing_imputer.transform(ph_cols)
with open('Sample XGB Model/missing_imputer.pkl', 'wb') as pickle_file:
    pk.dump(missing_imputer, pickle_file)

# Variable scaling (PCA is fitted on the standardized columns):
variable_scaler = StandardScaler()
variable_scaler.fit(ph_cols)
ph_cols_std = variable_scaler.transform(ph_cols)
with open('Sample XGB Model/variable_scaler.pkl', 'wb') as pickle_file:
    pk.dump(variable_scaler, pickle_file)

# Create PCA features.
# random_state is pinned so that re-running the notebook yields the same fitted PCA
# (and therefore the same pickled object and pca_out* columns) whenever scikit-learn's
# default solver choice falls on the randomized SVD; exact solvers ignore it.
pca = PCA(n_components = pca_features, random_state = 0)
pca.fit(ph_cols_std)
pca_cols = pca.transform(ph_cols_std)
with open('Sample XGB Model/pca.pkl', 'wb') as pickle_file:
    pk.dump(pca, pickle_file)

# Combine with overall dataset: the raw pca* columns are replaced by pca_out0..pca_out{n-1}.
pca_cols_df = pd.DataFrame(data = pca_cols, columns = ['pca_out{}'.format(i) for i in range(pca_features)])
data = data.drop(pca_fields, axis = 1).reset_index(drop = True)
data = pd.concat([data, pca_cols_df], axis = 1)

In [7]:
# Remove the fields flagged as irrelevant and report the resulting frame shape:
irrelevant_fields = ['row_id', 'dummy_name', 'date1', 'date2', 'cat3', 'dummy_rating']
data = data.drop(irrelevant_fields, axis = 1)
print(data.shape)

(2356, 40)


### Train sample XGB model

In [8]:
import xgboost as xgb

In [9]:
# Split off the target (missing targets are treated as 0) and wrap features + label in a DMatrix:
target_col = 'target_var'
train_y = data[target_col].fillna(0)
train_x = data.drop(target_col, axis = 1)
train_dm = xgb.DMatrix(data = train_x.values, label = train_y.values, feature_names = train_x.columns)

In [10]:
# Booster settings: Poisson count objective, evaluated with Poisson negative log-likelihood.
xgb_params = dict(
    objective = 'count:poisson',
    eval_metric = 'poisson-nloglik',
    eta = 0.1,
    max_depth = 4,
    min_child_weight = 3,
    subsample = 0.8,
    colsample_bytree = 0.5,
    gamma = 0.1,
    alpha = 0.1,
    nthread = 8,
)

# Fit 50 boosting rounds on the training matrix:
xgb_mod = xgb.train(xgb_params, train_dm, num_boost_round = 50)

In [11]:
# Persist the trained booster in the same folder as the pickled preprocessing objects:
model_path = 'Sample XGB Model/sample_xgb.model'
xgb_mod.save_model(model_path)

### Try a prediction

In [13]:
# Feature order used when scoring a record:
col_order = ['num1', 'num2', 'num3', 'num4', 'num5', 'binary1', 'binary2', 'binary3', 'date_diff', 'dummy_rating_cat', 'cat1_I', 'cat1_N', 'cat1_P', 'cat1_R', 'cat1_null',
             'cat2_AMP', 'cat2_ECR', 'cat2_EP', 'cat2_FL', 'cat2_FLD', 'cat2_MFP', 'cat2_MN', 'cat2_N1C', 'cat2_PE', 'cat2_PP', 'cat3_1D_1', 'cat3_1D_2', 'cat3_1D_3',
             'cat3_1D_4', 'cat3_1D_5', 'cat3_1D_6', 'cat3_1D_7', 'cat3_1D_8', 'cat3_1D_9', 'cat3_1D_null', 'pca_out0', 'pca_out1', 'pca_out2', 'pca_out3']

# Take the second training row and arrange its columns in col_order:
rec = train_x.iloc[1:2]
rec = rec[col_order]

# Label the matrix with col_order - the order of the values actually passed in - rather
# than train_x.columns. Labelling with train_x.columns would pair values with the wrong
# feature names whenever the two orders differ, silently corrupting the prediction.
rec_dm = xgb.DMatrix(rec.values, feature_names = col_order)
xgb_mod.predict(rec_dm)

array([0.7154076], dtype=float32)

In [14]:
# Show the raw (pre-feature-engineering) row at the same position as the record scored above:
data0.iloc[1:2].to_dict(orient = 'records')

[{'binary1': 1,
  'binary2': 1,
  'binary3': 1,
  'cat1': 'R',
  'cat2': 'PP',
  'cat3': 3553.0,
  'date1': Timestamp('2018-02-05 00:00:00'),
  'date2': Timestamp('2019-02-05 00:00:00'),
  'dummy_name': '2Zl*3Nb&',
  'dummy_rating': 'A1',
  'num1': nan,
  'num2': 51.0,
  'num3': nan,
  'num4': 4.0,
  'num5': nan,
  'pca1': 1.92,
  'pca10': 1.05,
  'pca2': 1.28,
  'pca3': 1.1,
  'pca4': 1.4,
  'pca5': 1.03,
  'pca6': 1.73,
  'pca7': 1.65,
  'pca8': 1.98,
  'pca9': 1.86,
  'row_id': 2,
  'target_var': 0}]