In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn import set_config
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, fbeta_score, f1_score, make_scorer, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold
import pickle

from functools import partial

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

from xgboost import XGBClassifier
from custom_transformer import NumericalTransformer
import joblib
import pickle

In [2]:
# Load the pre-split feature matrices and target frames from the local
# "data" directory (relative to the notebook's working directory).
#
# Forward slashes are used on purpose: inside a normal string literal a
# backslash starts an escape sequence, and '\X' / '\y' are invalid escapes
# that Python warns about. A backslash is also only a path separator on
# Windows, whereas '/' is accepted on every platform.
X_train = pd.read_csv('data/X_train.csv')
X_test = pd.read_csv('data/X_test.csv')
y_train = pd.read_csv('data/y_train.csv')
y_test = pd.read_csv('data/y_test.csv')

In [3]:
# Non-numeric columns whose missing values are replaced by the "XNA"
# placeholder rather than by the most frequent value.
columns_na = ['OCCUPATION_TYPE', 'HOUSETYPE_MODE', 'EMERGENCYSTATE_MODE']

# Partition the training columns by dtype: numeric vs. everything else.
numerical_col = X_train.select_dtypes(include=np.number).columns
categorical_col = X_train.select_dtypes(exclude=np.number).columns

# Split the non-numeric columns into the constant-fill group and the
# most-frequent-fill group; both lists keep the column order of X_train.
na_columns = [col for col in categorical_col if col in columns_na]
freq_columns = [col for col in categorical_col if col not in columns_na]

# One imputer per missing-value strategy.
simpleimputer_constant = SimpleImputer(strategy="constant", fill_value="XNA")
simpleimputer_frequency = SimpleImputer(strategy="most_frequent")
simpleimputer_median = SimpleImputer(strategy="median")

# Encoders and scaler. The one-hot encoder is created here but is not used
# by any of the pipelines built in this cell.
onehot_transformer = OneHotEncoder(handle_unknown="ignore")
# Categories not seen during fit are encoded as -1 instead of raising.
ordinal_transformer = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
scaler = RobustScaler()

# Per-group preprocessing pipelines. The same `ordinal_transformer` object is
# handed to both categorical pipelines.
pipeline_const_dt = make_pipeline(
    simpleimputer_constant,
    ordinal_transformer,
)
pipeline_most_frequente_dt = make_pipeline(
    simpleimputer_frequency,
    ordinal_transformer,
)
# NumericalTransformer comes from the project's custom_transformer module;
# its behavior is not visible here.
pipeline_mediane_scaler_dt = make_pipeline(
    NumericalTransformer(),
    simpleimputer_median,
    scaler,
)

In [4]:
# Route each column group to its own preprocessing pipeline:
#   - na_columns      -> constant ("XNA") imputation + ordinal encoding
#   - freq_columns    -> most-frequent imputation + ordinal encoding
#   - numerical_col   -> NumericalTransformer + median imputation + scaling
# Columns not listed in any group are dropped (ColumnTransformer's default).
preprocessor_dt = ColumnTransformer([
    ("constant_columns", pipeline_const_dt, na_columns),
    ("most_frequent_columns", pipeline_most_frequente_dt, freq_columns),
    ("nums_columns", pipeline_mediane_scaler_dt, numerical_col),
])
# Bare expression so the notebook renders the transformer's repr.
preprocessor_dt

In [5]:
# Fixed hyperparameters for the XGBoost classifier.
# NOTE(review): the variable name suggests these came from an AUC-oriented
# search — the search itself is not part of this notebook; confirm the source.
xgboost_auc = dict(
    colsample_bytree=0.6,
    gamma=0.0,
    learning_rate=0.1,
    max_depth=4,
    min_child_weight=2,
    n_estimators=300,
    reg_alpha=100,
    reg_lambda=0.1,
    scale_pos_weight=5,
    seed=0,
    subsample=0.8,
)

# Chain the column preprocessing and the classifier so that a single object
# can be fitted, persisted and reused for prediction.
model = XGBClassifier(**xgboost_auc)
pipeline = make_pipeline(preprocessor_dt, model)

In [6]:
# Fit preprocessing and classifier together on the training split; y_train is
# a DataFrame, so the label column "TARGET" is selected explicitly.
pipeline.fit(X_train, y_train["TARGET"])

In [8]:
# Persist the fitted pipeline (preprocessing + model) to the working directory.
joblib.dump(value=pipeline, filename='pipeline-xgboost-scoring')

['pipeline-xgboost-scoring']

In [9]:
# Feature names listed in the same order as the ColumnTransformer's groups:
# constant-fill columns, most-frequent-fill columns, then numeric columns.
# NOTE(review): this assumes every pipeline emits exactly one output column per
# input column — NumericalTransformer is defined elsewhere; confirm.
features_names = [*na_columns, *freq_columns, *numerical_col.tolist()]

In [10]:
# Save the ordered feature-name list next to the model so consumers of the
# pipeline can label its inputs.
with open('columns_name.pickle', mode='wb') as fh:
    pickle.dump(features_names, fh)