# This notebook is an example of using the sklearn pipeline together with the add-ons we created for it.

## 1.Imports

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as ss
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import dill
from sklearn.pipeline import make_pipeline
from main_pipeline import ObjectsColumnaAsType, PandasImputer, PandasStandardScaler,TypeSelector
from pandas_feature_union import PandasFeatureUnion
from pandas_OneHotEncoder import OneHotEncoder
# from data_inspection import *
from preprocessing_pipeline import *
from classification_metrics import *
pd.options.mode.chained_assignment = None



In [2]:
# Load the modelling dataset, using the `_id` column as the row index.
input_data = pd.read_csv('data/users_dataset_for_models.csv', index_col='_id')

# Separate the target from the features, then hold out 10% of the rows for testing.
y = input_data['label']
X = input_data.drop(columns=['label'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

## 2. Preprocessing pipeline
Each column in the input data goes into a column selector and then into another class that preprocesses the vector.
Columns that do not need preprocessing go into the `other_cols` variable.

In [3]:
# Independent copy of the loaded frame; none of the visible cells below read `data`.
data = input_data.copy()
def apply_log(x, log=np.log):
    """Shift ``x`` by one and apply ``log`` to the shifted value.

    Parameters
    ----------
    x : any value that supports ``1 + x`` and is accepted by ``log``.
    log : callable, default ``np.log``
        Function applied to ``1 + x``.

    Returns
    -------
    The result of ``log(1 + x)``.
    """
    shifted = 1 + x
    return log(shifted)

# Each feature gets its own two-step pipeline: pick the raw column, then transform it.
gender = make_pipeline(
    ColumnSelector('gender'),
    GenderColumnInOptions(),
)
name_language = make_pipeline(
    ColumnSelector('name_language'),
    NameLanguageColumnInOptions(),
)

# Flexibility example: the generic ColumnInOptions is given an explicit list of options.
hometown_region = make_pipeline(
    ColumnSelector('hometown_region'),
    ColumnInOptions([
        'Central District',
        'Tel Aviv',
        'Southern District',
        'Heifa',
        'Jerusalem',
    ]),
)
phone_exists = make_pipeline(
    ColumnSelector('phone'),
    ColumnExist(),
)

# Flexibility example: ColumnApplyFunc is handed an arbitrary function (here apply_log).
log_likes = make_pipeline(
    ColumnSelector('likes_count'),
    ColumnApplyFunc(apply_log),
)
log_friends = make_pipeline(
    ColumnSelector('friends_count'),
    ColumnApplyFunc(apply_log),
)

# Columns that get no dedicated per-column pipeline.
other_cols = ['age']

## 3. Grouping the previous pipelines into one pipeline - the preprocess pipeline
`PandasFeatureUnion` unites the different vectors into a DataFrame

In [4]:
# Run every per-column pipeline side by side and join their outputs;
# the columns in `other_cols` are only selected, with no further transformer.
pre_process = make_pipeline(
    PandasFeatureUnion(
        transformer_list=[
            ("gender", gender),
            ("name_language", name_language),
            ("hometown_region", hometown_region),
            ("phone_exists", phone_exists),
            ("log_likes", log_likes),
            ("log_friends", log_friends),
            ("other_columns", ColumnSelector(columns=other_cols)),
        ]
    )
)

## Normalizers
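After the preprocess pipeline, each type of variable {numeric, boolean, categorical}
goes through:
    
    imputer - fills missing values (numeric: mean, categorical: most frequent value)
    normalizer - numeric vectors are scaled to a standard normal distribution,
        categorical variables are one-hot encoded,
        boolean variables are only selected and passed through unchanged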

In [5]:
# Full preprocessing: build the feature frame, fix column dtypes, then treat
# each dtype family in its own branch and join the branches back together.
preprocess_pipeline = make_pipeline(
    pre_process,
    # NOTE(review): presumably converts object columns to a typed dtype so the
    # TypeSelector branches below can pick them up - confirm in main_pipeline.
    ObjectsColumnaAsType(),
    PandasFeatureUnion(
        transformer_list=[
            (
                "numeric_features",
                make_pipeline(
                    TypeSelector(np.number),
                    PandasImputer(strategy="mean"),
                    PandasStandardScaler(),
                ),
            ),
            (
                "categorical_features",
                make_pipeline(
                    TypeSelector("category"),
                    PandasImputer(strategy="most_frequent"),
                    OneHotEncoder(),
                ),
            ),
            (
                "boolean_features",
                make_pipeline(
                    TypeSelector("bool"),
                ),
            ),
        ]
    ),
)

# Fit on the raw training frame, then replace X_train with its transformed version.
# NOTE: X_train is overwritten here, so re-running this cell without re-running
# the train/test split cell feeds already-transformed data back into the pipeline.
X_train = preprocess_pipeline.fit(X_train).transform(X_train)

Running a wrapper around sklearn grid search - searching for the best hyperparameters.
The code was taken from -

http://www.davidsbatista.net/blog/2018/02/23/model_optimization/

In [6]:
from grid_search_helper import *

# Candidate estimators, keyed by a short name that also indexes `params`.
models = {'lr': LogisticRegression()}

# Two parameter grids for the logistic regression (4 candidates in total).
params = {
    'lr': [
        {'solver': ['lbfgs'], 'C': [2]},
        {'solver': ['lbfgs'], 'C': [0.1, 0.2, 1]},
    ],
}

helper = EstimatorSelectionHelper(models, params)
# X_train is the already-preprocessed training frame at this point.
best_model = helper.fit(
    X_train,
    y_train,
    scoring='roc_auc',
    n_jobs=8,
    cv=5,
)
helper.score_summary(sort_by='max_score')

Running GridSearchCV for lr.
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  20 out of  20 | elapsed:   11.5s finished


lr


Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,C,solver
3,lr,0.873447,0.875477,0.877322,0.0015759,1.0,lbfgs
0,lr,0.873447,0.875477,0.877322,0.00157585,2.0,lbfgs
2,lr,0.873448,0.875477,0.877322,0.00157489,0.2,lbfgs
1,lr,0.873449,0.875476,0.877319,0.00157317,0.1,lbfgs


## Creating the final pipeline - adding the best model to the preprocess pipeline

In [7]:
# Chain the preprocessing pipeline with the model returned by the grid-search helper.
final_pipeline = make_pipeline(preprocess_pipeline , best_model)
# now we have a prediction machine that takes raw data and provides a label/probability
# X_test has not been transformed anywhere above, so preprocessing happens inside predict().
final_pipeline.predict(X_test)

array([1, 1, 1, ..., 0, 1, 1], dtype=int64)

#### It is possible to access internal data from the pipeline - this is quite complex and takes some trial and error

In [8]:
# Accessing objects inside the pipelines
# NOTE: a cell only renders its last expression, so the value on the next line
# (the `mean_` attribute reached through the scaler step) is computed but not displayed.
preprocess_pipeline.named_steps['pandasfeatureunion'].get_params()['numeric_features__pandasstandardscaler'].transformer.mean_
# `values` attribute of the ColumnInOptions step of the hometown_region pipeline.
hometown_region.named_steps['columninoptions'].values

['Central District', 'Tel Aviv', 'Southern District', 'Heifa', 'Jerusalem']

## Dumping the model

In [9]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in 0.23,
# so import the standalone package first and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

filename = 'finalized_model.sav'
# Persist the whole pipeline (preprocessing + model) to disk in one file.
joblib.dump(final_pipeline, filename)

['finalized_model.sav']

## Example for using the model we created on new data

In [10]:
# some time later...
# load the model from disk
loaded_model = joblib.load(filename)
loaded_model.predict(X_test)

array([1, 1, 1, ..., 0, 1, 1], dtype=int64)