# In this notebook, we show how to run GridSearchCV or RandomizedSearchCV on a LazyTransformer pipeline

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
import statsmodels.api as sm
import matplotlib.pyplot as plt
from patsy import dmatrices

# Load the "fair" dataset that ships with statsmodels as a pandas DataFrame.
dta = sm.datasets.fair.load_pandas().data

# Binary label: 1 when the raw `affairs` value is positive, otherwise 0.
dta['affair'] = (dta['affairs'] > 0).astype(int)

# Drop the raw `affairs` column, shuffle every row with a fixed seed, and cast
# both occupation columns to strings (they then show up as object dtype).
df = (
    dta.drop(columns='affairs')
    .sample(frac=1.0, random_state=0)
    .astype({'occupation': str, 'occupation_husb': str})
)
print(df.shape)
df.head()

(6366, 9)


Unnamed: 0,rate_marriage,age,yrs_married,children,religious,educ,occupation,occupation_husb,affair
2764,3.0,27.0,9.0,2.0,1.0,14.0,3.0,5.0,0
4481,5.0,22.0,2.5,0.0,4.0,16.0,3.0,5.0,0
5360,4.0,27.0,6.0,0.0,2.0,17.0,4.0,4.0,0
5802,5.0,42.0,23.0,2.0,3.0,17.0,4.0,2.0,0
1220,4.0,27.0,6.0,1.0,3.0,14.0,3.0,2.0,1


In [3]:
from lazytransform import LazyTransformer

Imported LazyTransformer version:0.31. Call by using:
    lazy = LazyTransformer(model=None, encoders='auto', scalers=None, 
        date_to_string=False, transform_target=False, imbalanced=False)
    ### if you are not using a model in pipeline, you must use fit and transform ##
        X_trainm, y_trainm = lazy.fit_transform(X_train, y_train)
        X_testm = lazy.transform(X_test)
    ### If using a model in pipeline, use fit and predict only ###
        lazy.fit(X_train, y_train)
        lazy.predict(X_test)



In [4]:
# Name of the label column; 'affair' is the binary flag derived from the raw affairs count.
target = 'affair'
# Problem type; later cells compare this against 'Regression' to choose the split, model and metric.
modeltype = 'Classification'

In [5]:
from sklearn import set_config
# Ask scikit-learn to render estimators and pipelines as a diagram when they
# are displayed in a Jupyter notebook cell.
set_config(display="diagram")

In [6]:
# `target` may be a single column name or a collection of names; normalise it
# to a collection so that one membership test covers both cases.
target_names = [target] if isinstance(target, str) else target

# Every column that is not a target is used as a feature.
cols = [column for column in df.columns if column not in target_names]
X = df[cols]
y = df[target]
df.dtypes

rate_marriage      float64
age                float64
yrs_married        float64
children           float64
religious          float64
educ               float64
occupation          object
occupation_husb     object
affair               int32
dtype: object

In [7]:
# For classification, remove rows whose class occurs at most once in y.
# The later split is stratified on y, and stratification needs at least two
# samples of every class.
print(X.shape, y.shape)
if modeltype != 'Regression':
    class_counts = y.value_counts()
    low_counts = class_counts[class_counts <= 1].index
    print(len(low_counts))
    # Build the row mask once from the unfiltered y and apply it to both objects.
    keep_rows = ~y.isin(low_counts)
    X = X[keep_rows]
    y = y[keep_rows]
X.shape, y.shape

(6366, 8) (6366,)
0


((6366, 8), (6366,))

In [8]:
# Fraction of the rows to hold out as the test set.
test_size = 0.3
# Seed passed to train_test_split so that the split is reproducible.
random_state = 0

In [9]:
from sklearn.model_selection import train_test_split
# Hold out `test_size` of the rows for evaluation, using the configured
# `test_size` and `random_state` rather than a hard-coded 0.3, so that editing
# the configuration cell actually changes the split.
# Classification splits are stratified on y; regression splits are not
# (stratify=None is train_test_split's default, i.e. no stratification).
stratify_on = None if modeltype == 'Regression' else y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, stratify=stratify_on, random_state=random_state)
print(X_train.shape, X_test.shape)

(4456, 8) (1910, 8)


In [10]:
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.linear_model import LogisticRegression
# Settings shared by both LightGBM estimators.
lgbm_kwargs = dict(n_estimators=100, n_jobs=-1, random_state=99)

# Choose the estimator that matches the problem type; only the classifier is
# created with verbose=-1.
if modeltype != 'Regression':
    lgb = LGBMClassifier(verbose=-1, **lgbm_kwargs)
    # Alternative: use LogisticRegression() here to try a linear model instead.
else:
    lgb = LGBMRegressor(**lgbm_kwargs)

In [11]:
# Wrap the model in a LazyTransformer pipeline. Nothing is fitted at this
# point; the pipeline has to be fitted before it can be scored on the
# hold-out dataset.
lazy = LazyTransformer(
    model=lgb,
    encoders='target',
    scalers='maxabs',
    date_to_string=False,
    transform_target=True,
    imbalanced=False,
)

# LazyTransformer already has a grid search function for LightGBM that will grid search the lazy pipeline for best params

In [12]:
# Grid-search the LightGBM hyper-parameters across the whole lazy pipeline on
# the training split. The call returns a new pipeline that already contains
# the best model trained on the training data.
# NOTE(review): params={} presumably selects the library's built-in grid - confirm.
newpipe = lazy.lightgbm_grid_search(
    X_train,
    y_train,
    modeltype,
    params={},
    grid_search=True,
    multi_label=False,
    log_y=False,
    gpu_flag=False,
)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
#### Single_Label Binary_Classification problem ####
Shape of dataset: (4456, 8). Now we classify variables into different types...
    no date time variables detected in this dataset
Time taken to define data pipeline = 1 second(s)
model pipeline fitted with LGBMClassifier model
Mean cross-validated train precision = 0.8060
    Mean cross-validated test precision = 0.5662
Time taken for Hyper Param tuning of LGBM (in minutes) = 0.2
Best params from search:
{'model__class_weight': None, 'model__learning_rate': 0.001, 'model__n_estimators': 150}
Returning a new LazyTransformer pipeline that contains the best model trained on your train dataset!


# Once the best params are found, it fits the pipeline and returns a new pipeline with the best-performing model. All in one step!

In [23]:
# Display the tuned pipeline returned by the grid search.
newpipe

In [13]:
# Look at the estimator stored under the pipeline's 'model' step.
newpipe.named_steps['model']

# Since the new pipeline has already been fitted, you can immediately predict

In [14]:
# Predict on the hold-out features with the tuned pipeline and show the predicted labels.
predictions1 = newpipe.predict(X_test)
predictions1

array([0, 0, 0, ..., 0, 0, 0])

In [15]:
from sklearn.metrics import balanced_accuracy_score, mean_squared_error, classification_report
# Score the tuned pipeline on the hold-out labels: a per-class
# precision/recall/F1 report for classification, RMSE for regression.
y_true = y_test.values
if modeltype != 'Regression':
    print(classification_report(y_true, predictions1))
else:
    rmse = np.sqrt(mean_squared_error(y_true, predictions1))
    print('RMSE = %0.2f' % rmse)

              precision    recall  f1-score   support

           0       0.76      0.84      0.80      1294
           1       0.56      0.43      0.49       616

    accuracy                           0.71      1910
   macro avg       0.66      0.64      0.64      1910
weighted avg       0.69      0.71      0.70      1910



# Let's compare the best model against an untuned lazy pipeline with a simple but powerful model like Logistic Regression

In [16]:
# Baseline for comparison: the same LazyTransformer settings as before, but
# wrapped around a default LogisticRegression. Note that this rebinds both
# `lgb` and `lazy`, so the earlier LightGBM objects are no longer reachable
# under those names.
lgb = LogisticRegression()
lazy = LazyTransformer(
    model=lgb,
    encoders='target',
    scalers='maxabs',
    date_to_string=False,
    transform_target=True,
    imbalanced=False,
)

In [17]:
# `fitted` is still False here: the pipeline has been constructed but fit() has not been called yet.
lazy.fitted

False

In [18]:
# Fit the untuned baseline pipeline on the training split.
lazy.fit(X_train, y_train)

#### Single_Label Binary_Classification problem ####
Shape of dataset: (4456, 8). Now we classify variables into different types...
    no date time variables detected in this dataset
Time taken to define data pipeline = 1 second(s)
model pipeline fitted with LogisticRegression model


<lazytransform.LazyTransformer at 0x29289070e80>

In [19]:
# Predict on the same hold-out features with the untuned baseline pipeline.
predictions2 = lazy.predict(X_test)
predictions2

array([0, 0, 0, ..., 0, 0, 0])

In [20]:
from sklearn.metrics import balanced_accuracy_score, mean_squared_error, classification_report
# Score the untuned baseline on the hold-out labels with the same metric
# choice used for the tuned pipeline.
y_true = y_test.values
if modeltype != 'Regression':
    print(classification_report(y_true, predictions2))
else:
    rmse = np.sqrt(mean_squared_error(y_true, predictions2))
    print('RMSE = %0.2f' % rmse)

              precision    recall  f1-score   support

           0       0.74      0.89      0.81      1294
           1       0.60      0.36      0.45       616

    accuracy                           0.72      1910
   macro avg       0.67      0.62      0.63      1910
weighted avg       0.70      0.72      0.69      1910



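# Comparing the two reports on the hold-out set: the tuned LightGBM pipeline has better recall (0.43 vs 0.36) and F1 (0.49 vs 0.45) on the positive class, while the untuned Logistic Regression pipeline has slightly higher positive-class precision (0.60 vs 0.56) and accuracy (0.72 vs 0.71). So on this dataset, grid searching mainly improved recall rather than precision. This is how GridSearching the entire Lazy pipeline works. Hope you enjoyed it!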

In [22]:
# Inspect the estimator under the 'model' step of the pipeline that the fitted LazyTransformer exposes as `.model`.
lazy.model.named_steps['model']