##### prepare the model with target scaling
# Wrap the regressor so the target goes through a MinMaxScaler transformer.
# NOTE(review): `pipeline`, `TransformedTargetRegressor` and `MinMaxScaler`
# are not defined or imported in the visible part of this file - confirm.
model = TransformedTargetRegressor(regressor=pipeline, transformer=MinMaxScaler())

##### evaluate model
# Shuffled 10-fold CV with a fixed seed, scored with negated mean absolute error.
# NOTE(review): `KFold`, `cross_val_score`, `X` and `y` are not in scope at
# this point of the visible file - confirm before running this snippet.
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

[transform-target-variables-for-regression](https://machinelearningmastery.com/how-to-transform-target-variables-for-regression-with-scikit-learn/)

# installation

In [3]:
!pip install catboost
!pip install lightgbm
!pip install xgboost

Collecting catboost
  Using cached https://files.pythonhosted.org/packages/26/3e/9af57f5b02f16f69d68f287771f2c42d323a0756872917cb3bd485802487/catboost-0.23.2-cp37-none-manylinux1_x86_64.whl
Collecting plotly (from catboost)
  Using cached https://files.pythonhosted.org/packages/70/56/eabdc7b7187cdb9d6121f6de2831ad5b85f7d002fa4bfe0476dbdb554bf6/plotly-4.8.1-py2.py3-none-any.whl
Collecting graphviz (from catboost)
  Using cached https://files.pythonhosted.org/packages/83/cc/c62100906d30f95d46451c15eb407da7db201e30f42008f3643945910373/graphviz-0.14-py2.py3-none-any.whl
Collecting numpy>=1.16.0 (from catboost)
  Using cached https://files.pythonhosted.org/packages/1f/df/7988fbbdc8c9b8efb575029498ad84b77e023a3e4623e85068823a102b1d/numpy-1.18.4-cp37-cp37m-manylinux1_x86_64.whl
Collecting retrying>=1.3.3 (from plotly->catboost)
Installing collected packages: retrying, plotly, graphviz, numpy, catboost
  Found existing installation: numpy 1.15.4
    Uninstalling numpy-1.15.4:
      Successfull

# Imports

In [9]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats
from scipy.stats import norm
import seaborn as sns

# pipeline based
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config
# set_config(display='diagram')


# preprocessing based
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.impute import SimpleImputer 
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# model based
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor

# hyper-parameters
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# evaluation
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RepeatedKFold 

# data import

In [10]:
# Load the train and test tables (in that order); the first CSV column
# is used as the row index for both.
train, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('Train_extra_features.csv', 'Test_extra_features.csv')
)

In [11]:
train.head()

Unnamed: 0,Placement - Day of Month,Placement - Time,Confirmation - Time,Arrival at Pickup - Time,Pickup - Time,Distance (KM),Temperature,Pickup Lat,Pickup Long,Destination Lat,...,Pickup_hour,Pickup_minute,Pickup_second,delta-Time-Confirmation_Placement,delta-Time-Arrival-at-Pickup_Confirmation,delta-Time-Pickup_Arrival-at-Pickup,Platform Type,Personal or Business,Placement - Weekday (Mo = 1),Time from Pickup to Arrival
0,9,34546.0,34810.0,36287.0,37650.0,4,20.4,-1.317755,36.83037,-1.300406,...,10,27,30,264.0,1477.0,1363.0,Type 3,Business,Friday,745
1,12,40576.0,41001.0,42022.0,42249.0,16,26.4,-1.351453,36.899315,-1.295004,...,11,44,9,425.0,1021.0,227.0,Type 3,Personal,Friday,1993
2,30,45565.0,45764.0,46174.0,46383.0,3,,-1.308284,36.843419,-1.300921,...,12,53,3,199.0,410.0,209.0,Type 3,Business,Tuesday,455
3,15,33934.0,33965.0,34676.0,34986.0,9,19.2,-1.281301,36.832396,-1.257147,...,9,43,6,31.0,711.0,310.0,Type 3,Business,Friday,1341
4,13,35718.0,35778.0,36233.0,36323.0,9,15.4,-1.266597,36.792118,-1.295041,...,10,5,23,60.0,455.0,90.0,Type 1,Personal,Monday,1214


In [None]:
train.columns.to_list()

# Feature Engineering pipeline

# Train Test Split

In [12]:
# get features for preprocessing
# categorical: object-typed columns with fewer than 10 distinct values
categorical_features = [
    column
    for column, dtype in train.dtypes.items()
    if dtype == 'object' and train[column].nunique() < 10
]
# numerical: int64/float64 columns, leaving out the prediction target
numerical_features = [
    column
    for column, dtype in train.dtypes.items()
    if column != 'Time from Pickup to Arrival' and dtype in ('int64', 'float64')
]

In [13]:
categorical_features

['Platform Type', 'Personal or Business', 'Placement - Weekday (Mo = 1)']

In [14]:
# predictors: the numeric columns followed by the categorical ones
X = train.loc[:, numerical_features + categorical_features]
# target column
y = train.loc[:, 'Time from Pickup to Arrival']

# hold out 20% of the rows for validation, with a fixed seed
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# view X_train, X_valid dimensions
tuple(subset.shape for subset in (X_train, X_valid, y_train, y_valid))

((16959, 33), (4240, 33), (16959,), (4240,))

In [15]:
X_train.head()

Unnamed: 0,Placement - Day of Month,Placement - Time,Confirmation - Time,Arrival at Pickup - Time,Pickup - Time,Distance (KM),Temperature,Pickup Lat,Pickup Long,Destination Lat,...,Arrival at Pickup_second,Pickup_hour,Pickup_minute,Pickup_second,delta-Time-Confirmation_Placement,delta-Time-Arrival-at-Pickup_Confirmation,delta-Time-Pickup_Arrival-at-Pickup,Platform Type,Personal or Business,Placement - Weekday (Mo = 1)
11489,26,34654.0,34663.0,34813.0,35392.0,4,,-1.260093,36.808869,-1.265715,...,13,9,49,52,9.0,150.0,579.0,Type 3,Business,Friday
13451,22,49612.0,49672.0,50184.0,50758.0,10,31.1,-1.301446,36.766138,-1.252796,...,24,14,5,58,60.0,512.0,574.0,Type 3,Business,Friday
8592,12,56407.0,57242.0,59210.0,59393.0,22,21.0,-1.255189,36.782203,-1.34333,...,50,16,29,53,835.0,1968.0,183.0,Type 3,Business,Wednesday
4339,13,46149.0,46166.0,46557.0,47252.0,4,,-1.257147,36.795063,-1.274755,...,57,13,7,32,17.0,391.0,695.0,Type 3,Business,Monday
3287,20,45495.0,45515.0,46233.0,50338.0,18,27.8,-1.257147,36.795063,-1.333994,...,33,13,58,58,20.0,718.0,4105.0,Type 3,Business,Thursday


#  pipeline
---

1. Preprocessing

2. Feature Selection

3. Model Definition




# Step 1: Preprocessing pipeline
---

1. Numeric Features
    - NAN Imputation
    - Scaling : 
        - **PowerTransformation** standardize=True,   Box-Cox or Yeo-Johnson applied to various probability distributions
        - or StandardScaler

2. Categorical Features ['Platform Type','Personal or Business', 'Placement - Weekday (Mo = 1)']
    - NAN Imputation
    - Encoding: drop first, handle_unknown='ignore'
        - OneHotEncoder --> ['Personal or Business' ,'Placement - Weekday (Mo = 1)']

3. Feature Selection
     - Select From Model, VarianceThreshold

**Pipeline starts**

In [16]:
# Numeric columns: fill missing values with the column mean, then standardise.
numerical_transformer = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
])

# --------------------------------------------------------------------------------
# Categorical columns: fill missing values with the most frequent value,
# then one-hot encode with handle_unknown='ignore'.
categorical_transformer = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(handle_unknown='ignore')),
])

# --------------------------------------------------------------------------------
# Preprocessing pipeline: each column group goes through its own transformer.
# remainder='passthrough' might be removed once every feature is covered
# by one of the two groups.
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='passthrough',
)



# --------------------------------------------------------------------------------
# define a function to complete the pipeline by adding the model
def full_pipeline(model, preprocessor=preprocessor, k=10):
    """Build the full pipeline: preprocessing -> feature selection -> model.

    Parameters
    ----------
    model : estimator
        Final estimator; stored as-is under the step name ``'model'``.
    preprocessor : transformer, default=the module-level ``preprocessor``
        Preprocessing step. The default is bound once, when this function is
        defined, so every pipeline would otherwise hold the *same* object and
        fitting one pipeline would overwrite the fitted state seen by the
        others. An unfitted copy is therefore placed in each pipeline.
    k : int or 'all', default=10
        Number of features kept by ``SelectKBest`` using ``f_regression``.

    Returns
    -------
    Pipeline
        Steps are named ``'preprocessor'``, ``'selector'`` and ``'model'``.
    """
    from sklearn.base import clone

    return Pipeline(steps=[
                    # safe=False: non-estimator steps such as 'passthrough'
                    # are deep-copied instead of raising TypeError.
                    ('preprocessor', clone(preprocessor, safe=False)),
                    ('selector', SelectKBest(score_func=f_regression, k=k)),
                    ('model', model)
                ])


# Step 2: Feature Selection
---

**What we are Willing To try!**
1. VarianceThreshold
2. Select From Model
3. PCA with variance control or n_components = 2

# Step 3: Define the Model
---

**Global Constants:**
1. random_state = 42
2. RMSE
3. GridSearchCV
    - for optimal parameter hunting

---

**Baseline Model Algorithms:**
1. Random Forest Regressor
    - n_estimators = 100

---

**Alternate Algorithms:**
1. Random Forest Regressor 
    - With Tuned Parameters

2. XGBoost Regressor
    - Default
    - Tuned parameters
    
3. CatBoost Regressor
    - Default
    - Tuned parameters

4. LightGBM Regressor
    - Default
    - Tuned parameters
    
---

**Advanced Ensemble Method:**

1. Stacking Regressor

    - Definition: level-1 [ 3 models ] level-2 [ 2 models ] meta-learner [1 model] 
    - ------------------------------------
    - **experiment-0**:
        - **level-1** [ Cart, KNN, SVM ] 
        - **meta-learner** [Linear Regression] <br>
    - ------------------------------------
     - **experiment-1**:
        - **level-1** [ XGBoost, CatBoost, LightGBM] 
        - **level-2** [ XGBoost Regressor & RandomForest ]
        - **meta-learner** [RandomForest] <br>
    - ------------------------------------
2. Bagging Regressor
    - 

3. More...


In [17]:
########################################################################################
#-----------------------------Random Forest Regressor----------------------------------
########################################################################################

# Baseline forest: 100 trees, fixed seed, n_jobs=-1.
rf_base_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
)


# ------------------complete pipeline--------------------
# preprocessing + feature selection + random forest
rf_pipe = full_pipeline(model=rf_base_model)




In [18]:
########################################################################################
#-----------------------------------------XGBOOST---------------------------------------
########################################################################################

# Wide candidate search space for the XGBoost hyper-parameters.
param_grid = {
    'learning_rate': [0.01, 0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1],
    # 10, 30, 50, then every hundred from 100 up to 1000
    'n_estimators': [10, 30, 50, *range(100, 1001, 100)],
    'subsample': [0.1, 0.2, 0.3, 0.5, 0.7, 0.9, 1],
    # 10, 20, ..., 110 as plain Python ints
    'max_depth': list(range(10, 111, 10)),
    'colsample_bytree': [0.5, 0.7, 0.9, 1],
    'min_child_weight': [1, 2, 3, 4],
}

# A parameter grid search for XGBoost (smaller alternative space)
params = dict(
    min_child_weight=[1, 5, 10],
    gamma=[0.5, 1, 1.5, 2, 5],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5],
)

# XGBoost regressor with default hyper-parameters, a fixed seed,
# n_jobs=-1 and verbosity=0.
xgb_model = XGBRegressor(
    random_state=42,
    n_jobs=-1,
    verbosity=0,
)


# ------------------complete pipeline--------------------
# preprocessing + feature selection + XGBoost
xgb_pipe = full_pipeline(model=xgb_model)


# Evaluation

In [19]:
def evaluate(model, X, y):
    """Return a cross-validated RMSE figure for ``model`` on ``(X, y)``.

    The model is scored with 10-fold cross-validation repeated 3 times
    (seed 42) using negated mean squared error. The per-fold scores are
    averaged, and the square root of the absolute value of that mean is
    returned. ``error_score='raise'`` makes a failing fold raise instead of
    producing a placeholder score.
    """
    from sklearn.model_selection import RepeatedKFold, cross_val_score

    folds = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
    neg_mse_per_fold = cross_val_score(
        model, X, y,
        cv=folds,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        error_score='raise',
    )

    # scores are negated MSE values, hence abs() before the square root
    return np.sqrt(abs(neg_mse_per_fold.mean()))

In [72]:
# xgboost: cross-validated RMSE of the XGBoost pipeline on the full X, y
evaluate(xgb_pipe, X, y )

762.3910175615549

In [20]:
# random forest: cross-validated RMSE of the baseline pipeline on the full X, y
evaluate(rf_pipe, X, y )

766.1410758097359

In [None]:
import sklearn

In [None]:
sklearn.__version__