# Data 
Link: https://archive.ics.uci.edu/ml/datasets/in-vehicle+coupon+recommendation

## What does it say?
This data was collected via a survey on Amazon Mechanical Turk. The survey describes different driving scenarios including the destination, current time, weather, passenger, etc., and then ask the person whether he will accept the coupon if he is the driver.

## Task
- Classification (Binary)

In [1]:
# Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
data = pd.read_csv("drive/My Drive/colab notebooks/in-vehicle-coupon-recommendation.csv")

In [3]:
data

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,has_children,education,occupation,income,car,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,Home,Partner,Rainy,55,6PM,Carry out & Take away,1d,Male,26,Single,0,Bachelors degree,Sales & Related,$75000 - $87499,,never,never,1~3,4~8,1~3,1,0,0,1,0,1
12680,Work,Alone,Rainy,55,7AM,Carry out & Take away,1d,Male,26,Single,0,Bachelors degree,Sales & Related,$75000 - $87499,,never,never,1~3,4~8,1~3,1,0,0,0,1,1
12681,Work,Alone,Snowy,30,7AM,Coffee House,1d,Male,26,Single,0,Bachelors degree,Sales & Related,$75000 - $87499,,never,never,1~3,4~8,1~3,1,0,0,1,0,0
12682,Work,Alone,Snowy,30,7AM,Bar,1d,Male,26,Single,0,Bachelors degree,Sales & Related,$75000 - $87499,,never,never,1~3,4~8,1~3,1,1,1,0,1,0


In [4]:
data.isna().sum()

destination                 0
passanger                   0
weather                     0
temperature                 0
time                        0
coupon                      0
expiration                  0
gender                      0
age                         0
maritalStatus               0
has_children                0
education                   0
occupation                  0
income                      0
car                     12576
Bar                       107
CoffeeHouse               217
CarryAway                 151
RestaurantLessThan20      130
Restaurant20To50          189
toCoupon_GEQ5min            0
toCoupon_GEQ15min           0
toCoupon_GEQ25min           0
direction_same              0
direction_opp               0
Y                           0
dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12684 entries, 0 to 12683
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   destination           12684 non-null  object
 1   passanger             12684 non-null  object
 2   weather               12684 non-null  object
 3   temperature           12684 non-null  int64 
 4   time                  12684 non-null  object
 5   coupon                12684 non-null  object
 6   expiration            12684 non-null  object
 7   gender                12684 non-null  object
 8   age                   12684 non-null  object
 9   maritalStatus         12684 non-null  object
 10  has_children          12684 non-null  int64 
 11  education             12684 non-null  object
 12  occupation            12684 non-null  object
 13  income                12684 non-null  object
 14  car                   108 non-null    object
 15  Bar                   12577 non-null

In [42]:
def preprocess_inputs(df):
  df = df.copy()
  df = df.drop("car", axis=1)
  df['Bar'] = df.fillna(df['Bar'].mode())
  df['CoffeeHouse'] = df.fillna(df['CoffeeHouse'].mode())
  df['CarryAway'] = df.fillna(df['CarryAway'].mode())
  df['RestaurantLessThan20'] = df.fillna(df['RestaurantLessThan20'].mode())
  return df

In [43]:
X = preprocess_inputs(data)

In [44]:
X

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,has_children,education,occupation,income,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,No Urgent Place,No Urgent Place,No Urgent Place,No Urgent Place,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,No Urgent Place,No Urgent Place,No Urgent Place,No Urgent Place,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,No Urgent Place,No Urgent Place,No Urgent Place,No Urgent Place,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,No Urgent Place,No Urgent Place,No Urgent Place,No Urgent Place,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,No Urgent Place,No Urgent Place,No Urgent Place,No Urgent Place,1~3,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12679,Home,Partner,Rainy,55,6PM,Carry out & Take away,1d,Male,26,Single,0,Bachelors degree,Sales & Related,$75000 - $87499,Home,Home,Home,Home,1~3,1,0,0,1,0,1
12680,Work,Alone,Rainy,55,7AM,Carry out & Take away,1d,Male,26,Single,0,Bachelors degree,Sales & Related,$75000 - $87499,Work,Work,Work,Work,1~3,1,0,0,0,1,1
12681,Work,Alone,Snowy,30,7AM,Coffee House,1d,Male,26,Single,0,Bachelors degree,Sales & Related,$75000 - $87499,Work,Work,Work,Work,1~3,1,0,0,1,0,0
12682,Work,Alone,Snowy,30,7AM,Bar,1d,Male,26,Single,0,Bachelors degree,Sales & Related,$75000 - $87499,Work,Work,Work,Work,1~3,1,1,1,0,1,0


In [45]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12684 entries, 0 to 12683
Data columns (total 25 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   destination           12684 non-null  object
 1   passanger             12684 non-null  object
 2   weather               12684 non-null  object
 3   temperature           12684 non-null  int64 
 4   time                  12684 non-null  object
 5   coupon                12684 non-null  object
 6   expiration            12684 non-null  object
 7   gender                12684 non-null  object
 8   age                   12684 non-null  object
 9   maritalStatus         12684 non-null  object
 10  has_children          12684 non-null  int64 
 11  education             12684 non-null  object
 12  occupation            12684 non-null  object
 13  income                12684 non-null  object
 14  Bar                   12684 non-null  object
 15  CoffeeHouse           12684 non-null

In [46]:
X.isna().sum()

destination               0
passanger                 0
weather                   0
temperature               0
time                      0
coupon                    0
expiration                0
gender                    0
age                       0
maritalStatus             0
has_children              0
education                 0
occupation                0
income                    0
Bar                       0
CoffeeHouse               0
CarryAway                 0
RestaurantLessThan20      0
Restaurant20To50        189
toCoupon_GEQ5min          0
toCoupon_GEQ15min         0
toCoupon_GEQ25min         0
direction_same            0
direction_opp             0
Y                         0
dtype: int64

In [47]:
X['Restaurant20To50'] = X.fillna(X['Restaurant20To50'].mode())

In [48]:
X.isna().sum()

destination             0
passanger               0
weather                 0
temperature             0
time                    0
coupon                  0
expiration              0
gender                  0
age                     0
maritalStatus           0
has_children            0
education               0
occupation              0
income                  0
Bar                     0
CoffeeHouse             0
CarryAway               0
RestaurantLessThan20    0
Restaurant20To50        0
toCoupon_GEQ5min        0
toCoupon_GEQ15min       0
toCoupon_GEQ25min       0
direction_same          0
direction_opp           0
Y                       0
dtype: int64

In [38]:
!pip install pycaret

Collecting pycaret
[?25l  Downloading https://files.pythonhosted.org/packages/30/4b/c2b856b18c0553238908f34d53e6c211f3cc4bfa13a8e8d522567a00b3d7/pycaret-2.3.0-py3-none-any.whl (261kB)
[K     |████████████████████████████████| 266kB 5.0MB/s 
[?25hCollecting pyLDAvis
[?25l  Downloading https://files.pythonhosted.org/packages/03/a5/15a0da6b0150b8b68610cc78af80364a80a9a4c8b6dd5ee549b8989d4b60/pyLDAvis-3.3.1.tar.gz (1.7MB)
[K     |████████████████████████████████| 1.7MB 35.8MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting lightgbm>=2.3.1
[?25l  Downloading https://files.pythonhosted.org/packages/18/b2/fff8370f48549ce223f929fe8cab4ee6bf285a41f86037d91312b48ed95b/lightgbm-3.2.1-py3-none-manylinux1_x86_64.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 38.8MB/s 
Collecting mlflow
[?25l 

In [39]:
import pycaret.classification as pyc

In [49]:
pyc.setup(
    data = X,
    target = 'Y',
    train_size=0.8,
    normalize = True
)

Unnamed: 0,Description,Value
0,session_id,8812
1,Target,Y
2,Target Type,Binary
3,Label Encoded,"0: 0, 1: 1"
4,Original Data,"(12684, 25)"
5,Missing Values,False
6,Numeric Features,0
7,Categorical Features,24
8,Ordinal Features,False
9,High Cardinality Features,False


('lightgbm',
 5,
 True,
 {'acc': <pycaret.containers.metrics.classification.AccuracyMetricContainer at 0x7fc0fe3ae810>,
  'auc': <pycaret.containers.metrics.classification.ROCAUCMetricContainer at 0x7fc0fe3ae7d0>,
  'f1': <pycaret.containers.metrics.classification.F1MetricContainer at 0x7fc0fe3ae550>,
  'kappa': <pycaret.containers.metrics.classification.KappaMetricContainer at 0x7fc0fe3ae390>,
  'mcc': <pycaret.containers.metrics.classification.MCCMetricContainer at 0x7fc0fe3ae310>,
  'precision': <pycaret.containers.metrics.classification.PrecisionMetricContainer at 0x7fc0fe3ae650>,
  'recall': <pycaret.containers.metrics.classification.RecallMetricContainer at 0x7fc0fe3ae750>},
        destination_Work  passanger_Alone  ...  toCoupon_GEQ25min_0  direction_same_1
 8783                0.0              0.0  ...                  1.0               0.0
 10529               1.0              1.0  ...                  1.0               1.0
 833                 0.0              0.0  ...      

In [50]:
pyc.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.712,0.7839,0.7917,0.7268,0.7577,0.4044,0.4068,0.237
rf,Random Forest Classifier,0.7071,0.7637,0.7688,0.7305,0.7491,0.3979,0.3988,1.552
et,Extra Trees Classifier,0.6985,0.7539,0.751,0.7278,0.7391,0.3823,0.3826,1.751
gbc,Gradient Boosting Classifier,0.6914,0.7511,0.7844,0.706,0.743,0.3596,0.363,1.49
knn,K Neighbors Classifier,0.6715,0.7134,0.7411,0.6993,0.7195,0.324,0.325,3.124
ridge,Ridge Classifier,0.6708,0.0,0.7548,0.6937,0.7228,0.3194,0.3213,0.045
lda,Linear Discriminant Analysis,0.6703,0.7191,0.7531,0.6938,0.7221,0.3187,0.3205,0.187
lr,Logistic Regression,0.6697,0.719,0.7552,0.6922,0.7222,0.3166,0.3187,0.529
ada,Ada Boost Classifier,0.6649,0.7169,0.7503,0.6886,0.718,0.307,0.3089,0.508
svm,SVM - Linear Kernel,0.6562,0.0,0.7562,0.6862,0.7105,0.2843,0.302,0.22


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=8812, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [55]:
best_model = pyc.create_model('lightgbm')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7143,0.7846,0.7768,0.7361,0.7559,0.4121,0.4129
1,0.7084,0.7795,0.7972,0.7199,0.7566,0.3953,0.3985
2,0.7025,0.764,0.7903,0.7159,0.7512,0.3834,0.3863
3,0.7054,0.7823,0.7816,0.7228,0.751,0.3917,0.3935
4,0.7074,0.7767,0.7834,0.7244,0.7527,0.3958,0.3976
5,0.734,0.7994,0.7938,0.7521,0.7723,0.453,0.454
6,0.7133,0.794,0.7886,0.7292,0.7577,0.408,0.4099
7,0.6992,0.7793,0.8163,0.703,0.7554,0.3705,0.3775
8,0.7347,0.7914,0.8111,0.7452,0.7768,0.4513,0.4538
9,0.7012,0.7878,0.7782,0.7196,0.7477,0.3827,0.3845


In [56]:
print(best_model)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=8812, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)


In [57]:
pyc.evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [58]:
pyc.save_model(best_model, "lightgbm")

Transformation Pipeline and Model Succesfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='Y',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_strategy='...
                  LGBMClassifier(boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, importance_type='split',
                                 learning_rate=0.1, max_depth=-1,
                                 min_child_samples=20,