In [None]:
!gcloud services enable ml.googleapis.com
!gcloud services enable compute.googleapis.com

In [18]:
import os
import joblib
import subprocess
import numpy as np
import pandas as pd
import pickle


from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline



BUCKET_NAME = 'tp-2'
STORAGE_BUCKET = 'tp-2'
DATA_PATH = 'bookings.csv'
LOCAL_PATH = '/tmp'
PROJECT_ID = 'mimetic-setup-296322'


df = pd.read_csv('../TP2/bookings.csv')


## Transforming dataframe

def family(data):
    if ((data['adults'] > 0) & (data['children'] > 0)) or ((data['adults'] > 0) & (data['babies'] > 0)) :
        val = 1
    else:
        val = 0
    return val

def deposit(data):
    if ((data['deposit_type'] == 'No Deposit') | (data['deposit_type'] == 'Refundable')):
        return 0
    else:
        return 1
def feature(data):
    data["is_family"] = data.apply(family, axis = 1)
    data["total_customer"] = data["adults"] + data["children"] + data["babies"]
    data["deposit_given"] = data.apply(deposit, axis=1)
    data["total_nights"] = data["stays_in_weekend_nights"]+ data["stays_in_week_nights"]
    return data

df = feature(df)
df = df.drop(columns = ['company','adults', 'babies', 'children', 'deposit_type', 'reservation_status_date','is_canceled'])

# Features 
categorical_features = list(df.dtypes.where(df.dtypes== object).dropna().index)
numeric_features =  list(df.dtypes.where(df.dtypes != object).dropna().index)
numeric_features.remove('required_car_parking_spaces')
features = numeric_features + categorical_features
X = df[features]
y= df['required_car_parking_spaces'].apply(lambda x : 1 if x>0 else 0 )


# Create a scikit-learn pipeline with preprocessing steps + model
numeric_transformer = Pipeline(steps =[('scaler',StandardScaler()),('imputer',SimpleImputer(strategy="constant", fill_value=0))])
categorical_transformer = Pipeline(steps=[
                                    ("imputer", SimpleImputer(strategy="constant", fill_value="Not defined")),
                                    ("onehot", OneHotEncoder(handle_unknown='ignore'))]) # to deal with missing categorical data 
preproc = ColumnTransformer(transformers=[("num", numeric_transformer, numeric_features),
                                          ("cat", categorical_transformer, categorical_features)])

# Train the model
models = [("logreg_with_scale", LogisticRegression(max_iter=1000)),("svc",SVC()),("random_forest", RandomForestClassifier())]
models = [("random_forest", RandomForestClassifier())]


grids = {"logreg_with_scale" : {'logreg_with_scale__C': np.logspace(-2, 2, 5, base=2)}, 
         "svc" :{'svc__gamma':np.logspace(-4, 1, 10)},
         "random_forest" : {'random_forest__n_estimators': [500,600,800]}}

# cast int to float to avoid Standard scaler error :

X[numeric_features] = X[numeric_features].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [20]:
X.columns

Index(['lead_time', 'arrival_date_year', 'arrival_date_week_number',
       'arrival_date_day_of_month', 'stays_in_weekend_nights',
       'stays_in_week_nights', 'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'booking_changes', 'agent',
       'days_in_waiting_list', 'adr', 'total_of_special_requests', 'is_family',
       'total_customer', 'deposit_given', 'total_nights', 'hotel',
       'arrival_date_month', 'meal', 'country', 'market_segment',
       'distribution_channel', 'reserved_room_type', 'assigned_room_type',
       'customer_type', 'reservation_status'],
      dtype='object')

In [None]:
# Calculate a bunch of performance metrics
results = pd.DataFrame(
    {'accuracy': [accuracy_score(y_train, pred_train),
                  accuracy_score(y_test, pred_test)],
     'precision': [precision_score(y_train, pred_train, pos_label='yes'),
                   precision_score(y_test, pred_test, pos_label='yes')],
     'recall': [recall_score(y_train, pred_train, pos_label='yes'),
                recall_score(y_test, pred_test, pos_label='yes')],
     'f1': [f1_score(y_train, pred_train, pos_label='yes'),
            f1_score(y_test, pred_test, pos_label='yes')]},
    index=['train', 'test']
)

results.to_csv(os.path.join(LOCAL_PATH, 'results.csv'))

# Upload model and results Dataframe to Storage
subprocess.call([
    'gsutil', 'cp',
    # Local path of the model
    os.path.join(LOCAL_PATH, 'model.joblib'),
    os.path.join(args.storage_path, 'model.joblib')
])
subprocess.call([
    'gsutil', 'cp',
    # Local path of results
    os.path.join(LOCAL_PATH, 'results.csv'),
    os.path.join(args.storage_path, 'results.csv')
])