# Setup

Download your Kaggle API key file to `kaggle.json` to allow download of the data (log in to Kaggle, go to profile -> account).

See this guide for installing tensorflow on Apple silicon (e.g. my M1 laptop!): https://developer.apple.com/metal/tensorflow-plugin/

For me though, I had to install specific versions:
```bash
pip install tensorflow-macos==2.9
pip install tensorflow-metal==0.5.0
```

In [1]:
import opendatasets as od
import pandas as pd
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score

# Download the data

In [2]:
od.download("https://www.kaggle.com/datasets/ahsan81/hotel-reservations-classification-dataset", data_dir="data")

hotel_data = pd.read_csv("./data/hotel-reservations-classification-dataset/Hotel Reservations.csv")

hotel_data.head()

Skipping, found downloaded files in "data/hotel-reservations-classification-dataset" (use force=True to force download)


Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


In [3]:
hotel_data.describe()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests
count,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0
mean,1.844962,0.105279,0.810724,2.2043,0.030986,85.232557,2017.820427,7.423653,15.596995,0.025637,0.023349,0.153411,103.423539,0.619655
std,0.518715,0.402648,0.870644,1.410905,0.173281,85.930817,0.383836,3.069894,8.740447,0.158053,0.368331,1.754171,35.089424,0.786236
min,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,1.0,0.0,17.0,2018.0,5.0,8.0,0.0,0.0,0.0,80.3,0.0
50%,2.0,0.0,1.0,2.0,0.0,57.0,2018.0,8.0,16.0,0.0,0.0,0.0,99.45,0.0
75%,2.0,0.0,2.0,3.0,0.0,126.0,2018.0,10.0,23.0,0.0,0.0,0.0,120.0,1.0
max,4.0,10.0,7.0,17.0,1.0,443.0,2018.0,12.0,31.0,1.0,13.0,58.0,540.0,5.0


# Clean and structure the training and test data

In [4]:
# Drop the answers to create the input/feature data
x_df = hotel_data.drop(['Booking_ID', 'booking_status'], axis=1)

# Months and years are more like categorical data than numeric.  Convert to strings, then call 'get_dummies' to explode/pivot
x_df['arrival_month'] = [f"month_{x}" for x in x_df['arrival_month']]
x_df['arrival_year'] = [f"year_{x}" for x in x_df['arrival_year']]


# Converts catagorical columns into multiple boolean columns by value - which sames a lot of hassle!
x_df = pd.get_dummies(x_df)

# Convert everything else into a float
x_df = x_df.astype('float64')

# Create the output/target data
y_df = hotel_data['booking_status'].replace({"Not_Canceled": 0.0, "Canceled": 1.0})

# split into test and training sets
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2)

x_train


Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_date,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,...,arrival_month_month_5,arrival_month_month_6,arrival_month_month_7,arrival_month_month_8,arrival_month_month_9,market_segment_type_Aviation,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online
13521,2.0,0.0,2.0,5.0,0.0,138.0,14.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2579,2.0,0.0,0.0,3.0,0.0,34.0,25.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
14569,2.0,0.0,0.0,4.0,0.0,28.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25134,2.0,0.0,1.0,1.0,0.0,7.0,21.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23041,3.0,0.0,0.0,2.0,0.0,100.0,16.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2629,2.0,0.0,1.0,2.0,0.0,6.0,11.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
32214,1.0,0.0,1.0,2.0,0.0,219.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
11620,1.0,0.0,1.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
30931,2.0,0.0,0.0,4.0,0.0,232.0,20.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


# Create the NN model

In [5]:
model = Sequential()
model.add(Dense(units=12, activation='sigmoid', input_dim=len(x_train.columns)))
#model.add(Dense(units=12, activation='sigmoid'))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(loss='mse', optimizer='sgd', metrics='accuracy')

Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



2023-01-11 00:14:37.899243: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-01-11 00:14:37.899354: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


# Train the NN

In [6]:
model.fit(x_train, y_train, epochs=1000, batch_size=32, validation_data=(x_test, y_test))

Epoch 1/1000


2023-01-11 00:14:38.011634: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2023-01-11 00:14:38.160607: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




2023-01-11 00:14:42.208595: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000

In [None]:
y_hat = model.predict(x_test)
y_hat = [0.0 if x < 0.5 else 1.0 for x in y_hat]

accuracy_score(y_test, y_hat)



2023-01-11 00:11:24.814303: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




0.7834596829772571

In [None]:
pd.DataFrame({
    'predicted': y_hat,
    'actual': y_test
})

Unnamed: 0,predicted,actual
20733,0.0,0.0
9754,1.0,0.0
13141,0.0,0.0
4282,0.0,0.0
20709,0.0,0.0
...,...,...
501,0.0,0.0
164,0.0,0.0
768,0.0,0.0
25717,0.0,0.0
