In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import seaborn as sns
import plotly.express as px
from datetime import datetime

In [2]:
from statsmodels.tsa.stattools import adfuller
from pandas.plotting import autocorrelation_plot
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm
from pandas.tseries.offsets import DateOffset

In [3]:
df = pd.read_csv("hotel_bookings.csv")
df = df.copy()
df.head(10)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
5,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03
6,Resort Hotel,0,0,2015,July,27,1,0,2,2,...,No Deposit,,,0,Transient,107.0,0,0,Check-Out,2015-07-03
7,Resort Hotel,0,9,2015,July,27,1,0,2,2,...,No Deposit,303.0,,0,Transient,103.0,0,1,Check-Out,2015-07-03
8,Resort Hotel,1,85,2015,July,27,1,0,3,2,...,No Deposit,240.0,,0,Transient,82.0,0,1,Canceled,2015-05-06
9,Resort Hotel,1,75,2015,July,27,1,0,3,2,...,No Deposit,15.0,,0,Transient,105.5,0,0,Canceled,2015-04-22


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [5]:
df["agent"] = df["agent"].fillna("Not Applicable")
df["company"] = df["company"].fillna("Not Applicable")
df["children"] = df["children"].fillna(0)
df["country"] = df["country"].fillna("Unknown")

In [6]:
df = df.drop(df[(df.adults+df.babies+df.children)==0].index)

In [7]:
df['arrival_date_month'] = df['arrival_date_month'].map({'January':1, 'February': 2, 'March':3, 'April':4, 
                                                             'May':5, 'June':6, 'July':7,'August':8, 
                                                             'September':9, 'October':10, 'November':11,'December':12})

In [8]:
df['arrival_date'] = df['arrival_date_year'].map(str) + '-' + df['arrival_date_month'].map(str) + '-' \
                       + df['arrival_date_day_of_month'].map(str)

In [9]:
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,arrival_date
0,Resort Hotel,0,342,2015,7,27,1,0,0,2,...,Not Applicable,Not Applicable,0,Transient,0.0,0,0,Check-Out,2015-07-01,2015-7-1
1,Resort Hotel,0,737,2015,7,27,1,0,0,2,...,Not Applicable,Not Applicable,0,Transient,0.0,0,0,Check-Out,2015-07-01,2015-7-1
2,Resort Hotel,0,7,2015,7,27,1,0,1,1,...,Not Applicable,Not Applicable,0,Transient,75.0,0,0,Check-Out,2015-07-02,2015-7-1
3,Resort Hotel,0,13,2015,7,27,1,0,1,1,...,304.0,Not Applicable,0,Transient,75.0,0,0,Check-Out,2015-07-02,2015-7-1
4,Resort Hotel,0,14,2015,7,27,1,0,2,2,...,240.0,Not Applicable,0,Transient,98.0,0,1,Check-Out,2015-07-03,2015-7-1


In [10]:
df.shape

(119210, 33)

In [11]:
df['arrival_date'] = pd.to_datetime(df['arrival_date'])
df['reservation_status_date'] = pd.to_datetime(df['reservation_status_date'])

In [12]:
df['Total Guests'] = df['adults'] + df['children'] + df["babies"]
df = df[df['Total Guests'] != 0]

In [13]:
#resort_otel = df[(df['hotel'] == 'Resort Hotel') & (df['is_canceled'] == 0)]
#city_otel = df[(df['hotel'] == 'City Hotel') & (df['is_canceled'] == 0)] 

resort_otel = df[df['hotel'] == 'Resort Hotel']
city_otel = df[df['hotel'] == 'City Hotel'] 

df = df.reset_index(drop=True)

In [14]:
df.shape

(119210, 34)

### Resort Otel

In [15]:
arrival_date_resort = resort_otel.iloc[: ,32:33]
total_guests_resort = resort_otel.iloc[: , 33:]

In [16]:
arrival_date_resort.reset_index(drop=True, inplace=True)
total_guests_resort.reset_index(drop=True, inplace=True)

resort_otel_data = pd.concat( [arrival_date_resort, total_guests_resort], axis=1) 

In [17]:
resort_otel_data.head()

Unnamed: 0,arrival_date,Total Guests
0,2015-07-01,2.0
1,2015-07-01,2.0
2,2015-07-01,1.0
3,2015-07-01,1.0
4,2015-07-01,2.0


In [18]:
#resort_otel_data = resort_otel_data.resample('m', on='arrival_date').sum()
resort_otel_data = resort_otel_data.resample('w', on='arrival_date').sum()
#resort_otel_data = resort_otel_data.resample('D', on='arrival_date').sum()

resort_otel_data

Unnamed: 0_level_0,Total Guests
arrival_date,Unnamed: 1_level_1
2015-07-05,471.0
2015-07-12,614.0
2015-07-19,732.0
2015-07-26,705.0
2015-08-02,781.0
...,...
2017-08-06,1061.0
2017-08-13,882.0
2017-08-20,1058.0
2017-08-27,946.0


### City Otel

In [19]:
arrival_date_city = city_otel.iloc[: ,32:33]
total_guests_city = city_otel.iloc[: , 33:]

In [20]:
arrival_date_city.reset_index(drop=True, inplace=True)
total_guests_city.reset_index(drop=True, inplace=True)

city_otel_data = pd.concat( [arrival_date_city, total_guests_city], axis=1) 

In [21]:
city_otel_data.head()

Unnamed: 0,arrival_date,Total Guests
0,2015-07-01,1.0
1,2015-07-01,2.0
2,2015-07-01,1.0
3,2015-07-01,2.0
4,2015-07-02,2.0


In [22]:
#city_otel_data = city_otel_data.resample('m', on='arrival_date').sum()
city_otel_data = city_otel_data.resample('w', on='arrival_date').sum()
#city_otel_data = city_otel_data.resample('D', on='arrival_date').sum()


city_otel_data

Unnamed: 0_level_0,Total Guests
arrival_date,Unnamed: 1_level_1
2015-07-05,354.0
2015-07-12,441.0
2015-07-19,646.0
2015-07-26,876.0
2015-08-02,446.0
...,...
2017-08-06,1584.0
2017-08-13,1623.0
2017-08-20,1802.0
2017-08-27,1475.0


In [23]:
from keras.models import Sequential
from keras.layers import Dense , Activation , Dropout
from keras.layers import LSTM
from keras.layers import LeakyReLU
import tensorflow as tf

In [24]:
from keras.preprocessing.sequence import TimeseriesGenerator
from sklearn.model_selection import train_test_split

In [25]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [26]:
print(city_otel_data.shape)
train = city_otel_data.iloc[:85]
test  = city_otel_data.iloc[85:] 
print(train.shape , test.shape)

(114, 1)
(85, 1) (29, 1)


In [27]:
data_train = np.array(train)

In [28]:
X_train , y_train = [] , []

for i in range(7 , len(data_train)-7):
    X_train.append(data_train[i-7:i])
    y_train.append(data_train[i:i+7])

In [29]:
X_train , y_train = np.array(X_train) , np.array(y_train)

In [30]:
X_train.shape , y_train.shape

((71, 7, 1), (71, 7, 1))

In [31]:
nsamples, nx, ny = X_train.shape
X_train = X_train.reshape((nsamples,nx*ny))

In [32]:
nsamples, nx, ny = y_train.shape
y_train = y_train.reshape((nsamples,nx*ny))

In [33]:
X_train.shape , y_train.shape

((71, 7), (71, 7))

In [34]:
x_scaler = MinMaxScaler()
X_train = x_scaler.fit_transform(X_train)

In [35]:
y_scaler = MinMaxScaler()
y_train = y_scaler.fit_transform(y_train)

In [36]:
pd.DataFrame(X_train).head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.04928,0.097453,0.210963,0.338317,0.100221,0.494463,0.749169
1,0.097453,0.210963,0.338317,0.100221,0.494463,0.749169,0.359358
2,0.210963,0.338317,0.100221,0.494463,0.749169,0.359358,0.390365
3,0.338317,0.100221,0.494463,0.749169,0.359358,0.390365,0.548173
4,0.100221,0.494463,0.749169,0.359358,0.390365,0.548173,0.612403


In [37]:
pd.DataFrame(y_train).head()

Unnamed: 0,0,1,2,3,4,5,6
0,0.359358,0.390365,0.548173,0.612403,0.847176,0.732558,0.575858
1,0.390365,0.548173,0.612403,0.847176,0.732558,0.575858,0.741417
2,0.548173,0.612403,0.847176,0.732558,0.575858,0.741417,0.759136
3,0.612403,0.847176,0.732558,0.575858,0.741417,0.759136,0.459025
4,0.847176,0.732558,0.575858,0.741417,0.759136,0.459025,0.387597


In [38]:
X_train.shape

(71, 7)

In [39]:
X_train = X_train.reshape(71,7,1)
X_train.shape

(71, 7, 1)

### Build LSTM

In [40]:
model = Sequential()
model.add(LSTM(units=200 , activation='relu' , input_shape=(7,1)))
model.add(Dense(7))
model.compile(loss='mse' , optimizer='adam')

In [41]:
model.fit(X_train , y_train , epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x2219d6accd0>

### Prepare Test

In [42]:
data_test = np.array(test)

In [43]:
X_test , y_test = [] , []

for i in range(7 , len(data_test)-7):
    X_test.append(data_test[i-7:i])
    y_test.append(data_test[i:i+7])

In [44]:
X_test , y_test = np.array(X_test) , np.array(y_test)

In [45]:
nsamples, nx, ny = X_test.shape
X_test = X_test.reshape((nsamples,nx*ny))

In [46]:
nsamples, nx, ny = y_test.shape
y_test = y_test.reshape((nsamples,nx*ny))

In [47]:
X_test = x_scaler.transform(X_test)
y_test = y_scaler.transform(y_test)

In [48]:
X_test.shape

(15, 7)

In [49]:
X_test = X_test.reshape(15,7,1)

In [50]:
y_pred = model.predict(X_test)

In [51]:
y_pred = y_scaler.inverse_transform(y_pred)
y_pred

array([[1404.8923, 1403.897 , 1429.9331, 1416.4365, 1359.4917, 1334.6504,
        1303.119 ],
       [1511.9249, 1489.0331, 1515.8518, 1505.4606, 1411.8763, 1393.234 ,
        1336.4097],
       [1650.2562, 1611.792 , 1625.6462, 1620.929 , 1505.9204, 1475.5676,
        1413.8112],
       [1756.1013, 1711.724 , 1710.6151, 1703.9535, 1587.9812, 1541.1554,
        1485.5498],
       [1818.6064, 1762.314 , 1758.4314, 1741.0999, 1614.4403, 1566.4889,
        1497.6926],
       [1867.933 , 1806.0372, 1804.1521, 1779.7946, 1647.4832, 1596.1217,
        1527.971 ],
       [1827.0803, 1773.1417, 1773.6523, 1729.9297, 1625.9539, 1562.5393,
        1513.0732],
       [1779.1053, 1722.7162, 1737.7792, 1678.3195, 1575.8153, 1525.1229,
        1463.7643],
       [1762.0295, 1705.8203, 1725.4603, 1667.3872, 1563.6122, 1516.4028,
        1454.1111],
       [1803.8708, 1743.2266, 1758.941 , 1700.1875, 1595.249 , 1538.2948,
        1481.3502],
       [1775.4597, 1718.2227, 1737.2799, 1673.2931, 1573.278

In [52]:
y_true = y_scaler.inverse_transform(y_test)

In [53]:
print(y_true)

[[1992. 1908. 1773. 2049. 2063. 1684. 2045.]
 [1908. 1773. 2049. 2063. 1684. 2045. 1985.]
 [1773. 2049. 2063. 1684. 2045. 1985. 1898.]
 [2049. 2063. 1684. 2045. 1985. 1898. 1941.]
 [2063. 1684. 2045. 1985. 1898. 1941. 1658.]
 [1684. 2045. 1985. 1898. 1941. 1658. 1678.]
 [2045. 1985. 1898. 1941. 1658. 1678. 1861.]
 [1985. 1898. 1941. 1658. 1678. 1861. 1851.]
 [1898. 1941. 1658. 1678. 1861. 1851. 1841.]
 [1941. 1658. 1678. 1861. 1851. 1841. 1539.]
 [1658. 1678. 1861. 1851. 1841. 1539. 1742.]
 [1678. 1861. 1851. 1841. 1539. 1742. 1584.]
 [1861. 1851. 1841. 1539. 1742. 1584. 1623.]
 [1851. 1841. 1539. 1742. 1584. 1623. 1802.]
 [1841. 1539. 1742. 1584. 1623. 1802. 1475.]]


In [54]:
X_train.shape

(71, 7, 1)

### Evaluate the model

In [57]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [58]:
def evaluate_model(y_true , y_predicted):
    score=[]
    
    for i in range(y_true.shape[1]):
        mse = mean_squared_error(y_true[:,i] , y_predicted[:,i])
        rmse = np.sqrt(mse)
        score.append(rmse)
        
    total_score=0
    for row in range(y_true.shape[0]):
        for col in range(y_predicted.shape[1]):
            total_score = total_score + (y_true[row,col] - y_predicted[row,col])**2
    total_score = np.sqrt(total_score/y_true.shape[0] * y_predicted.shape[1])
    
    return total_score , score

In [59]:
evaluate_model(y_true , y_pred)

(2114.490321420428,
 [261.53555641910026,
  265.77422557709747,
  243.86326413429734,
  273.4378681845974,
  319.8901732675819,
  335.93816656302914,
  387.6411590271771])

In [60]:
np.std(y_true[0])

138.64946256436684