# Import packages
---
---

In [None]:
import pandas as pd
import numpy as np
import sklearn
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import Pipeline
%config InlineBackend.figure_format = 'svg'
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score
import warnings
warnings.filterwarnings('ignore')
import random

#from yellowbrick.regressor import prediction_error

# Get the Data
---

In [None]:
# Load the raw bookings from a path relative to the notebook so the analysis
# can be re-run on any machine (no user-specific absolute path).
DATA_PATH = 'hotel_booking.csv'
df = pd.read_csv(DATA_PATH)
# Preview only the first rows: the raw file still carries the personal columns
# (name, email, phone-number, credit_card) that are dropped during cleaning.
df.head()

# Take a Look at the Data Structure
---

In [None]:
# Column labels of the raw frame
df.columns

In [None]:
# (rows, columns) of the raw frame
df.shape

In [None]:
# Per-column dtype and non-null count of the raw frame
df.info()

In [None]:
# Names of the columns stored as int64 or float64
numerical = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical

In [None]:
# Names of every column that is neither int64 nor float64
categorical = df.select_dtypes(exclude=['int64', 'float64']).columns.tolist()
categorical

In [None]:
# Summary statistics for the int64/float64 columns
df[numerical].describe()

In [None]:
# Summary statistics for the remaining (non int64/float64) columns
df[categorical].describe()

In [None]:
# Check class balance of the prediction target in the raw data
# (pass normalize=True to value_counts to see proportions instead of counts)
df['is_canceled'].value_counts()#(normalize=True)

In [None]:
# Print the value counts of every categorical column, one titled section per column
separator = '-' * 50
for col in categorical:
    print(f'Value count column {col}:', separator, df[col].value_counts(), '', sep='\n')

In [None]:
# Number of fully duplicated rows in the raw frame
df.duplicated().sum()

In [None]:
#df.hist(bins=60, figsize=(20,15));

# Data Pre-Processing
---

In [None]:
# Missing-value count per column in the raw frame
df.isnull().sum()

In [None]:
# Work on a copy so the raw frame `df` stays untouched by the cleaning steps
df_clean = df.copy()

In [None]:
# Remove the personal-identifier columns; they are not used as features
irrelevant_columns = ['name', 'email', 'phone-number', 'credit_card']
df_clean.drop(columns=irrelevant_columns, inplace=True)

In [None]:
# Discard the rows where `children` is missing
rows_without_children = df_clean.index[df_clean['children'].isna()]
df_clean.drop(index=rows_without_children, inplace=True)

In [None]:
# Turn company into a binary flag: 0 when the value is missing, 1 otherwise
has_company = df_clean['company'].notna()
df_clean['company'] = has_company.astype('int64')
df_clean['company'].value_counts()

In [None]:
# Turn agent into a binary flag: 0 when the value is missing, 1 otherwise
has_agent = df_clean['agent'].notna()
df_clean['agent'] = has_agent.astype('int64')
df_clean['agent'].value_counts()

In [None]:
# Fill missing country values with the most frequent country.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: under pandas copy-on-write the
# in-place form only updates a temporary object and leaves df_clean unchanged
# (and the accompanying warning is hidden by the global warnings filter).
most_common_country = df_clean['country'].mode()[0]
df_clean['country'] = df_clean['country'].fillna(most_common_country)
df_clean['country'].value_counts()

In [None]:
# Remaining missing values per column after the imputation steps
df_clean.isna().sum()

# Handle incorrect values
---

ADR

In [None]:
# Distribution of adr before removing invalid values
df_clean['adr'].describe()

In [None]:
# Inspect the rows with a negative adr
df_clean[df_clean.adr < 0]

In [None]:
# Remove the rows whose adr is negative
negative_adr_rows = df_clean.index[df_clean['adr'] < 0]
df_clean.drop(index=negative_adr_rows, inplace=True)

In [None]:
# Re-check adr after the negative rows were dropped
df_clean['adr'].describe()

Adults


In [None]:
# Distribution of adults before removing invalid values
df_clean['adults'].describe()

In [None]:
# Remove the bookings that list fewer than one adult
rows_without_adults = df_clean.index[df_clean['adults'] < 1]
df_clean.drop(index=rows_without_adults, inplace=True)

In [None]:
# Re-check adults after the rows with fewer than one adult were dropped
df_clean['adults'].describe()

# Change Type

In [None]:
# Changing children to integer type (rows with missing children were dropped earlier):
df_clean['children'] = df_clean['children'].astype(int)

# Changing reservation status date to datetime type.
# Convert df_clean's own column rather than the raw df's: the raw frame still
# contains rows removed during cleaning, so the previous assignment only
# produced the right values through index alignment.
df_clean['reservation_status_date'] = pd.to_datetime(df_clean['reservation_status_date'])

# Changing agent to integer type:
df_clean['agent'] = df_clean['agent'].astype(int)

# Changing company to integer type:
df_clean['company'] = df_clean['company'].astype(int)

In [None]:
# Confirm the dtypes after the conversions
df_clean.info()

# Feature Engineering
---

Total Stays


In [None]:
# Total nights = weekend nights + week nights
weekend_nights = df_clean['stays_in_weekend_nights']
week_nights = df_clean['stays_in_week_nights']
df_clean['total_stays'] = weekend_nights.add(week_nights)

Total Guests

In [None]:
# Total guests = adults + children + babies
df_clean['total_guests'] = (
    df_clean['adults']
    .add(df_clean['children'])
    .add(df_clean['babies'])
)

Kids

In [None]:
# Kids = children + babies
df_clean['kids'] = df_clean['children'].add(df_clean['babies'])

Guest Location

In [None]:
# Label a booking 'Local' when the country code is 'PRT', otherwise 'International'
is_local = df_clean['country'].eq('PRT')
df_clean['guest_location'] = is_local.map({True: 'Local', False: 'International'})
df_clean['guest_location'].value_counts()

In [None]:
# Drop the columns that were folded into total_stays, total_guests and kids
merged_columns = ['stays_in_weekend_nights', 'stays_in_week_nights', 'children', 'babies']
df_clean.drop(columns=merged_columns, inplace=True)

In [None]:
# Count fully duplicated rows in the cleaned frame
df_clean.duplicated().sum()

In [None]:
# Remove repeated rows, keeping the first occurrence of each
df_clean.drop_duplicates(keep='first', inplace=True)

In [None]:
# Verify that no duplicated rows remain
df_clean.duplicated().sum()

In [None]:
# Class balance of the target after cleaning
df_clean['is_canceled'].value_counts()

In [None]:
# Final column list and dtypes of the cleaned frame
df_clean.info()

# Split Data
---

In [None]:
# Distinct previous_cancellations values; these are the groups the split is drawn from
total_previous_cancellations = df_clean['previous_cancellations'].unique() 
total_previous_cancellations

In [None]:
# Train/test split on randomly selected groups of previous_cancellations
np.random.seed(80)  # fixed seed so the selected groups are reproducible
total_previous_cancellations = df_clean['previous_cancellations'].unique()
# Hold out 20% of the distinct values (rounded down).
n_test_groups = int(total_previous_cancellations.shape[0] * .20)
# replace=False: np.random.choice samples with replacement by default, so the
# same value could be drawn twice and the test set would cover fewer groups
# than intended.
test_previous_cancellations = np.random.choice(total_previous_cancellations, size=n_test_groups, replace=False)

In [None]:
# Rows whose previous_cancellations value was drawn for the test set go to
# df_test; all remaining rows go to df_train
in_test_group = df_clean['previous_cancellations'].isin(test_previous_cancellations)
df_train = df_clean[~in_test_group]
df_test = df_clean[in_test_group]

In [None]:
# Train/validation split on randomly selected groups of previous_cancellations
total_train_previous_cancellations = df_train['previous_cancellations'].unique()
# Hold out 25% of the distinct training values (rounded down).
n_val_groups = int(total_train_previous_cancellations.shape[0] * .25)
# replace=False: np.random.choice samples with replacement by default, so the
# same value could be drawn twice and the validation set would cover fewer
# groups than intended.
val_previous_cancellations = np.random.choice(total_train_previous_cancellations, size=n_val_groups, replace=False)

In [None]:
# Move the rows of the drawn validation groups out of df_train into df_val.
# df_val is taken first because the mask is aligned with the current df_train.
in_val_group = df_train['previous_cancellations'].isin(val_previous_cancellations)
df_val = df_train[in_val_group]
df_train = df_train[~in_val_group]

# ------------------------------------------

In [None]:
# previous_cancellations values present in the training split
df_train['previous_cancellations'].unique()

In [None]:
# previous_cancellations values present in the test split
df_test['previous_cancellations'].unique()

In [None]:
# previous_cancellations values present in the validation split
df_val['previous_cancellations'].unique()

# ------------------------------------------

In [None]:
# Target class balance in the training split
df_train['is_canceled'].value_counts()

In [None]:
# Target class balance in the validation split
df_val['is_canceled'].value_counts()

In [None]:
# Target class balance in the test split
df_test['is_canceled'].value_counts()

# Get Dummies 

In [None]:
# dtypes of the cleaned frame, used to decide which columns to one-hot encode
df_clean.dtypes

In [None]:
# Number of distinct values per column
df_clean.nunique()

In [None]:
# Frequency of each reserved room type
df_clean.reserved_room_type.value_counts()

In [None]:
# Frequency of each assigned room type
df_clean.assigned_room_type.value_counts()

In [None]:
# Non-object columns of the training split, as an independent copy
train_non_object = df_train.select_dtypes(exclude=['object'])
df1 = train_non_object.copy()

In [None]:
# Categorical columns of the training split that will be one-hot encoded
df_obj_train = df_train[[
    'hotel', 'arrival_date_month', 'market_segment', 'distribution_channel',
    'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status',
    'guest_location',
]]
           

In [None]:
# Categorical columns of the validation split that will be one-hot encoded
df_obj_val = df_val[[
    'hotel', 'arrival_date_month', 'market_segment', 'distribution_channel',
    'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status',
    'guest_location',
]]

In [None]:
# Categorical columns of the test split that will be one-hot encoded
df_obj_test = df_test[[
    'hotel', 'arrival_date_month', 'market_segment', 'distribution_channel',
    'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status',
    'guest_location',
]]

In [None]:
# One-hot encode the training categoricals (first level of each column dropped)
df_dum_tr = pd.get_dummies(df_obj_train, drop_first=True)

# Put the non-object training columns and the dummy columns side by side
df_train1 = pd.concat((df1, df_dum_tr), axis='columns')
df_train1.head(2)

In [None]:
# One-hot encode the validation categoricals (first level of each column dropped)
df_dum_val = pd.get_dummies(df_obj_val, drop_first=True)

# Combine the dummies with the validation split's OWN non-object columns.
# Concatenating the training frame df1 here aligns on the row index, which
# yields training rows with NaN dummies and validation rows with NaN numerics.
df_num_val = df_val.select_dtypes(exclude=['object'])
# NOTE(review): dummies are built per split, so the resulting columns can differ
# from df_train1 when a category is absent from this split; confirm before modelling.
df_val1 = pd.concat([df_num_val, df_dum_val], axis=1)
df_val1.head(2)

In [None]:
# One-hot encode the test categoricals (first level of each column dropped)
df_dum_test = pd.get_dummies(df_obj_test, drop_first=True)

# Combine the dummies with the test split's OWN non-object columns.
# Concatenating the training frame df1 here aligns on the row index, which
# yields training rows with NaN dummies and test rows with NaN numerics.
df_num_test = df_test.select_dtypes(exclude=['object'])
# NOTE(review): dummies are built per split, so the resulting columns can differ
# from df_train1 when a category is absent from this split; confirm before modelling.
df_test1 = pd.concat([df_num_test, df_dum_test], axis=1)
df_test1.head(2)

In [None]:
#sns.heatmap(df_train1.corr(), cmap="seismic", annot=True, vmin=-1, vmax=1);


In [None]:
# Pairwise correlations of the numeric and boolean training columns.
# df_train1 also holds the datetime column reservation_status_date, which makes a
# plain .corr() fail on recent pandas; selecting the dtypes explicitly gives the
# same table on old and new versions.
df_train1.select_dtypes(include=['number', 'bool']).corr()

# EDA

In [None]:
# plt.figure(figsize=(15, 25))
# for i in range(0, len(numerical)):
#     plt.subplot(10, int(len(numerical)/9), i+1)
#     sns.distplot(df_clean[numerical[i]], color='gray')
#     plt.tight_layout()

In [None]:
# (rows, columns) of the encoded training frame
df_train1.shape

In [None]:
# (rows, columns) of the encoded validation frame
df_val1.shape

In [None]:
# (rows, columns) of the encoded test frame
df_test1.shape

In [None]:
# One de-duplicated column list shared by all three splits.
# - The previous training list named several columns twice, which produced
#   duplicated column labels in train_sample.
# - The previous validation/test lists asked for country_* dummies, but `country`
#   is not among the one-hot encoded columns, so those labels never exist and
#   the selection raised a KeyError.
# - Each split also used a different subset of columns; one list keeps them aligned.
# NOTE(review): the reservation_status_* dummies presumably encode the booking
# outcome itself (possible target leakage) - confirm before using them as features.
sample_columns = [
    'is_canceled', 'lead_time', 'previous_cancellations', 'previous_bookings_not_canceled',
    'assigned_room_type_K', 'assigned_room_type_L',
    'deposit_type_Non Refund', 'deposit_type_Refundable',
    'customer_type_Group', 'customer_type_Transient', 'customer_type_Transient-Party',
    'reservation_status_Check-Out', 'reservation_status_No-Show',
    'guest_location_Local', 'hotel_Resort Hotel',
    'market_segment_Complementary', 'market_segment_Corporate', 'market_segment_Direct',
    'market_segment_Groups', 'market_segment_Offline TA/TO', 'market_segment_Online TA',
    'distribution_channel_Direct', 'distribution_channel_GDS',
    'distribution_channel_TA/TO', 'distribution_channel_Undefined',
    'assigned_room_type_B', 'assigned_room_type_C', 'assigned_room_type_D',
    'assigned_room_type_E', 'assigned_room_type_F', 'assigned_room_type_G',
    'assigned_room_type_H', 'assigned_room_type_I',
]

# Training data must contain every listed column, so plain selection is used
# and a missing column fails loudly.
train_sample = df_train1[sample_columns]

# A dummy column can be absent from a smaller split when its category never
# occurs there; reindex adds it as all zeros so the schema matches training.
val_sample = df_val1.reindex(columns=sample_columns, fill_value=0)
test_sample = df_test1.reindex(columns=sample_columns, fill_value=0)
    
    
   

In [None]:
# Classes: number of bookings per is_canceled value, with the count written above each bar
plt.figure(figsize=(5, 6))
# Pass the column as x= by keyword: current seaborn releases no longer accept
# the data vector as a positional argument.
cancel_ax = sns.countplot(x=df_clean['is_canceled'])

for p in cancel_ax.patches:
    # Anchor the label at the top-centre of the bar and lift it 9 points
    cancel_ax.annotate(format(p.get_height(), '.0f'),
                   (p.get_x() + p.get_width() / 2., p.get_height()),
                   ha = 'center', va = 'center',
                   xytext = (0, 9),
                   textcoords = 'offset points')

In [None]:
# Cancellation Rate
labels = ['Confirmed Booking',  'Cancelled Booking']
myexplode = [0.2, 0]
# Order the counts explicitly as [0, 1] so they always line up with `labels`;
# value_counts() alone orders by frequency, which would swap the labels if
# cancelled bookings ever became the majority class.
class_counts = df_clean['is_canceled'].value_counts().reindex([0, 1], fill_value=0)
cancel_fig, cancel_ax = plt.subplots(figsize=[10,6])
cancel_ax.pie(class_counts, autopct='%1.1f%%',
        shadow=True, startangle=90, explode = myexplode, textprops={'color':"black", 'fontsize':20}, labels=labels)
cancel_ax.axis('equal');

---
## What is the month in which there were the most cancellations, and what is the month in which there were the least cancellations?

In [None]:
# Mean of is_canceled (i.e. the cancellation rate) per arrival month
plt.figure(figsize=(12, 10))
# x= / y= are given by keyword: current seaborn releases reject positional data vectors.
sns.barplot(x=df_clean['arrival_date_month'], y=df_clean['is_canceled'], color='steelblue')

plt.xlabel('arrival_date_month', fontsize=12)
plt.ylabel('is_canceled', fontsize=12)
plt.title('Cancellation Rate By Months', fontsize=20, pad=30)
plt.show()

# Which year had the most reservations?

In [None]:
# Number of training bookings per arrival year, largest first,
# as a two-column frame ('year', 'count')
counts_year = (
    df_train['arrival_date_year']
    .value_counts()
    .sort_values(ascending=False)
    .rename_axis('year')
    .reset_index(name='count')
)
counts_year

In [None]:
# Bar chart of the yearly booking counts
year_ax = sns.barplot(data=counts_year, x='year', y='count', color='#4682B4')
year_ax.set_xlabel('arrival date year')
year_ax.set_ylabel('counts')
year_ax.set_title('The Most reservation  year', fontsize=10, weight='bold', color='black')

# What is the cancellation rate for each hotel?

In [None]:
# Sum of is_canceled per hotel in the training split
# (this is a total per hotel, not a rate)
Most_hotel_cancellation = df_train.groupby('hotel')['is_canceled'].sum()
Most_hotel_cancellation

In [None]:
# Pie chart: each hotel's share of the summed cancellations
c = ['#4682B4','#B0C4DE']
fig, pie_ax = plt.subplots(figsize=(7, 7))
pie_ax.set_title('Cancellation rate per hotel', fontsize=12, weight='bold')
pie_ax.pie(Most_hotel_cancellation, labels=Most_hotel_cancellation.index,
           autopct="%0.1f%%", pctdistance=0.6, colors=c);

# Is the number of booking changes affected by the market segment (market_segment)?

In [None]:
# Mean number of booking changes per market segment, with seaborn's default
# confidence interval as the error bar.
plt.figure(figsize=(8,4))
# `ci=True` was removed: True is read as the number 1, i.e. a 1% interval,
# which draws misleadingly tiny error bars.
sns.barplot(data=df_train, x='market_segment', y='booking_changes', color='steelblue')
plt.ylabel('booking_changes')
plt.xlabel('market_segment');


# Is booking changes affected by customer type or by days in waiting list?

In [None]:
# Mean days in waiting list for each number of booking changes.
# Aggregating first draws one bar per booking_changes value instead of one
# overlapping bar per booking row.
waiting_by_changes = df_train.groupby('booking_changes')['days_in_waiting_list'].mean()
plt.bar(waiting_by_changes.index, waiting_by_changes.values, color='steelblue')
plt.xlabel('booking_changes')
plt.ylabel('mean days_in_waiting_list')
plt.title('Days in waiting list by booking changes');

In [None]:
# Mean number of booking changes per customer type.
# customer_type is categorical, so it belongs on the x axis; using it as the bar
# height (one bar per booking row) did not produce a meaningful chart.
sns.barplot(data=df_train, x='customer_type', y='booking_changes', color='steelblue')
plt.xlabel('customer_type')
plt.ylabel('booking_changes')
plt.title('Booking changes by customer type');