**Data Preprocessing:**

In [1]:
#Importing libraries for Preprocessing
import numpy as np # used for handling numbers
import pandas as pd # used for handling the dataset
import datetime

In [2]:
# Read the hotel bookings CSV into a DataFrame, then preview its first five rows
hotel = pd.read_csv(filepath_or_buffer='../input/hotel-booking-demand/hotel_bookings.csv')
hotel.head(n=5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [3]:
# Print a summary of the raw frame: index range, column names, non-null counts and dtypes
hotel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [4]:
# Count the missing entries in every column of the raw frame
x = hotel.isna().sum(axis=0)
x

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [5]:
# Fill values: a missing 'children' count becomes 0, a missing 'country' becomes 'Not Available'
replacing = {'children': 0, 'country': 'Not Available'}

# Apply the fills and leave out the 'company' and 'agent' columns in a single chained step;
# the raw `hotel` frame itself is not modified.
hotel_new = hotel.fillna(value=replacing).drop(columns=['company', 'agent'])

hotel_new.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,assigned_room_type,booking_changes,deposit_type,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,C,3,No Deposit,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,C,4,No Deposit,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,C,0,No Deposit,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,A,0,No Deposit,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,A,0,No Deposit,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [6]:
# Cast the 'children' column to integer (its missing values were filled earlier)
hotel_new = hotel_new.astype({'children': int})

In [7]:
# Fold the 'babies' count into 'children', then discard the separate 'babies' column
hotel_new['children'] = hotel_new['children'].add(hotel_new['babies'])
hotel_new = hotel_new.drop(columns=['babies'])

In [8]:
#Undefined is equal to SC that is no meal package.
# Assign the result back instead of calling replace(inplace=True) on the selected column:
# the in-place call acts on the object returned by hotel_new['meal'], which pandas warns
# about (chained assignment) and which does not write through to hotel_new when
# Copy-on-Write is active.
hotel_new['meal'] = hotel_new['meal'].replace("Undefined", "SC")

In [9]:
# Derive the two totals (guests = adults + children, stays = weekend + week nights),
# then remove the two night-count columns that total_stays replaces
hotel_new = hotel_new.assign(
    total_guests=hotel_new['adults'].add(hotel_new['children']),
    total_stays=hotel_new['stays_in_weekend_nights'].add(hotel_new['stays_in_week_nights']),
).drop(columns=['stays_in_weekend_nights', 'stays_in_week_nights'])

In [10]:
# Keep only the bookings whose total_guests is not 0
hotel_new = hotel_new.loc[hotel_new['total_guests'].ne(0)]

In [11]:
# making column arrival date
# Converting string month to numerical one.
# The three-letter month abbreviation is parsed for the whole column at once instead of
# looping row by row and overwriting string entries of the Series with datetime objects:
# the loop is slow on a frame this size, and storing datetimes into a Series of strings
# is rejected or warned about by stricter pandas string dtypes.
datetime_object = pd.to_datetime(hotel['arrival_date_month'].str[0:3], format="%b")

# Numerical representation of the months, as integers.
# Kept as a one-column DataFrame (column label 0) so that code using month_number[0]
# keeps working; it carries hotel's index, so it aligns row-for-row with hotel's columns.
month_number = datetime_object.dt.month.astype(int).to_frame(name=0)

In [12]:
# Build a 'year-month-day' string per booking from the raw frame's year and day columns
# and the numeric month computed earlier; assignment aligns on the row index.
year_part = hotel['arrival_date_year'].astype(str)
month_part = month_number[0].astype(str)
day_part = hotel['arrival_date_day_of_month'].astype(str)
hotel_new['arrival_date'] = year_part + '-' + month_part + '-' + day_part

In [13]:
# The separate year / month / day / week-number columns are removed now that
# 'arrival_date' has been built
date_part_columns = [
    'arrival_date_year',
    'arrival_date_month',
    'arrival_date_day_of_month',
    'arrival_date_week_number',
]
hotel_new = hotel_new.drop(columns=date_part_columns)
hotel_new.head()

Unnamed: 0,hotel,is_canceled,lead_time,adults,children,meal,country,market_segment,distribution_channel,is_repeated_guest,...,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,total_guests,total_stays,arrival_date
0,Resort Hotel,0,342,2,0,BB,PRT,Direct,Direct,0,...,0,Transient,0.0,0,0,Check-Out,2015-07-01,2,0,2015-7-1
1,Resort Hotel,0,737,2,0,BB,PRT,Direct,Direct,0,...,0,Transient,0.0,0,0,Check-Out,2015-07-01,2,0,2015-7-1
2,Resort Hotel,0,7,1,0,BB,GBR,Direct,Direct,0,...,0,Transient,75.0,0,0,Check-Out,2015-07-02,1,1,2015-7-1
3,Resort Hotel,0,13,1,0,BB,GBR,Corporate,Corporate,0,...,0,Transient,75.0,0,0,Check-Out,2015-07-02,1,1,2015-7-1
4,Resort Hotel,0,14,2,0,BB,GBR,Online TA,TA/TO,0,...,0,Transient,98.0,0,1,Check-Out,2015-07-03,2,2,2015-7-1


In [14]:
# Parse the two date columns, currently held as strings, into datetime values
hotel_new = hotel_new.assign(
    arrival_date=pd.to_datetime(hotel_new['arrival_date']),
    reservation_status_date=pd.to_datetime(hotel_new['reservation_status_date']),
)

In [15]:
# Final info: print the summary (index, columns, non-null counts, dtypes) of the
# preprocessed frame
hotel_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 119210 entries, 0 to 119389
Data columns (total 26 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   hotel                           119210 non-null  object        
 1   is_canceled                     119210 non-null  int64         
 2   lead_time                       119210 non-null  int64         
 3   adults                          119210 non-null  int64         
 4   children                        119210 non-null  int64         
 5   meal                            119210 non-null  object        
 6   country                         119210 non-null  object        
 7   market_segment                  119210 non-null  object        
 8   distribution_channel            119210 non-null  object        
 9   is_repeated_guest               119210 non-null  int64         
 10  previous_cancellations          119210 non-null  int64  

In [16]:
# Count the missing entries per column of the preprocessed frame to confirm none remain
hotel_new.isna().sum(axis=0)

hotel                             0
is_canceled                       0
lead_time                         0
adults                            0
children                          0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
days_in_waiting_list              0
customer_type                     0
adr                               0
required_car_parking_spaces       0
total_of_special_requests         0
reservation_status                0
reservation_status_date           0
total_guests                      0
total_stays                       0
arrival_date                      0
dtype: int64