In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("hotel_bookings.csv")

# ✅ Step 1: Drop duplicates
df.drop_duplicates(inplace=True)

In [2]:
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,01-07-15
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,01-07-15
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,02-07-15
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,02-07-15
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,03-07-15


In [3]:
# ✅ Step 2: Fill missing values
df["children"].fillna(df["children"].median(), inplace=True)  # Fill children with median
df["country"].fillna(df["country"].mode()[0], inplace=True)   # Fill country with most common
df["agent"].fillna(0, inplace=True)  # Replace missing agent IDs with 0
df["company"].fillna(0, inplace=True)  # Replace missing company IDs with 0

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["children"].fillna(df["children"].median(), inplace=True)  # Fill children with median
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["country"].fillna(df["country"].mode()[0], inplace=True)   # Fill country with most common
The behavior will change in pandas 3.0. This inp

In [4]:
# ✅ Step 3: Convert data types
df["reservation_status_date"] = pd.to_datetime(df["reservation_status_date"])  # Convert date column
df[["children", "company", "agent"]] = df[["children", "company", "agent"]].astype("int64")  # Convert to integers


  df["reservation_status_date"] = pd.to_datetime(df["reservation_status_date"])  # Convert date column


In [6]:

# ✅ Step 4: Remove impossible bookings (where total guests = 0)
df = df[(df["adults"] + df["children"] + df["babies"]) > 0]

In [7]:
# ✅ Step 5: Save cleaned data
cleaned_file_path = "hotel_bookings_cleaned.csv"
df.to_csv(cleaned_file_path, index=False)


In [8]:

print(f"Cleaned dataset saved as {cleaned_file_path}")

Cleaned dataset saved as hotel_bookings_cleaned.csv


In [9]:
df = pd.read_csv("hotel_bookings_cleaned.csv")
print(df.info())  # Check for missing values
print(df.head())  # Preview first few rows


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87230 entries, 0 to 87229
Data columns (total 32 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   hotel                           87230 non-null  object 
 1   is_canceled                     87230 non-null  int64  
 2   lead_time                       87230 non-null  int64  
 3   arrival_date_year               87230 non-null  int64  
 4   arrival_date_month              87230 non-null  object 
 5   arrival_date_week_number        87230 non-null  int64  
 6   arrival_date_day_of_month       87230 non-null  int64  
 7   stays_in_weekend_nights         87230 non-null  int64  
 8   stays_in_week_nights            87230 non-null  int64  
 9   adults                          87230 non-null  int64  
 10  children                        87230 non-null  int64  
 11  babies                          87230 non-null  int64  
 12  meal                            