###Load both datasets and inspect their structure

In [1]:
import pandas as pd

# load the dataset

users_df = pd.read_csv('../data/users.csv')
hotels_df = pd.read_csv('../data/hotels.csv')

#display the first few rows of each dataframe
print(users_df.head())
print(hotels_df.head())

#check the basic information of each dataframe
print(users_df.info())
print(hotels_df.info())

#check for the missing values in each dataframe
print(users_df.isnull().sum())
print(hotels_df.isnull().sum())

   code company             name  gender  age
0     0    4You        Roy Braun    male   21
1     1    4You   Joseph Holsten    male   37
2     2    4You    Wilma Mcinnis  female   48
3     3    4You     Paula Daniel  female   23
4     4    4You  Patricia Carson  female   44
   travelCode  userCode     name               place  days   price    total  \
0           0         0  Hotel A  Florianopolis (SC)     4  313.02  1252.08   
1           2         0  Hotel K       Salvador (BH)     2  263.41   526.82   
2           7         0  Hotel K       Salvador (BH)     3  263.41   790.23   
3          11         0  Hotel K       Salvador (BH)     4  263.41  1053.64   
4          13         0  Hotel A  Florianopolis (SC)     1  313.02   313.02   

         date  
0  09/26/2019  
1  10/10/2019  
2  11/14/2019  
3  12/12/2019  
4  12/26/2019  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1340 entries, 0 to 1339
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  -----

In [2]:
# we will convert necessary columns into appropriate formats to ensure smooth analysis
# convert date columns to datetime format

hotels_df["date"]  = pd.to_datetime(hotels_df["date"], format="%m/%d/%Y")

#convert categorical values to lowercase for consistency
users_df["company"] = users_df["company"].str.lower().str.strip()
hotels_df["place"] = hotels_df["place"].str.lower().str.strip()

#convert price and total to float if needed
hotels_df["price"] = hotels_df["price"].astype(float)
hotels_df["total"] = hotels_df["total"].astype(float)

In [3]:
# display missing value count

print(users_df.isnull().sum())
print(hotels_df.isnull().sum())

# fill the missing values with median
hotels_df["price"].fillna(hotels_df["price"].median(), inplace=True)
hotels_df["total"].fillna(hotels_df["total"].median(), inplace=True)

#fill the missing categorical values with unknown
users_df["company"].fillna("unknown", inplace=True)
users_df["name"].fillna("unknown", inplace=True)

code       0
company    0
name       0
gender     0
age        0
dtype: int64
travelCode    0
userCode      0
name          0
place         0
days          0
price         0
total         0
date          0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hotels_df["price"].fillna(hotels_df["price"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hotels_df["total"].fillna(hotels_df["total"].median(), inplace=True)


In [4]:
# find out duplicate entries and rows
print(users_df.duplicated().sum())
print(hotels_df.duplicated().sum())

# remove duplicate entries
users_df.drop_duplicates(inplace=True)
hotels_df.drop_duplicates(inplace=True)

0
0


In [5]:
# Save cleaned datasets for reuse
users_df.to_csv("../data/cleaned_users.csv", index=False)
hotels_df.to_csv("../data/cleaned_hotels.csv", index=False)

print("Cleaned datasets saved successfully!")


Cleaned datasets saved successfully!
