In [35]:
import pandas as pd

# Load the price table and the features table, one DataFrame per CSV file
price_data, features_data = (
    pd.read_csv(csv_name)
    for csv_name in ('hotels-europe_price.csv', 'hotels-europe_features.csv')
)

In [36]:
# Join prices with hotel features on the shared 'hotel_id' key.
# how='inner' is the pandas default, spelled out here for the reader.
merged_hotel_data = price_data.merge(
    features_data,
    how='inner',
    on='hotel_id',
)

In [37]:
# Cast accommodation_type and city_actual to pandas' 'string' dtype so that
# they can be filtered on.
# city_actual is cast from its own values: the previous version assigned the
# 'city' column into it, which overwrote the original city_actual data.
merged_hotel_data = merged_hotel_data.astype(
    {'accommodation_type': 'string', 'city_actual': 'string'}
)

In [38]:
# After the merge a hotel_id can appear on several rows; keep only one row per
# hotel_id (keep='first' is the pandas default: the first occurrence is retained).
merged_hotel_data = merged_hotel_data.drop_duplicates(
    subset=['hotel_id'],
    keep='first',
)

In [39]:
# Keep only the rows whose 'city' value is Moscow, then display the subset
merged_hotel_data_moscow = merged_hotel_data[
    merged_hotel_data["city"].isin(['Moscow'])
]
merged_hotel_data_moscow

Unnamed: 0,hotel_id,price,offer,offer_cat,year,month,weekend,holiday,nnights,scarce_room,...,country,city_actual,rating_reviewcount,center1label,center2label,neighbourhood,ratingta,ratingta_count,distance_alter,accommodation_type
77569,10946,16,0,0% no offer,2018,3,1,0,1,1,...,Russia,Moscow,,City centre,Solntsevo Exhibition Hall,Akademicheskiy,4.0,1.0,7.8,Hostel
77578,10947,59,1,15-50% offer,2017,12,0,1,1,0,...,Russia,Moscow,240.0,City centre,Solntsevo Exhibition Hall,Alexeevsky,3.0,2351.0,15.0,Hotel
77587,10948,40,1,15-50% offer,2018,4,1,0,1,0,...,Russia,Moscow,19.0,City centre,Solntsevo Exhibition Hall,Alexeevsky,4.0,34.0,15.0,Hotel
77596,10949,73,1,15-50% offer,2018,1,1,0,1,0,...,Russia,Moscow,31.0,City centre,Solntsevo Exhibition Hall,Alexeevsky,4.0,90.0,15.0,Hotel
77604,10950,40,1,15-50% offer,2017,12,0,1,1,0,...,Russia,Moscow,69.0,City centre,Solntsevo Exhibition Hall,Alexeevsky,3.5,230.0,15.0,Hotel
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80890,11341,466,0,0% no offer,2017,12,0,1,4,1,...,Russia,Moscow,9.0,City centre,Solntsevo Exhibition Hall,Yakimanka,3.5,242.0,10.0,Hotel
80894,11342,92,1,15-50% offer,2017,12,0,1,1,0,...,Russia,Moscow,14.0,City centre,Solntsevo Exhibition Hall,Yakimanka,,,10.0,Apartment
80903,11343,339,0,0% no offer,2017,12,0,1,4,0,...,Russia,Moscow,10.0,City centre,Solntsevo Exhibition Hall,Yakimanka,3.5,146.0,9.9,Hotel
80908,11344,311,1,15-50% offer,2017,12,0,1,4,0,...,Russia,Moscow,,City centre,Solntsevo Exhibition Hall,Yakimanka,5.0,2.0,10.0,Inn


### Ratings Data Cleaning

In [41]:
# Step 1: count the rows of the Moscow subset that have no 'rating' value
null_count = merged_hotel_data_moscow['rating'].isna().sum()
print(
    f"Number of null values in the 'rating' column: {null_count}"
)

Number of null values in the 'rating' column: 92


In [42]:
# Step 2: Replace missing ratings with 0 in the full merged dataset.
# The filled column is assigned back instead of calling
# `merged_hotel_data['rating'].fillna(0, inplace=True)`: that form runs an
# in-place method on a selected column object, which recent pandas versions
# warn about and which does not update the DataFrame under Copy-on-Write.
# NOTE(review): Step 1 counted nulls in merged_hotel_data_moscow, but this step
# fills merged_hotel_data; the Moscow subset was built earlier by boolean
# filtering — confirm whether it should be re-derived after this step.
merged_hotel_data['rating'] = merged_hotel_data['rating'].fillna(0)

In [43]:
# Sanity check: after the fill, no missing ratings should remain (expected output: 0)
print(merged_hotel_data['rating'].isna().sum())

0


### Create `highly_rated` column

In [44]:
# New column 'highly_rated': 1 if 'rating' is greater than or equal to 4, and 0 otherwise.
# The comparison is done on the whole column at once (vectorized) rather than
# calling a Python lambda per row; the boolean result is cast to int64 so the
# column holds the integers 0/1.
merged_hotel_data['highly_rated'] = (merged_hotel_data['rating'] >= 4).astype('int64')
print(merged_hotel_data.head())

    hotel_id  price  offer     offer_cat  year  month  weekend  holiday  \
0          1    172      0   0% no offer  2017     11        1        0   
6          2    119      0   0% no offer  2017     11        0        0   
9          3    118      1  15-50% offer  2017     12        0        1   
14         4    115      1  15-50% offer  2017     12        0        1   
16         5    696      1  15-50% offer  2018      4        1        0   

    nnights  scarce_room  ... city_actual  rating_reviewcount  center1label  \
0         1            0  ...   Amsterdam              1030.0   City centre   
6         1            0  ...   Amsterdam               372.0   City centre   
9         1            0  ...   Amsterdam               165.0   City centre   
14        1            0  ...   Amsterdam               298.0   City centre   
16        1            1  ...   Amsterdam                 4.0   City centre   

        center2label neighbourhood ratingta  ratingta_count distance_alter