In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
import re
df = pd.read_csv('hotels.csv')

describe = df.describe()

df = df.drop_duplicates()
duplicates = df.duplicated().sum()

missing_before = df.isnull().sum()
lat_lng = pd.read_csv('lat_lng.csv')

df = df.merge(lat_lng, on='Hotel_Name', how='left', suffixes=('', '_latlng'))
df['lat'] = df['lat'].fillna(df['lat_latlng'])
df['lng'] = df['lng'].fillna(df['lng_latlng'])

df.drop(columns=['lat_latlng', 'lng_latlng'], inplace=True)
count_nan = df.isnull().sum()

def extract_country_city(address):
    parts = address.split()
    country = parts[-1]
    city = parts[-3] if len(parts) >= 3 else ''
    return pd.Series([country, city])

def get_guest_type(tags):
    tags = tags.lower()
    if 'solo traveler' in tags:
        return 'solo'
    elif 'couple' in tags:
        return 'couple'
    elif 'group' in tags:
        return 'group'
    else:
        return 'other'

def parse_tags(tag_str):
    if isinstance(tag_str, list):
        return tag_str
    clean = tag_str.strip("[]").replace("'", "").split(", ")
    return [t.strip() for t in clean]

df[['country', 'city']] = df['Hotel_Address'].apply(extract_country_city)
df['Review_Date'] = pd.to_datetime(df['Review_Date'])

def get_season(date):
    if 6 <= date.month <= 8:
        return 'high'
    elif date.month == 12 or 1 <= date.month <= 2:
        return 'low'
    else:
        return 'shoulder'

df['season'] = df['Review_Date'].apply(get_season)
df['Reviewer_Nationality'] = df['Reviewer_Nationality'].str.strip()
top_100_nationalities = df['Reviewer_Nationality'].value_counts().nlargest(100).index
df['Reviewer_Nationality'] = df['Reviewer_Nationality'].apply(
    lambda x: x if x in top_100_nationalities else 'other'
)

tags_series = df['Tags'].dropna()
all_tags = tags_series.apply(parse_tags)
tags_flat = [tag for sublist in all_tags for tag in sublist]
tags_list = sorted(set(tags_flat))

df['guest_type'] = df['Tags'].astype(str).apply(get_guest_type)
drop_cols = ['Hotel_Address', 'Review_Date', 'Tags']
df.drop(columns=drop_cols, inplace=True)

cat_features = ['country', 'season', 'Reviewer_Nationality', 'guest_type']
encoder = OneHotEncoder(drop='first')  
encoded_df = pd.DataFrame(encoder, columns=encoder.get_feature_names_out(cat_features))

df_encoded = pd.concat([df.drop(columns=cat_features).reset_index(drop=True),
                        encoded_df.reset_index(drop=True)], axis=1)

NotFittedError: This OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.