In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/Hotel-Property-Value-Dataset/sample_submission.csv
/kaggle/input/Hotel-Property-Value-Dataset/train.csv
/kaggle/input/Hotel-Property-Value-Dataset/test.csv


In [2]:
# Load the training and test splits, then print a column/dtype/null summary
# for each (train first, test second).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/Hotel-Property-Value-Dataset/train.csv",
        "/kaggle/input/Hotel-Property-Value-Dataset/test.csv",
    )
)
for loaded_frame in (train_df, test_df):
    loaded_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 81 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Id                       1200 non-null   int64  
 1   PropertyClass            1200 non-null   int64  
 2   ZoningCategory           1200 non-null   object 
 3   RoadAccessLength         977 non-null    float64
 4   LandArea                 1200 non-null   int64  
 5   RoadType                 1200 non-null   object 
 6   ServiceLaneType          75 non-null     object 
 7   PlotShape                1200 non-null   object 
 8   LandElevation            1200 non-null   object 
 9   UtilityAccess            1200 non-null   object 
 10  PlotConfiguration        1200 non-null   object 
 11  LandSlope                1200 non-null   object 
 12  District                 1200 non-null   object 
 13  NearbyTransport1         1200 non-null   object 
 14  NearbyTransport2        

In [3]:
# --- Missing-value analysis and imputation -----------------------------------
# All imputation statistics (median, mode) are computed on the training data
# only and then applied to both the train and the test frame.

train_df.drop_duplicates(inplace=True)

# Percentage of missing values per column, highest first, with dtype alongside.
missing_summary = (
    train_df.isnull().sum() / len(train_df) * 100
).sort_values(ascending=False).to_frame("missing_percent")
missing_summary["dtype"] = train_df.dtypes  # aligned on column name

has_missing = missing_summary["missing_percent"] > 0

# Numeric columns with some missingness
numeric_missing = missing_summary[
    has_missing &
    ((missing_summary["dtype"] == "float64") |
     (missing_summary["dtype"] == "int64"))
]
print(numeric_missing)

# Categorical (object) columns with some missingness
categorical_missing = missing_summary[
    has_missing &
    (missing_summary["dtype"] == "object")
]
print(categorical_missing)

# RoadAccessLength
# Three numeric features have missing values. The first is RoadAccessLength,
# the length of road access available to the property. If missing meant "no
# road access" we would fill 0, but RoadType has no nulls, so every property
# has some road leading up to it. The values are therefore treated as missing
# at random; the missing share is moderate (18.58%), so impute the train median.
road_access_median = train_df["RoadAccessLength"].median()
train_df["RoadAccessLength"] = train_df["RoadAccessLength"].fillna(road_access_median)
test_df["RoadAccessLength"] = test_df["RoadAccessLength"].fillna(road_access_median)

# Parking columns
# Rows with missing parking attributes are the rows where ParkingArea and
# ParkingCapacity are both 0, i.e. properties without parking. Categorical
# parking features are imputed with "None" and numeric ones with 0.
# A binary HasParking indicator is added as well so the imputed zeros are
# explicitly flagged instead of acting as outliers.
# NOTE: parking_cols is collected before HasParking is created, so the new
# indicator is not part of the list.
parking_cols = [col for col in train_df.columns if "parking" in col.lower()]

parking_numeric = train_df[parking_cols].select_dtypes(include=[np.number]).columns.tolist()
parking_categorical = train_df[parking_cols].select_dtypes(exclude=[np.number]).columns.tolist()

incomplete_parking_cols = [col for col in parking_cols if train_df[col].isna().any()]

# Masks for properties with no parking
no_parking_mask = (train_df["ParkingArea"] == 0) & (train_df["ParkingCapacity"] == 0)
train_df["HasParking"] = np.where(no_parking_mask, 0, 1)

no_parking_mask_test = (test_df["ParkingArea"] == 0) & (test_df["ParkingCapacity"] == 0)
test_df["HasParking"] = np.where(no_parking_mask_test, 0, 1)

for col in incomplete_parking_cols:
    # parking_numeric / parking_categorical partition parking_cols, so every
    # column is either categorical ("None") or numeric (0).
    fill_value = "None" if col in parking_categorical else 0
    train_df.loc[no_parking_mask & train_df[col].isna(), col] = fill_value
    test_df.loc[no_parking_mask_test & test_df[col].isna(), col] = fill_value

# Facade
# FacadeArea is only missing for rows whose FacadeType is null (no facade):
# impute 0 for the area and "None" for the type.
for frame in (train_df, test_df):
    frame["FacadeArea"] = frame["FacadeArea"].fillna(0)
    frame["FacadeType"] = frame["FacadeType"].fillna("None")

# Structurally missing categoricals
# For these high-missingness features a null means the amenity is absent, not
# that the value is unknown, so they are imputed with "None".
structural_categorical = ["PoolQuality", "ExtraFacility", "ServiceLaneType", "BoundaryFence", "LoungeQuality"]

for col in structural_categorical:
    train_df[col] = train_df[col].fillna("None")
    test_df[col] = test_df[col].fillna("None")

# ElectricalSystem
# Null for a single training row; treated as missing at random and imputed
# with the most common training category. The same value is applied to the
# test frame so a missing test value is not left for later stages.
mode_value = train_df["ElectricalSystem"].mode()[0]
train_df["ElectricalSystem"] = train_df["ElectricalSystem"].fillna(mode_value)
test_df["ElectricalSystem"] = test_df["ElectricalSystem"].fillna(mode_value)

# Basement
# These are missing only for rows with BasementTotalSF == 0 (no basement),
# so they can be imputed with "None".
basement_none_features = [
    "BasementHeight",
    "BasementCondition",
    "BasementFacilityType1",
    "BasementFacilityType2"
]

for col in basement_none_features:
    train_df[col] = train_df[col].fillna("None")
    test_df[col] = test_df[col].fillna("None")

# BasementExposure: "None" for rows with no basement ...
train_df.loc[train_df["BasementTotalSF"] == 0, "BasementExposure"] = "None"

# ... and the training mode for the remaining missing row, which actually has
# a basement.
mode_value = train_df["BasementExposure"].mode()[0]
train_df["BasementExposure"] = train_df["BasementExposure"].fillna(mode_value)

# Apply the same rule to test: rows that demonstrably have a basement get the
# training mode; every other missing value keeps the "None" fill.
test_has_basement = test_df["BasementTotalSF"] > 0
test_df.loc[test_has_basement & test_df["BasementExposure"].isna(), "BasementExposure"] = mode_value
test_df["BasementExposure"] = test_df["BasementExposure"].fillna("None")

# Verification: list any columns that still contain missing values
missing_counts = train_df.isnull().sum()
print(missing_counts[missing_counts > 0])

missing_counts_test = test_df.isnull().sum()
print(missing_counts_test[missing_counts_test > 0])

                         missing_percent    dtype
RoadAccessLength               18.583333  float64
ParkingConstructionYear         5.416667  float64
FacadeArea                      0.583333  float64
                       missing_percent   dtype
PoolQuality                  99.500000  object
ExtraFacility                96.166667  object
ServiceLaneType              93.750000  object
BoundaryFence                80.250000  object
FacadeType                   58.500000  object
LoungeQuality                46.666667  object
ParkingCondition              5.416667  object
ParkingType                   5.416667  object
ParkingFinish                 5.416667  object
ParkingQuality                5.416667  object
BasementExposure              2.500000  object
BasementHeight                2.416667  object
BasementCondition             2.416667  object
BasementFacilityType2         2.416667  object
BasementFacilityType1         2.416667  object
ElectricalSystem              0.083333  object
S

In [4]:
# Sanity-check the numeric columns for negative and infinite values.
# Train and test are stacked so that BOTH splits are validated, not just the
# training rows. Columns present in only one split become NaN for the other
# split's rows after concatenation; NaN is neither < 0 nor infinite, so those
# cells do not produce false positives.
combined = pd.concat([train_df, test_df], axis=0, ignore_index=True)
numeric_cols = combined.select_dtypes(include=[np.number]).columns

# Check for negative values
negative_values = (combined[numeric_cols] < 0).sum()
print("Negative values per column:\n", negative_values[negative_values > 0])

# Check for infinite values
infinite_values = np.isinf(combined[numeric_cols]).sum()
print("Infinite values per column:\n", infinite_values[infinite_values > 0])

# NOTE(review): the recorded run covered the training rows only and showed no
# negative or infinite values; re-check the output now that the test rows are
# included.

Negative values per column:
 Series([], dtype: int64)
Infinite values per column:
 Series([], dtype: int64)


In [5]:
# Pre-processing specific to tree-based models like RandomForest and
# GradientBoost: categorical data must be label encoded.
from sklearn.preprocessing import LabelEncoder

# Identify categorical columns (object dtype in the training frame)
categorical_cols = train_df.select_dtypes(include=["object"]).columns

# Apply label encoding consistently to train and test
for col in categorical_cols:
    le = LabelEncoder()

    # Fit on the training data only, then encode it.
    train_df[col] = le.fit_transform(train_df[col].astype(str))

    # Build the label -> code lookup once per column instead of calling
    # le.transform() (and scanning le.classes_) for every test row.
    # The codes are the positions in le.classes_, i.e. what le.transform returns.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}

    # Labels never seen in training are absent from the lookup, map to NaN and
    # are then encoded as -1.
    test_df[col] = (
        test_df[col].astype(str).map(code_by_label).fillna(-1).astype("int64")
    )

print(train_df.shape)
print(train_df.head())
print("Pre-processing complete")

(1200, 82)
    Id  PropertyClass  ZoningCategory  RoadAccessLength  LandArea  RoadType  \
0  775             20               3             110.0     14226         1   
1  673             20               3              70.0     11250         1   
2  234             20               3              75.0     10650         1   
3  427             80               3              70.0     12800         1   
4  197             20               3              79.0      9416         1   

   ServiceLaneType  PlotShape  LandElevation  UtilityAccess  ...  PoolQuality  \
0                1          3              3              0  ...            3   
1                1          0              3              0  ...            3   
2                1          3              3              0  ...            3   
3                1          3              2              0  ...            3   
4                1          3              3              0  ...            3   

   BoundaryFence  ExtraFaci

In [6]:
# Random Forest: randomized hyperparameter search, then evaluation on a
# 20% hold-out split of the training data.

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error


# Target vector and feature matrix.
# NOTE(review): X keeps every non-target column of train_df, including 'Id' —
# confirm that using the identifier as a feature is intended.
y = train_df['HotelValue']
X = train_df.drop(columns=['HotelValue'])

# 80/20 split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Base estimator whose hyperparameters are searched
rf_regressor = RandomForestRegressor(random_state=42)

# Candidate values sampled by the randomized search
param_dist = dict(
    n_estimators=[750, 1000, 1500, 2000, 2500, 3000, 3500],  # trees in the forest
    max_depth=[10, 20, 30, None],                             # maximum tree depth
    min_samples_split=[2, 5, 10],                             # samples needed to split a node
    min_samples_leaf=[1, 2, 4],                               # samples needed at a leaf
    max_features=['sqrt', 0.5, 0.7, 1.0],                     # features considered per split
    bootstrap=[True, False],                                  # bootstrap-sample rows per tree or not
)

# 50 sampled parameter combinations x 6-fold CV, scored by negative MSE,
# using all available cores.
random_search = RandomizedSearchCV(
    rf_regressor,
    param_dist,
    n_iter=50,
    scoring='neg_mean_squared_error',
    cv=6,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

# Run the search and keep the best estimator found
random_search.fit(X_train, y_train)
best_rf_model = random_search.best_estimator_

# Evaluate on the hold-out split
y_pred = best_rf_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

report_lines = (
    "\n--- Random Forest Model Results (Tuned) ---",
    f"Best Hyperparameters: {random_search.best_params_}",
    f"Test Root Mean Squared Error (RMSE): ${rmse:,.2f}",
    f"Test Mean Absolute Error (MAE): ${mae:,.2f}",
)
print("\n".join(report_lines))

Fitting 6 folds for each of 50 candidates, totalling 300 fits

--- Random Forest Model Results (Tuned) ---
Best Hyperparameters: {'n_estimators': 1500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 0.5, 'max_depth': None, 'bootstrap': False}
Test Root Mean Squared Error (RMSE): $29,549.11
Test Mean Absolute Error (MAE): $19,481.09


In [7]:
# Predict on the competition test set and write the submission file.

# Select exactly the columns the model was trained on, in training order.
# Passing the raw test_df relied on its column set and order happening to
# match the training matrix X; an explicit selection makes the alignment
# deliberate and fails loudly (KeyError) if a training feature is missing.
X_pred_test = test_df[X.columns]
test_predictions = best_rf_model.predict(X_pred_test)

# Ids come straight from test_df, so they stay row-aligned with the predictions.
test_ids = test_df['Id'].copy()

submission_df = pd.DataFrame({
    'Id': test_ids,
    'HotelValue': test_predictions
})

submission_df.to_csv('test_predictions_final.csv', index=False)
print("\nTest predictions saved to 'test_predictions_final.csv'.")
print(submission_df)


Test predictions saved to 'test_predictions_final.csv'.
       Id     HotelValue
0     893  140114.693500
1    1106  322633.345944
2     414  114025.195556
3     523  154383.584722
4    1037  310805.039056
..    ...            ...
255  1422  135375.167833
256  1178  116289.234333
257  1292  115178.594111
258   782  180282.569278
259  1448  263327.636944

[260 rows x 2 columns]
