# Hotel Property Value — Preprocessing & EDA

**Author(s):** Gradient Gang

**Purpose:** EDA, cleaning, feature engineering, and preprocessing pipelines.

In [None]:
# === Basic Imports ===
import os, random, gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler


# === File Paths ===
# Locations of the competition CSVs.
TRAIN_PATH = "/kaggle/input/Hotel-Property-Value-Dataset/train.csv"
TEST_PATH = "/kaggle/input/Hotel-Property-Value-Dataset/test.csv"
SAMPLE_PATH = "/kaggle/input/Hotel-Property-Value-Dataset/sample_submission.csv"

# === Load Data ===
# Read the three files in one pass; the order matches the names on the left.
train, test, sample = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, TEST_PATH, SAMPLE_PATH)
)

# Report (rows, columns) for the two modelling frames.
for label, frame in (("Train", train), ("Test", test)):
    print(f"{label} shape:", frame.shape)

In [None]:
# === Check for Duplicates ===
# Drop exact duplicate rows in place, then report the resulting shapes.
for frame in (train, test):
    frame.drop_duplicates(inplace=True)

for label, frame in (("Train", train), ("Test", test)):
    print(f"{label} shape:", frame.shape)


# === Missing Values ===
def _null_percentages(frame):
    """Return, per column, the percentage of rows in ``frame`` that are null."""
    return (frame.isna().sum() / frame.shape[0]) * 100


# Training data: percentage per column, then only the columns that have nulls.
null_value_percentages = _null_percentages(train)
print("\nNull value percentages (training):\n", null_value_percentages)

columns_with_null_values = null_value_percentages.loc[null_value_percentages > 0]
print("\nColumns with null values (training):\n", columns_with_null_values)


# Test data: same two summaries.
null_value_percentages_test = _null_percentages(test)
print("\nNull value percentages (test):\n", null_value_percentages_test)

columns_with_null_values_test = null_value_percentages_test.loc[null_value_percentages_test > 0]
print("\nColumns with null values (test):\n", columns_with_null_values_test)

Here we have divided the columns into the following categories, based on their percentage of null values:
* Columns with very few null values (more than 0% and less than 5%)
* Columns with a moderate amount of null values (5–50%)
* Columns with a lot of null values (>50%)

### Null Value Handling

In [None]:
#=== Dropping Columns with lot of null values ===
# A column is dropped from a dataset when more than 50% of that dataset's own
# rows are null in it.
# NOTE(review): the train and test drop lists are computed independently, so
# the two frames can end up with different columns — confirm that is intended.

# --- training data ---
exceeds_half_train = null_value_percentages > 50
columns_to_drop = null_value_percentages.loc[exceeds_half_train].index
print("training data:", columns_to_drop)

train.drop(columns=columns_to_drop, inplace=True)
# Rows can become identical once columns are removed, so de-duplicate again.
train.drop_duplicates(inplace=True)

#-----------------------------------------------------------
# --- testing data ---
exceeds_half_test = null_value_percentages_test > 50
columns_to_drop_test = null_value_percentages_test.loc[exceeds_half_test].index
print("testing data:", columns_to_drop_test)

test.drop(columns=columns_to_drop_test, inplace=True)
test.drop_duplicates(inplace=True)

In [None]:
#=== Checking Columns with moderate amount of null values ===
def _report_columns(frame, names):
    """Print a summary of each named column of ``frame``.

    Object-dtype columns get their value counts (nulls included); every other
    column gets ``describe()`` followed by its null count.
    """
    for name in names:
        series = frame[name]
        if series.dtype == 'object':
            print(f"CATEGORICAL: {name}")
            print(series.value_counts(dropna=False))
            continue
        null_total = series.isna().sum()
        print(f"NUMERIC: {name}")
        print(series.describe())
        print(f"Missing values: {null_total}")


# "Moderate" means between 5% and 50% null, both ends inclusive.
in_band_train = (null_value_percentages >= 5) & (null_value_percentages <= 50)
in_band_test = (null_value_percentages_test >= 5) & (null_value_percentages_test <= 50)
moderate_null_cols = null_value_percentages.loc[in_band_train].index
moderate_null_cols_test = null_value_percentages_test.loc[in_band_test].index

print("training data:", moderate_null_cols)
print("testing data:", moderate_null_cols_test)

print("\n")

_report_columns(train, moderate_null_cols)
print("-" * 50)
_report_columns(test, moderate_null_cols_test)

In [None]:
#=== Imputing Columns with moderate amount of null values ===

# Columns where a null is read as "the amenity is absent"; each value is the
# label written in place of the null.
categorical_absence_map = {
    'LoungeQuality': 'NoLounge',
    'ParkingType': 'NoParking',
    'ParkingFinish': 'NoParking',
    'ParkingQuality': 'NoParking',
    'ParkingCondition': 'NoParking'
}

# Seed for reproducibility
np.random.seed(42)


def _fill_nulls_by_sampling(df, col, pool, fallback_pool):
    """Replace the nulls of ``df[col]`` in place with random draws from ``pool``.

    Draws are made with replacement via ``np.random.choice``. ``fallback_pool``
    is used when ``pool`` is empty; if both are empty the column is left
    untouched, because ``np.random.choice`` cannot sample from nothing.
    """
    null_mask = df[col].isna()
    n_missing = int(null_mask.sum())
    if n_missing == 0:
        return
    if len(pool) == 0:
        pool = fallback_pool
    if len(pool) == 0:
        return
    df.loc[null_mask, col] = np.random.choice(pool, size=n_missing, replace=True)


def _sampling_pool(series, categorical):
    """Return the values to sample from: distinct values if categorical, else all observed values."""
    observed = series.dropna()
    return observed.unique() if categorical else observed.values


# ===== Imputation =====
# NOTE(review): only columns in `moderate_null_cols` (derived from train) are
# processed; columns that are moderately null in test alone are not imputed here.
for col in moderate_null_cols:
    # The test frame may have lost this column when its own mostly-null
    # columns were dropped, so every access to test[col] is guarded.
    in_test = col in test.columns

    # Add missing flag (must be recorded before the nulls are overwritten)
    train[f"{col}_was_missing"] = train[col].isna().astype(int)
    if in_test:
        test[f"{col}_was_missing"] = test[col].isna().astype(int)

    # Categorical columns with "absence" meaning
    if col in categorical_absence_map:
        absence_label = categorical_absence_map[col]
        train[col] = train[col].fillna(absence_label)
        if in_test:
            test[col] = test[col].fillna(absence_label)
        continue

    # Remaining columns are filled by random sampling. The train dtype decides
    # whether a column is treated as categorical for both frames.
    is_categorical = train[col].dtype == 'object'

    # Pool is taken before filling so that it holds observed values only.
    train_pool = _sampling_pool(train[col], is_categorical)
    _fill_nulls_by_sampling(train, col, train_pool, train_pool)

    if in_test:
        # Test nulls are drawn from test's own observed values; the train pool
        # is only a fallback for a test column with no observed value at all.
        test_pool = _sampling_pool(test[col], is_categorical)
        _fill_nulls_by_sampling(test, col, test_pool, train_pool)

In [None]:
#=== Verification ===
# Re-print the same per-column summaries after imputation so the remaining
# null counts can be checked by eye.
_verification_targets = (
    (train, moderate_null_cols),
    (test, moderate_null_cols_test),
)

for position, (frame, names) in enumerate(_verification_targets):
    if position:
        # Separator between the train and test reports.
        print("-" * 50)
    for column in names:
        series = frame[column]
        if series.dtype != 'object':
            missing_count = series.isna().sum()
            print(f"NUMERIC: {column}")
            print(series.describe())
            print(f"Missing values: {missing_count}")
        else:
            print(f"CATEGORICAL: {column}")
            print(series.value_counts(dropna=False))



In [None]:
#=== Checking Columns with less amount of null values ===
# "Low" means strictly more than 0% and strictly less than 5% null.
low_band_train = (null_value_percentages > 0) & (null_value_percentages < 5)
low_band_test = (null_value_percentages_test > 0) & (null_value_percentages_test < 5)
less_null_cols = null_value_percentages.loc[low_band_train].index
less_null_cols_test = null_value_percentages_test.loc[low_band_test].index

print("Training data columns with low nulls:", less_null_cols)
print("Testing data columns with low nulls:", less_null_cols_test)
print("="*80)

# Print a short null/value report for a single column
def inspect_column(df, col):
    """Print the dtype class, null count and share, and a value summary of ``df[col]``.

    Object-dtype columns are reported as categorical with their value counts
    (nulls included); all other columns are reported as numeric with
    ``describe()``. A 50-dash rule closes the report. Returns ``None``.
    """
    series = df[col]
    is_categorical = series.dtype == 'object'
    n_missing = series.isna().sum()
    pct_missing = n_missing / len(df) * 100
    kind_label = 'Categorical' if is_categorical else 'Numeric'

    print(f"Column: {col}")
    print(f"Type: {kind_label}")
    print(f"Missing: {n_missing} ({pct_missing:.3f}%)")

    if is_categorical:
        # value_counts keeps NaN as its own bucket here
        heading = "Value counts (including NaN):"
        summary = series.value_counts(dropna=False)
    else:
        heading = "Descriptive stats (numeric):"
        summary = series.describe()
    print(heading)
    print(summary)
    print("-"*50)

# Inspect training data, then testing data, each under its own banner
for banner, frame, names in (
    ("TRAINING DATA:", train, less_null_cols),
    ("TESTING DATA:", test, less_null_cols_test),
):
    print(banner)
    for col in names:
        inspect_column(frame, col)


In [None]:
import numpy as np
# Fixed seed so the random fills in this cell are repeatable.
np.random.seed(42)

# --- Columns ---
# Low-null band: strictly more than 0% and strictly less than 5% null.
low_band_train = (null_value_percentages > 0) & (null_value_percentages < 5)
low_band_test = (null_value_percentages_test > 0) & (null_value_percentages_test < 5)
less_null_cols = null_value_percentages.loc[low_band_train].index
less_null_cols_test = null_value_percentages_test.loc[low_band_test].index

# Basement related columns
basement_cols = [
    'BasementHeight',
    'BasementCondition',
    'BasementExposure',
    'BasementFacilityType1',
    'BasementFacilityType2',
]

# Other columns
numeric_cols = ['FacadeArea']
categorical_cols = ['ElectricalSystem']

# --- Function to impute one dataset ---
def _sample_into_nulls(df, col, distinct):
    """Fill the nulls of ``df[col]`` in place with random draws from its observed values.

    With ``distinct=True`` the draw is over the distinct observed values,
    otherwise over every observed value. Nothing happens when the column has
    no nulls or no observed value to draw from.
    """
    null_mask = df[col].isna()
    n_missing = int(null_mask.sum())
    if n_missing == 0:
        return
    observed = df[col].dropna()
    pool = observed.unique() if distinct else observed.values
    if len(pool) == 0:
        return
    df.loc[null_mask, col] = np.random.choice(pool, size=n_missing, replace=True)


def impute_low_nulls(df, is_test=False):
    """Impute the low-null columns of ``df`` in place.

    Reads the module-level lists ``less_null_cols``, ``basement_cols``,
    ``numeric_cols`` and ``categorical_cols``.

    Steps:
      1. Add a ``<col>_was_missing`` 0/1 flag for each ``less_null_cols``
         column present in ``df``.
      2. Basement columns (only if all of them are present): rows where every
         basement column is null get "NoBasement" in all of them; any other
         null is replaced by a random draw from that column's observed values.
      3. ``numeric_cols``: nulls are drawn from all observed values.
      4. ``categorical_cols``: nulls are drawn from the distinct observed values.

    Args:
        df: DataFrame to modify in place.
        is_test: Unused; kept so existing calls keep working.

    Returns:
        None.
    """
    # Add missing flags (before any null is overwritten)
    for col in less_null_cols:
        if col in df.columns:
            df[f"{col}_was_missing"] = df[col].isna().astype(int)

    # --- Basement columns ---
    if basement_cols and all(col in df.columns for col in basement_cols):
        basement_na = df[basement_cols].isna()
        no_basement = basement_na.all(axis=1)

        # Snapshot the pools BEFORE writing the sentinel; otherwise
        # "NoBasement" would become a candidate value and could be assigned
        # at random to rows that only miss some of the basement columns.
        pools = {col: df[col].dropna().unique() for col in basement_cols}

        df.loc[no_basement, basement_cols] = "NoBasement"

        for col in basement_cols:
            partial = basement_na[col] & ~no_basement
            n_partial = int(partial.sum())
            if n_partial > 0 and len(pools[col]) > 0:
                df.loc[partial, col] = np.random.choice(
                    pools[col], size=n_partial, replace=True
                )

    # --- Numeric columns ---
    for col in numeric_cols:
        if col in df.columns:
            _sample_into_nulls(df, col, distinct=False)

    # --- Other categorical columns ---
    for col in categorical_cols:
        if col in df.columns:
            _sample_into_nulls(df, col, distinct=True)

# --- Apply to train and test ---
# Train first, then test, so the seeded random draws happen in that order.
for frame in (train, test):
    impute_low_nulls(frame)


In [None]:
#=== Checking Columns with less amount of null values ===
# Echo the two column lists again before the post-imputation inspection.
for label, names in (
    ("Training data columns with low nulls:", less_null_cols),
    ("Testing data columns with low nulls:", less_null_cols_test),
):
    print(label, names)
print("="*80)

# Report nulls and a value summary for one column
def inspect_column(df, col):
    """Print a report on ``df[col]``: name, dtype class, nulls, and a summary.

    A column counts as categorical when its dtype is ``object``; it then gets
    value counts with nulls included. Any other column gets ``describe()``.
    The report ends with a 50-dash rule. Returns ``None``.
    """
    values = df[col]
    null_total = values.isna().sum()
    null_share = null_total / len(df) * 100
    categorical = values.dtype == 'object'

    # Three header lines emitted with a single print call.
    print(
        f"Column: {col}",
        f"Type: {'Categorical' if categorical else 'Numeric'}",
        f"Missing: {null_total} ({null_share:.3f}%)",
        sep="\n",
    )

    if not categorical:
        print("Descriptive stats (numeric):")
        print(values.describe())
    else:
        # NaN is kept as its own bucket
        print("Value counts (including NaN):")
        print(values.value_counts(dropna=False))
    print("-"*50)

# Inspect training data first, then testing data
_inspection_plan = {
    "TRAINING DATA:": (train, less_null_cols),
    "TESTING DATA:": (test, less_null_cols_test),
}
for banner, (frame, names) in _inspection_plan.items():
    print(banner)
    for col in names:
        inspect_column(frame, col)


### Outlier detection

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Identify numeric columns (any NumPy numeric dtype) in each frame
numeric_cols = list(train.select_dtypes(include=[np.number]).columns)
numeric_cols_test = list(test.select_dtypes(include=[np.number]).columns)

for label, names in (
    ("Numeric columns (train):", numeric_cols),
    ("Numeric columns (test):", numeric_cols_test),
):
    print(label, names)

# Caps values to the 1.5 * IQR whiskers of a reference column
class OutlierHandler:
    """Clamp scalar values to the IQR whiskers of a column.

    The whiskers are computed once, at construction, from ``col``:
    ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``.
    """

    def __init__(self, col):
        """Compute and store the lower and upper whiskers of ``col``."""
        first_quartile = col.quantile(0.25)
        third_quartile = col.quantile(0.75)
        spread = third_quartile - first_quartile
        self.lower_whisker = first_quartile - 1.5 * spread
        self.upper_whisker = third_quartile + 1.5 * spread

    def cap(self, value):
        """Return ``value`` limited to the range [lower_whisker, upper_whisker].

        A value for which both comparisons are false (NaN, for example) is
        returned unchanged.
        """
        if value < self.lower_whisker:
            return self.lower_whisker
        if value > self.upper_whisker:
            return self.upper_whisker
        return value

# Apply outlier capping.
# Whiskers are fitted on the TRAINING column and the same handler is applied
# to the matching test column, so a given raw value is treated identically in
# both frames (the same fit-on-train / apply-to-test pattern the scaler uses).
# NOTE(review): `numeric_cols` holds every numeric column of train, which
# presumably includes the target and the 0/1 `_was_missing` flags — confirm
# that capping those is intended.
_numeric_in_test = set(numeric_cols_test)
_numeric_in_train = set(numeric_cols)

for col in numeric_cols:
    # Fit before the train column is overwritten.
    handler = OutlierHandler(train[col])
    train[col] = train[col].apply(handler.cap)
    if col in _numeric_in_test:
        test[col] = test[col].apply(handler.cap)

    # Optional visual check of the capped distribution:
    # plt.figure(figsize=(6, 3))
    # sns.boxplot(x=train[col])
    # plt.title(f"{col} distribution (train)")
    # plt.show()

# Numeric test columns with no numeric train counterpart have no train
# reference to fit on, so they fall back to whiskers from the test column.
for col in numeric_cols_test:
    if col in _numeric_in_train:
        continue
    handler = OutlierHandler(test[col])
    test[col] = test[col].apply(handler.cap)


### Standardisation

In [None]:
# Numeric columns of train, in column order
numeric_cols = list(train.select_dtypes(include=[np.number]).columns)

# Keep only those that test also has, so one scaler can serve both frames
test_column_names = set(test.columns)
common_numeric_cols = [name for name in numeric_cols if name in test_column_names]

# Learn mean and standard deviation from train only
scaler = StandardScaler()
scaler.fit(train[common_numeric_cols])

# Apply the train statistics to train and then to test
train[common_numeric_cols] = scaler.transform(train[common_numeric_cols])
test[common_numeric_cols] = scaler.transform(test[common_numeric_cols])

# Optional: scatter plots of each scaled column against a chosen column.
# The block below is a triple-quoted string literal, not a comment, so none of
# the plotting code inside it runs.
# NOTE(review): `target_col` is "RoadAccessLength" here, while the boxplot cell
# further down plots against 'HotelValue' — confirm which column is intended
# before re-enabling this.
'''
target_col = "RoadAccessLength"  # replace if your target column has a different name

for col in common_numeric_cols:
    if col == target_col:
        continue
    plt.figure(figsize=(6, 4))
    sns.scatterplot(x=train[col], y=train[target_col])
    plt.title(f"Scatter plot: {col} vs {target_col}")
    plt.xlabel(col)
    plt.ylabel(target_col)
    plt.show()
'''


## EDA

In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Drop all helper "_was_missing" columns
train_flag_cols = [name for name in train.columns if name.endswith('_was_missing')]
test_flag_cols = [name for name in test.columns if name.endswith('_was_missing')]
train_clean = train.drop(columns=train_flag_cols)
test_clean = test.drop(columns=test_flag_cols)

# Identify numeric and categorical columns again, now without the flags
numeric_cols = list(train_clean.select_dtypes(include=[np.number]).columns)
categorical_cols = list(train_clean.select_dtypes(include=['object']).columns)

# One histogram (with KDE overlay) per numeric column, stacked vertically
n_panels = len(numeric_cols)
plt.figure(figsize=(15, n_panels*3))
for panel, col in enumerate(numeric_cols, start=1):
    plt.subplot(n_panels, 1, panel)
    sns.histplot(train_clean[col], kde=True, bins=30)
    plt.title(f"Histogram of {col}")
plt.tight_layout()
plt.show()


In [None]:

# Identify categorical (object-dtype) columns
categorical_cols = list(train_clean.select_dtypes(include=['object']).columns)

# One boxplot of HotelValue per categorical feature, stacked vertically
n_panels = len(categorical_cols)
hotel_value = train_clean['HotelValue']
plt.figure(figsize=(15, n_panels*3))
for panel, col in enumerate(categorical_cols, start=1):
    plt.subplot(n_panels, 1, panel)
    sns.boxplot(x=train_clean[col], y=hotel_value)
    plt.title(f"Boxplot of HotelValue vs {col}")
plt.tight_layout()
plt.show()


Saving the processed Datasets

In [None]:
# Save train and test to CSV (row index is not written)
for frame, out_name in (
    (train_clean, "train_processed.csv"),
    (test_clean, "test_processed.csv"),
):
    frame.to_csv(out_name, index=False)
