# In [None]:
# =====================================================
# MODEL PREPROCESSING
# =====================================================
import pandas as pd
import numpy as np

# Kaggle input locations of the train and test workbooks.
TRAIN_PATH = "/kaggle/input/cdc-dataset/train(1).xlsx"
TEST_PATH = "/kaggle/input/cdc-test/test2.xlsx"

# Load each workbook into its own DataFrame (train is read first, then test).
train_df, test_df = (pd.read_excel(path) for path in (TRAIN_PATH, TEST_PATH))

# Print the (rows, columns) of both frames as a quick sanity check.
print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)

# Column treated as the target throughout this script.
TARGET_COL = "price"

# Coordinate columns, kept separate from the tabular features.
SPATIAL_COLS = ["lat", "long"]

# Non-spatial input columns; the order here is the order used wherever
# this list is concatenated or used for column selection.
TABULAR_FEATURES = [
    "bedrooms", "bathrooms",
    "sqft_living", "sqft_lot",
    "floors", "waterfront", "view",
    "condition", "grade",
    "sqft_living15", "sqft_lot15",
]
# Basic checks: confirm both frames carry every column this script relies on.
# The target column is required only in the train split.
required_test_cols = TABULAR_FEATURES + SPATIAL_COLS
required_train_cols = required_test_cols + [TARGET_COL]

# Set difference leaves only the required names absent from each frame.
missing_train = set(required_train_cols) - set(train_df.columns)
missing_test = set(required_test_cols) - set(test_df.columns)

# Fail fast; the train split is reported first when both are incomplete.
if missing_train:
    raise ValueError(f"Missing columns in TRAIN dataset: {missing_train}")
if missing_test:
    raise ValueError(f"Missing columns in TEST dataset: {missing_test}")

print("All required columns are present.")

# Cleaning of dataset
# Drop training rows whose target is missing and renumber the index from 0.
train_df = train_df.dropna(subset=[TARGET_COL]).reset_index(drop=True)

# Labels of every input column (tabular + spatial). Built directly from the
# configured names instead of slicing the frame just to read `.columns`.
num_cols = pd.Index(TABULAR_FEATURES + SPATIAL_COLS)

# Compute the per-column medians once, from the training rows only, and reuse
# them for both splits so the test frame is imputed with train statistics
# (previously the medians were recomputed on the already-imputed train frame).
train_medians = train_df[num_cols].median()
train_df[num_cols] = train_df[num_cols].fillna(train_medians)
test_df[num_cols] = test_df[num_cols].fillna(train_medians)

# Descriptive statistics of the target column after cleaning.
print("\nTarget (price) summary:", train_df[TARGET_COL].describe(), sep="\n")

# Min/max of each coordinate column in the training frame.
lat_values, long_values = train_df["lat"], train_df["long"]
print("\nLatitude range :", lat_values.min(), "→", lat_values.max())
print("Longitude range:", long_values.min(), "→", long_values.max())

# Shapes of both frames once cleaning is done.
print("\nFinal train shape:", train_df.shape)
print("Final test shape :", test_df.shape)

