In [1]:
# %% 1) Load libraries & get data
import os, warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GroupShuffleSplit
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor

# Install a blanket "ignore" filter so no warning of any category is shown.
# NOTE(review): this also hides pandas/sklearn deprecation and dtype
# warnings — confirm that silencing everything is intended.
warnings.filterwarnings("ignore")

# ---- Paths & columns
# NOTE(review): CSV_IN is an absolute, machine-specific Windows path; the
# notebook cannot be re-run on another machine without editing it.
CSV_IN  = r"C:\AVM Project\DATA\Assessment_Parcels_20251003 (1).csv"
# Column-name constants; not used in the cells visible here. Presumably the
# regression target and the grouping key for a group-wise split — verify
# against the modelling cells.
TARGET = "Total Assessed Value"
GROUP_COLUMN = "Neighbourhood Area"

# Load (let pandas parse thousands)
# low_memory=False turns off chunked parsing so column dtypes are inferred
# from the whole file; thousands="," makes pandas read "1,313" as 1313.
df = pd.read_csv(CSV_IN, low_memory=False, thousands=",")
print("Raw shape:", df.shape)
# `display` is not imported in this cell; presumably it is supplied by the
# IPython/Jupyter runtime.
display(df.head(10))


Raw shape: (244574, 524)


Unnamed: 0,Roll Number,Street Number,Unit Number,Street Suffix,Street Direction,Street Name,Street Type,Full Address,Neighbourhood Area,Market Region,...,Unnamed: 514,Unnamed: 515,Unnamed: 516,Unnamed: 517,Unnamed: 518,Unnamed: 519,Unnamed: 520,Unnamed: 521,Unnamed: 522,Unnamed: 523
0,1000001000,1636,,,,MCCREARY,ROAD,1636 MCCREARY ROAD,WILKES SOUTH,"6, CHARLESWOOD",...,,,,,,,,,,
1,1000005500,1584,,,,MCCREARY,ROAD,1584 MCCREARY ROAD,WILKES SOUTH,"6, CHARLESWOOD",...,,,,,,,,,,
2,1000008000,1574,,,,MCCREARY,ROAD,1574 MCCREARY ROAD,WILKES SOUTH,"6, CHARLESWOOD",...,,,,,,,,,,
3,1000008200,1550,,,,MCCREARY,ROAD,1550 MCCREARY ROAD,WILKES SOUTH,"6, CHARLESWOOD",...,,,,,,,,,,
4,1000008400,1538,,,,MCCREARY,ROAD,1538 MCCREARY ROAD,WILKES SOUTH,"6, CHARLESWOOD",...,,,,,,,,,,
5,1000008500,1536,,,,MCCREARY,ROAD,1536 MCCREARY ROAD,WILKES SOUTH,"6, CHARLESWOOD",...,,,,,,,,,,
6,1000013200,1520,,,,MCCREARY,ROAD,1520 MCCREARY ROAD,WILKES SOUTH,"6, CHARLESWOOD",...,,,,,,,,,,
7,1000013300,1510,,,,MCCREARY,ROAD,1510 MCCREARY ROAD,WILKES SOUTH,"6, CHARLESWOOD",...,,,,,,,,,,
8,1000013600,1500,,,,MCCREARY,ROAD,1500 MCCREARY ROAD,WILKES SOUTH,"6, CHARLESWOOD",...,,,,,,,,,,
9,1000013700,1490,,,,MCCREARY,ROAD,1490 MCCREARY ROAD,WILKES SOUTH,"6, CHARLESWOOD",...,,,,,,,,,,


In [4]:
# %% Corrected Light Cleaning (preserves numeric values like "1,313")
import pandas as pd
import numpy as np

def parse_money_or_number(series: pd.Series) -> pd.Series:
    """Convert currency- or comma-formatted text to numbers.

    Every value is stringified, "$" and "," are removed, surrounding
    whitespace is trimmed, and the result is handed to ``pd.to_numeric``.

    Parameters
    ----------
    series : pd.Series
        Values such as ``"$1,234"``, ``"1,313"`` or values that are already
        numeric.

    Returns
    -------
    pd.Series
        Numeric series on the same index; anything that cannot be parsed
        (including empty strings and "nan") becomes NaN.
    """
    text = series.astype(str)

    # Strip formatting characters literally (regex=False), "$" first, then ",".
    for symbol in ("$", ","):
        text = text.str.replace(symbol, "", regex=False)

    text = text.str.strip()

    # Blank cells and stringified NaN are mapped to real missing values.
    text = text.replace({"": np.nan, "nan": np.nan})

    return pd.to_numeric(text, errors="coerce")

# Coerce the size/value columns to numeric. Columns absent from this extract
# are skipped, but the skip is reported instead of passing silently.
columns_to_parse = ["Total Living Area", "Assessed Land Area", "Total Assessed Value"]
missing_columns = [name for name in columns_to_parse if name not in df.columns]
if missing_columns:
    print("Columns not found, skipped:", missing_columns)

for col in columns_to_parse:
    if col in df.columns:
        df[col] = parse_money_or_number(df[col])

print("After corrected cleaning:", df.shape)

# Preview only the columns that actually exist. The loop above tolerates a
# missing column, so the preview must too; indexing all three unconditionally
# would raise KeyError in exactly the case the guard was written for.
preview_columns = [
    name
    for name in ["Total Assessed Value", "Total Living Area", "Assessed Land Area"]
    if name in df.columns
]
display(df[preview_columns].head(10))


After corrected cleaning: (244574, 524)


Unnamed: 0,Total Assessed Value,Total Living Area,Assessed Land Area
0,723000.0,0.0,0.0
1,1619000.0,0.0,0.0
2,570000.0,0.0,0.0
3,743000.0,0.0,0.0
4,577000.0,0.0,0.0
5,979000.0,0.0,0.0
6,1900000.0,0.0,0.0
7,995000.0,0.0,0.0
8,669000.0,0.0,0.0
9,882000.0,0.0,0.0
