<a href="https://colab.research.google.com/github/Clos425/pokemon-react-verse/blob/main/Copy_of_ECGR4105FinalProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Training

In [4]:
# --- Install Required Packages ---
!pip install xgboost scikit-learn pandas matplotlib seaborn joblib --quiet

# --- Mount Google Drive ---
from google.colab import drive
drive.mount('/content/drive')

# --- Imports ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import joblib

# --- Load Dataset ---
print("\u2705 Loading dataset...")
df = pd.read_csv('/content/drive/MyDrive/ECGR4105 Final Project/realtor-data.zip.csv')
print("Original dataset shape:", df.shape)

# --- Filter to important columns ---
keep_cols = ['price', 'bed', 'bath', 'acre_lot', 'state', 'zip_code']
df = df[keep_cols]
print("Filtered dataset shape:", df.shape)

# --- Drop missing values and duplicates ---
df = df.dropna().drop_duplicates()
print("After cleaning missing/duplicates:", df.shape)

# --- Remove outliers ---
for col in ['price', 'bed', 'bath', 'acre_lot']:
    lower, upper = df[col].quantile([0.01, 0.99])
    df = df[(df[col] >= lower) & (df[col] <= upper)]
print("After removing outliers:", df.shape)

# --- Keep only Top 5 States + NC ---
top_states = ['California', 'Texas', 'Florida', 'New York', 'Pennsylvania', 'North Carolina']
df = df[df['state'].isin(top_states)]
print("After keeping top states only:", df.shape)

# --- Feature Engineering ---

# Numerical features
X_numeric = df[['bed', 'bath', 'acre_lot', 'zip_code']]

# Only one-hot encode 'state'
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_state = encoder.fit_transform(df[['state']])
state_encoded_cols = encoder.get_feature_names_out(['state'])
X_state = pd.DataFrame(X_state, columns=state_encoded_cols, index=X_numeric.index)

# Combine
X = pd.concat([X_numeric, X_state], axis=1)
y = df['price']

# --- Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Scale Only Numeric Features (Not one-hot) ---
numeric_features = ['bed', 'bath', 'acre_lot', 'zip_code']
scaler = StandardScaler()
X_train_numeric_scaled = scaler.fit_transform(X_train[numeric_features])
X_test_numeric_scaled = scaler.transform(X_test[numeric_features])

# Keep categorical features as they are (state one-hot)
X_train_final = np.hstack([X_train_numeric_scaled, X_train[state_encoded_cols].values])
X_test_final = np.hstack([X_test_numeric_scaled, X_test[state_encoded_cols].values])

# --- Train XGBoost Model ---
print("\u2705 Training XGBoost Regressor...")
model = xgb.XGBRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    tree_method='hist',
    device='cuda',
    early_stopping_rounds=20
)

model.fit(
    X_train_final, y_train,
    eval_set=[(X_test_final, y_test)],
    verbose=100
)

# --- Evaluate Model ---
y_pred = model.predict(X_test_final)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n✅ Model Evaluation:")
print(f"MAE: ${mae:,.2f}")
print(f"RMSE: ${rmse:,.2f}")
print(f"R² Score: {r2*100:.2f}%")

# --- Save model and tools ---
joblib.dump(model, '/content/drive/MyDrive/ECGR4105 Final Project/real_estate_gb_model.pkl')
joblib.dump(scaler, '/content/drive/MyDrive/ECGR4105 Final Project/real_estate_scaler.pkl')
joblib.dump(encoder, '/content/drive/MyDrive/ECGR4105 Final Project/real_estate_encoder.pkl')

print("\u2705 Model, scaler, and encoder saved successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Loading dataset...
Original dataset shape: (2226382, 12)
Filtered dataset shape: (2226382, 6)
After cleaning missing/duplicates: (1308823, 6)
After removing outliers: (1245439, 6)
After keeping top states only: (527041, 6)
✅ Training XGBoost Regressor...
[0]	validation_0-rmse:486506.37652
[100]	validation_0-rmse:296732.41866
[200]	validation_0-rmse:291641.19784
[300]	validation_0-rmse:289739.90157
[400]	validation_0-rmse:288837.73271
[500]	validation_0-rmse:288298.86663
[582]	validation_0-rmse:288059.10975


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.





✅ Model Evaluation:
MAE: $163,005.31
RMSE: $288,047.03
R² Score: 66.93%
✅ Model, scaler, and encoder saved successfully!


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Upgraded Script Training

In [None]:
# --- Install Required Packages ---
!pip install xgboost scikit-learn pandas matplotlib seaborn joblib --quiet

# --- Mount Google Drive ---
from google.colab import drive
drive.mount('/content/drive')

# --- Imports ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
import joblib

# --- Load Dataset ---
print("\u2705 Loading dataset...")
df = pd.read_csv('/content/drive/MyDrive/ECGR4105 Final Project/realtor-data.zip.csv')
print("Original dataset shape:", df.shape)

# --- Filter to important columns ---
keep_cols = ['price', 'bed', 'bath', 'acre_lot', 'state', 'zip_code']
df = df[keep_cols]
print("Filtered dataset shape:", df.shape)

# --- Drop missing values and duplicates ---
df = df.dropna().drop_duplicates()
print("After cleaning missing/duplicates:", df.shape)

# --- Remove outliers ---
for col in ['price', 'bed', 'bath', 'acre_lot']:
    lower, upper = df[col].quantile([0.01, 0.99])
    df = df[(df[col] >= lower) & (df[col] <= upper)]
print("After removing outliers:", df.shape)

# --- Keep only Top 5 States + NC ---
top_states = ['California', 'Texas', 'Florida', 'New York', 'Pennsylvania', 'North Carolina']
df = df[df['state'].isin(top_states)]
print("After keeping top states only:", df.shape)

# --- Feature Engineering ---

# Create new features
df['bed_bath_ratio'] = df['bed'] / df['bath']
df['acreage_density'] = df['bed'] / (df['acre_lot'] + 1e-5)  # prevent div by zero
df['log_lot_size'] = np.log(df['acre_lot'] + 1)
df['log_price'] = np.log(df['price'] + 1)

# Replace target
y = df['log_price']

# Numerical features
X_numeric = df[['bed', 'bath', 'acre_lot', 'zip_code', 'bed_bath_ratio', 'acreage_density', 'log_lot_size']]

# Categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_state = encoder.fit_transform(df[['state']])
state_encoded_cols = encoder.get_feature_names_out(['state'])
X_state = pd.DataFrame(X_state, columns=state_encoded_cols, index=X_numeric.index)

# Combine
X = pd.concat([X_numeric, X_state], axis=1)

# --- Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Scale Only Numeric Features ---
numeric_features = ['bed', 'bath', 'acre_lot', 'zip_code', 'bed_bath_ratio', 'acreage_density', 'log_lot_size']
scaler = StandardScaler()
X_train_numeric_scaled = scaler.fit_transform(X_train[numeric_features])
X_test_numeric_scaled = scaler.transform(X_test[numeric_features])

# Combine numeric and state
X_train_final = np.hstack([X_train_numeric_scaled, X_train[state_encoded_cols].values])
X_test_final = np.hstack([X_test_numeric_scaled, X_test[state_encoded_cols].values])

# --- Train XGBoost Model ---
print("\u2705 Training XGBoost Regressor...")
model = xgb.XGBRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=8,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42,
    tree_method='hist',
    device='cuda',
    early_stopping_rounds=20
)

model.fit(
    X_train_final, y_train,
    eval_set=[(X_test_final, y_test)],
    verbose=100
)

# --- Evaluate Model ---
y_pred = np.exp(model.predict(X_test_final)) - 1  # reverse log transform
true_y = np.exp(y_test) - 1

mae = mean_absolute_error(true_y, y_pred)
rmse = np.sqrt(mean_squared_error(true_y, y_pred))
r2 = r2_score(true_y, y_pred)

print("\n✅ Model Evaluation with Feature Engineering:")
print(f"MAE: ${mae:,.2f}")
print(f"RMSE: ${rmse:,.2f}")
print(f"R² Score: {r2*100:.2f}%")

# --- Save model and tools ---
joblib.dump(model, '/content/drive/MyDrive/ECGR4105 Final Project/real_estate_gb_model_fe.pkl')
joblib.dump(scaler, '/content/drive/MyDrive/ECGR4105 Final Project/real_estate_scaler_fe.pkl')
joblib.dump(encoder, '/content/drive/MyDrive/ECGR4105 Final Project/real_estate_encoder_fe.pkl')

print("\u2705 Model, scaler, and encoder with Feature Engineering saved successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Loading dataset...
Original dataset shape: (2226382, 12)
Filtered dataset shape: (2226382, 6)
After cleaning missing/duplicates: (1308823, 6)
After removing outliers: (1245439, 6)
After keeping top states only: (527041, 6)
✅ Training XGBoost Regressor...
[0]	validation_0-rmse:0.72042
[100]	validation_0-rmse:0.39784
[200]	validation_0-rmse:0.38779
[300]	validation_0-rmse:0.38419
[400]	validation_0-rmse:0.38259
[500]	validation_0-rmse:0.38161
[600]	validation_0-rmse:0.38106
[700]	validation_0-rmse:0.38067
[800]	validation_0-rmse:0.38045
[900]	validation_0-rmse:0.38022
[951]	validation_0-rmse:0.38015

✅ Model Evaluation with Feature Engineering:
MAE: $157,138.27
RMSE: $292,753.31
R² Score: 65.84%
✅ Model, scaler, and encoder with Feature Engineering saved successfully!


Tuning Script

In [None]:
# --- Install Required Packages ---
!pip install xgboost scikit-learn optuna joblib --quiet

# --- Mount Google Drive ---
from google.colab import drive
drive.mount('/content/drive')

# --- Imports ---
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import joblib

# --- Load Dataset ---
print("\u2705 Loading dataset...")
df = pd.read_csv('/content/drive/MyDrive/ECGR4105 Final Project/realtor-data.zip.csv')

# --- Preprocessing ---
keep_cols = ['price', 'bed', 'bath', 'acre_lot', 'state', 'zip_code']
df = df[keep_cols]
df = df.dropna().drop_duplicates()

# Remove outliers
for col in ['price', 'bed', 'bath', 'acre_lot']:
    lower, upper = df[col].quantile([0.01, 0.99])
    df = df[(df[col] >= lower) & (df[col] <= upper)]

# Keep Top States
top_states = ['California', 'Texas', 'Florida', 'New York', 'Pennsylvania', 'North Carolina']
df = df[df['state'].isin(top_states)]

# Prepare X and y
X_numeric = df[['bed', 'bath', 'acre_lot', 'zip_code']]
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_state = encoder.fit_transform(df[['state']])
state_encoded_cols = encoder.get_feature_names_out(['state'])
X_state = pd.DataFrame(X_state, columns=state_encoded_cols, index=X_numeric.index)

X = pd.concat([X_numeric, X_state], axis=1)
y = df['price']

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale Numeric Columns
numeric_features = ['bed', 'bath', 'acre_lot', 'zip_code']
scaler = StandardScaler()
X_train_numeric_scaled = scaler.fit_transform(X_train[numeric_features])
X_test_numeric_scaled = scaler.transform(X_test[numeric_features])

# Combine numeric + categorical
X_train_final = np.hstack([X_train_numeric_scaled, X_train[state_encoded_cols].values])
X_test_final = np.hstack([X_test_numeric_scaled, X_test[state_encoded_cols].values])

# --- Define Optuna Objective ---
def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'n_estimators': 2000,
        'tree_method': 'hist',
        'device': 'cuda',
        'early_stopping_rounds': 20  # moved into params
    }

    model = xgb.XGBRegressor(**params)
    model.fit(
        X_train_final, y_train,
        eval_set=[(X_test_final, y_test)],
        verbose=False
    )

    preds = model.predict(X_test_final)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    return rmse

# --- Start Optuna Study ---
print("\u2705 Starting hyperparameter tuning...")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

# --- Best Parameters Found ---
print("\n✅ Best Trial:")
print(f"  Value (RMSE): {study.best_trial.value:.2f}")
print(f"  Params: {study.best_trial.params}")

# --- Train Final Model with Best Params ---
best_params = study.best_trial.params
best_params.update({
    'n_estimators': 3000,
    'tree_method': 'hist',
    'device': 'cuda',
    'early_stopping_rounds': 20  # again, inside params
})

best_model = xgb.XGBRegressor(**best_params)
best_model.fit(
    X_train_final, y_train,
    eval_set=[(X_test_final, y_test)],
    verbose=100
)

# --- Evaluate Best Model ---
from sklearn.metrics import mean_absolute_error, r2_score

y_pred = best_model.predict(X_test_final)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n✅ Tuned Model Final Evaluation:")
print(f"MAE: ${mae:,.2f}")
print(f"RMSE: ${rmse:,.2f}")
print(f"R² Score: {r2*100:.2f}%")

# --- Save Best Model and Tools ---
joblib.dump(best_model, '/content/drive/MyDrive/ECGR4105 Final Project/real_estate_gb_model_tuned.pkl')
joblib.dump(scaler, '/content/drive/MyDrive/ECGR4105 Final Project/real_estate_scaler_tuned.pkl')
joblib.dump(encoder, '/content/drive/MyDrive/ECGR4105 Final Project/real_estate_encoder_tuned.pkl')

print("\u2705 Tuned model and tools saved successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Loading dataset...


[I 2025-04-29 00:16:54,484] A new study created in memory with name: no-name-6a947c2f-4412-4064-9738-63d696fe3c42


✅ Starting hyperparameter tuning...


[I 2025-04-29 00:16:56,418] Trial 0 finished with value: 289949.7427909197 and parameters: {'learning_rate': 0.1407068246696757, 'max_depth': 9, 'min_child_weight': 10, 'subsample': 0.6495887982654188, 'colsample_bytree': 0.6678155399719267}. Best is trial 0 with value: 289949.7427909197.
[I 2025-04-29 00:17:03,511] Trial 1 finished with value: 288410.9743431418 and parameters: {'learning_rate': 0.02458320530225894, 'max_depth': 7, 'min_child_weight': 8, 'subsample': 0.75230596889161, 'colsample_bytree': 0.993915674970141}. Best is trial 1 with value: 288410.9743431418.
[I 2025-04-29 00:17:04,803] Trial 2 finished with value: 289218.94040990365 and parameters: {'learning_rate': 0.19275894986839986, 'max_depth': 9, 'min_child_weight': 1, 'subsample': 0.8588090370936039, 'colsample_bytree': 0.8893028314844988}. Best is trial 1 with value: 288410.9743431418.
[I 2025-04-29 00:17:11,325] Trial 3 finished with value: 294870.393412205 and parameters: {'learning_rate': 0.0227723486665491, 'max


✅ Best Trial:
  Value (RMSE): 287481.79
  Params: {'learning_rate': 0.06047412665310763, 'max_depth': 8, 'min_child_weight': 5, 'subsample': 0.9969792152025334, 'colsample_bytree': 0.9577902739201193}
[0]	validation_0-rmse:487498.83947
[100]	validation_0-rmse:294872.43028
[200]	validation_0-rmse:290528.54680
[300]	validation_0-rmse:288950.83337
[400]	validation_0-rmse:287976.96535
[500]	validation_0-rmse:287660.05720
[600]	validation_0-rmse:287507.00380
[601]	validation_0-rmse:287505.41724

✅ Tuned Model Final Evaluation:
MAE: $162,534.83
RMSE: $287,481.79
R² Score: 67.06%
✅ Tuned model and tools saved successfully!


Upgrade Again

In [None]:
# --- Install Required Packages ---
!pip install xgboost scikit-learn optuna joblib --quiet

# --- Mount Google Drive ---
from google.colab import drive
drive.mount('/content/drive')

# --- Imports ---
import pandas as pd
import numpy as np
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import joblib

# --- Load Dataset ---
print("\u2705 Loading dataset...")
df = pd.read_csv('/content/drive/MyDrive/ECGR4105 Final Project/realtor-data.zip.csv')

# --- Preprocessing ---
keep_cols = ['price', 'bed', 'bath', 'acre_lot', 'state', 'zip_code']
df = df[keep_cols]
df = df.dropna().drop_duplicates()

# Remove outliers
for col in ['price', 'bed', 'bath', 'acre_lot']:
    lower, upper = df[col].quantile([0.01, 0.99])
    df = df[(df[col] >= lower) & (df[col] <= upper)]

# Keep Top States
top_states = ['California', 'Texas', 'Florida', 'New York', 'Pennsylvania', 'North Carolina']
df = df[df['state'].isin(top_states)]

# --- Target Encoding Zip Code ---
print("\u2705 Target Encoding Zip Codes...")
zip_avg_price = df.groupby('zip_code')['price'].mean()
df['zip_encoded'] = df['zip_code'].map(zip_avg_price)

# Prepare Features
X_numeric = df[['bed', 'bath', 'acre_lot', 'zip_encoded']]  # use encoded zip now
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_state = encoder.fit_transform(df[['state']])
state_encoded_cols = encoder.get_feature_names_out(['state'])
X_state = pd.DataFrame(X_state, columns=state_encoded_cols, index=X_numeric.index)

X = pd.concat([X_numeric, X_state], axis=1)
y = df['price']

# --- Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Scale Only Numeric Features ---
numeric_features = ['bed', 'bath', 'acre_lot', 'zip_encoded']
scaler = StandardScaler()
X_train_numeric_scaled = scaler.fit_transform(X_train[numeric_features])
X_test_numeric_scaled = scaler.transform(X_test[numeric_features])

# Combine numeric + categorical features
X_train_final = np.hstack([X_train_numeric_scaled, X_train[state_encoded_cols].values])
X_test_final = np.hstack([X_test_numeric_scaled, X_test[state_encoded_cols].values])

# --- Define Optuna Objective with New Data ---
def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'max_depth': trial.suggest_int('max_depth', 5, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'n_estimators': 5000,  # << More estimators!
        'tree_method': 'hist',
        'device': 'cuda',
        'early_stopping_rounds': 50  # Increase patience slightly
    }

    model = xgb.XGBRegressor(**params)
    model.fit(
        X_train_final, y_train,
        eval_set=[(X_test_final, y_test)],
        verbose=False
    )

    preds = model.predict(X_test_final)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    return rmse

# --- Start Optuna Study ---
print("\u2705 Starting hyperparameter tuning...")
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=30)

# --- Best Parameters Found ---
print("\n✅ Best Trial:")
print(f"  Value (RMSE): {study.best_trial.value:.2f}")
print(f"  Params: {study.best_trial.params}")

# --- Train Final Model with Best Params ---
best_params = study.best_trial.params
best_params.update({
    'n_estimators': 5000,  # again here
    'tree_method': 'hist',
    'device': 'cuda',
    'early_stopping_rounds': 50
})

best_model = xgb.XGBRegressor(**best_params)
best_model.fit(
    X_train_final, y_train,
    eval_set=[(X_test_final, y_test)],
    verbose=100
)

# --- Evaluate Best Model ---
y_pred = best_model.predict(X_test_final)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\n✅ Tuned Model Final Evaluation with Target Encoding + More Estimators:")
print(f"MAE: ${mae:,.2f}")
print(f"RMSE: ${rmse:,.2f}")
print(f"R² Score: {r2*100:.2f}%")

# --- Save Final Model and Tools ---
joblib.dump(best_model, '/content/drive/MyDrive/ECGR4105 Final Project/real_estate_gb_model_final.pkl')
joblib.dump(scaler, '/content/drive/MyDrive/ECGR4105 Final Project/real_estate_scaler_final.pkl')
joblib.dump(encoder, '/content/drive/MyDrive/ECGR4105 Final Project/real_estate_encoder_final.pkl')

print("\u2705 Final model and tools saved successfully!")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Loading dataset...
✅ Target Encoding Zip Codes...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['zip_encoded'] = df['zip_code'].map(zip_avg_price)
[I 2025-04-29 00:26:25,818] A new study created in memory with name: no-name-4931b73d-9272-4e93-8871-df706f3909d4


✅ Starting hyperparameter tuning...


[I 2025-04-29 00:26:41,207] Trial 0 finished with value: 231552.7638580776 and parameters: {'learning_rate': 0.016390318711021323, 'max_depth': 7, 'min_child_weight': 3, 'subsample': 0.716185639349664, 'colsample_bytree': 0.6922954981877867}. Best is trial 0 with value: 231552.7638580776.
[I 2025-04-29 00:26:45,249] Trial 1 finished with value: 230872.11202931608 and parameters: {'learning_rate': 0.05438562185353413, 'max_depth': 8, 'min_child_weight': 2, 'subsample': 0.8301792018681768, 'colsample_bytree': 0.9756583889349997}. Best is trial 1 with value: 230872.11202931608.
[I 2025-04-29 00:26:47,611] Trial 2 finished with value: 230312.8399585034 and parameters: {'learning_rate': 0.0939289114280325, 'max_depth': 10, 'min_child_weight': 5, 'subsample': 0.7211349256635667, 'colsample_bytree': 0.8191460820475265}. Best is trial 2 with value: 230312.8399585034.
[I 2025-04-29 00:26:59,428] Trial 3 finished with value: 232503.21108473386 and parameters: {'learning_rate': 0.027754240267673,


✅ Best Trial:
  Value (RMSE): 229811.31
  Params: {'learning_rate': 0.08109963050944859, 'max_depth': 10, 'min_child_weight': 10, 'subsample': 0.9950046035825004, 'colsample_bytree': 0.8771412972292869}
[0]	validation_0-rmse:482875.69298
[100]	validation_0-rmse:231012.06307
[200]	validation_0-rmse:229952.78779
[300]	validation_0-rmse:229859.57597
[336]	validation_0-rmse:229868.70697

✅ Tuned Model Final Evaluation with Target Encoding + More Estimators:
MAE: $128,132.45
RMSE: $229,811.31
R² Score: 78.95%
✅ Final model and tools saved successfully!


Prompt For Inputs After Training

In [5]:
# --- Load the final model and tools ---
import joblib
import pandas as pd

# Load
model = joblib.load('/content/drive/MyDrive/ECGR4105 Final Project/real_estate_gb_model_final.pkl')
scaler = joblib.load('/content/drive/MyDrive/ECGR4105 Final Project/real_estate_scaler_final.pkl')
encoder = joblib.load('/content/drive/MyDrive/ECGR4105 Final Project/real_estate_encoder_final.pkl')

# --- User Inputs ---
print("\n\u2705 Enter property details:")

bed = float(input("Number of Bedrooms: "))
bath = float(input("Number of Bathrooms: "))
acre_lot = float(input("Lot Size (in acres): "))
state = input("State (e.g., 'North Carolina'): ").strip()
zip_code = input("Zip Code (5 digits): ").strip()

# --- Target Encode Zip Code ---
# Use the zip_code -> average price mapping used during training
# (Assume you have access to it or redo target encoding if needed)

# For simplicity here, approximate zip_encoded manually:
# Let's just encode using the zip_code average value from the training set.
# (You would normally precompute and save this during training)

# TEMP FAKE: just assign median price ~300000 if not found
zip_avg_price = 300000  # Simplified assumption
try:
    zip_avg_data = pd.read_csv('/content/drive/MyDrive/ECGR4105 Final Project/realtor-data.zip.csv')
    zip_avg_data = zip_avg_data[['price', 'zip_code']].dropna()
    zip_encoded_mapping = zip_avg_data.groupby('zip_code')['price'].mean()
    zip_encoded_value = zip_encoded_mapping.get(int(zip_code), 300000)
except:
    zip_encoded_value = 300000

# --- Prepare Numeric Input as DataFrame ---
input_numeric = pd.DataFrame([[bed, bath, acre_lot, zip_encoded_value]], columns=['bed', 'bath', 'acre_lot', 'zip_encoded'])

# Scale numeric features
input_numeric_scaled = scaler.transform(input_numeric)

# Prepare state encoding as DataFrame
input_state = pd.DataFrame([[state]], columns=['state'])
state_encoded = encoder.transform(input_state)

# --- Combine All Inputs ---
import numpy as np
final_input = np.hstack([input_numeric_scaled, state_encoded])

# --- Predict ---
predicted_price = model.predict(final_input)[0]

print(f"\n💵 Predicted Property Price: ${predicted_price:,.2f}")


configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

for more details about differences between saving model and serializing.




✅ Enter property details:
Number of Bedrooms: 2
Number of Bathrooms: 2
Lot Size (in acres): 600
State (e.g., 'North Carolina'): 28105
Zip Code (5 digits): 28105

💵 Predicted Property Price: $460,425.53
