# Train XGBoost Baseline Model

This notebook trains a 5-fold cross-validated XGBoost model on tabular features to create a baseline prediction. It defines its own preprocessing logic inline (as in the original version) so that it can run fully standalone; the same logic is also available in `preprocessing.py` for reference. Outputs include processed CSV files for fusion model training and a baseline submission file.

In [None]:
# Covers: OOF predictions, line-by-line preprocessing, standard XGBoost, data cleaning, feature engineering, zipcode encoding, date encoding, target transformation, feature selection, model training, model evaluation, model inference, model submission
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# ==========================================
# 1. PREPROCESSING FUNCTION
# ==========================================
def preprocess_data(df, is_train=True, ref_year=2025):
    """Build the tabular feature frame used by the XGBoost baseline.

    Works on a copy, so the caller's frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows. Must contain 'date', 'yr_built', 'yr_renovated' and
        'zipcode'; 'price' is additionally required when ``is_train`` is True.
    is_train : bool, default True
        When True, a 'log_price' column (``log1p`` of 'price') is added.
    ref_year : int, default 2025
        Year against which 'house_age' and 'years_since_update' are measured.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered columns added and
        'id', 'date', 'yr_built', 'yr_renovated' removed (when present).
    """
    df_out = df.copy()

    # --- A. Date Handling ---
    df_out['date'] = pd.to_datetime(df_out['date'])
    df_out['year_sold'] = df_out['date'].dt.year
    df_out['month_sold'] = df_out['date'].dt.month
    df_out['day_sold'] = df_out['date'].dt.day

    # --- B. Feature Engineering (Age & Renovation) ---
    df_out['house_age'] = ref_year - df_out['yr_built']
    # One mask drives both the flag and the "last update" year, so a missing
    # or non-positive renovation year is treated as "never renovated" in both
    # places instead of leaking into years_since_update.
    renovated = df_out['yr_renovated'] > 0
    df_out['was_renovated'] = renovated.astype(int)
    last_update = df_out['yr_renovated'].where(renovated, df_out['yr_built'])
    df_out['years_since_update'] = ref_year - last_update

    # --- C. Zipcode ---
    df_out['zipcode'] = df_out['zipcode'].astype(int)

    # --- D. Target Transformation (Train Only) ---
    if is_train:
        df_out['log_price'] = np.log1p(df_out['price'])

    # --- E. Cleanup ---
    # errors='ignore' keeps this usable on frames that lack some of these columns.
    cols_to_drop = ['id', 'date', 'yr_built', 'yr_renovated']
    df_out = df_out.drop(columns=cols_to_drop, errors='ignore')

    return df_out

# ==========================================
# 2. LOAD DATA
# ==========================================
# Read both CSVs; a missing file is reported and the error is re-raised so
# the notebook stops here rather than failing later with a NameError.
try:
    train_raw = pd.read_csv('train_tabular.csv')
    test_raw = pd.read_csv('test_tabular.csv')
except FileNotFoundError:
    print("Files not found. Please ensure 'train_tabular.csv' and 'test_tabular.csv' exist.")
    raise
else:
    print("Files loaded successfully.")

    # Keep only the first row for each id in the training set.
    initial_len = len(train_raw)
    train_raw = train_raw.drop_duplicates(subset=['id'], keep='first')
    print(f"Removed {initial_len - len(train_raw)} duplicate IDs. New count: {len(train_raw)}")

# ==========================================
# 3. APPLY PREPROCESSING
# ==========================================
print("Preprocessing data...")
train_df = preprocess_data(train_raw, is_train=True)
test_df = preprocess_data(test_raw, is_train=False)

# Every training column except the raw and log targets is a model feature.
target_cols = {'price', 'log_price'}
features = [col for col in train_df.columns if col not in target_cols]
print(f"Features used: {len(features)}")

# Plain NumPy arrays for XGBoost; the test matrix uses the same column order.
X = train_df.loc[:, features].to_numpy()
y_log = train_df['log_price'].to_numpy()
X_test = test_df.loc[:, features].to_numpy()

# ==========================================
# 4. K-FOLD TRAINING & ENSEMBLING
# ==========================================
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Hyper-parameters shared by every fold's model.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
    early_stopping_rounds=50,
)

# Out-of-fold predictions (log scale), per-fold test predictions, per-fold RMSE.
oof_preds_log = np.zeros(len(train_df))
test_preds_list, rmse_scores = [], []

print(f"\nStarting {N_FOLDS}-Fold Cross Validation...")

for fold, (train_idx, val_idx) in enumerate(kf.split(X, y_log)):
    X_train_fold, X_val_fold = X[train_idx], X[val_idx]
    y_train_fold, y_val_fold = y_log[train_idx], y_log[val_idx]

    # The held-out fold is also the early-stopping eval set.
    model = xgb.XGBRegressor(**xgb_params)
    model.fit(
        X_train_fold,
        y_train_fold,
        eval_set=[(X_val_fold, y_val_fold)],
        verbose=False,
    )

    # Store this fold's held-out predictions in their original row positions.
    val_preds = model.predict(X_val_fold)
    oof_preds_log[val_idx] = val_preds

    # Each fold's model also scores the full test set; these are averaged later.
    test_preds_list.append(model.predict(X_test))

    fold_rmse = np.sqrt(mean_squared_error(y_val_fold, val_preds))
    rmse_scores.append(fold_rmse)
    print(f"Fold {fold+1} RMSE (Log Scale): {fold_rmse:.5f}")

# Mean of the per-fold RMSE values.
print(f"\nAverage OOF RMSE: {np.mean(rmse_scores):.5f}")

# ==========================================
# 5. PROCESS TRAINING DATA (FOR FUSION MODEL)
# ==========================================
# Attach OOF predictions and the derived targets for the fusion model:
#   price_pred_xgb = expm1(xgb_pred_log)
#   alpha          = price / price_pred_xgb   (multiplicative correction)
#   residual_log   = log_price - xgb_pred_log (additive correction, log scale)
# The id column is re-attached positionally from the de-duplicated raw frame.
train_df = train_df.assign(
    xgb_pred_log=oof_preds_log,
    price_pred_xgb=lambda d: np.expm1(d['xgb_pred_log']),
    alpha=lambda d: d['price'] / d['price_pred_xgb'],
    residual_log=lambda d: d['log_price'] - d['xgb_pred_log'],
    id=train_raw['id'].values,
)

train_df.to_csv('train_processed_with_residuals.csv', index=False)
print("\n[TRAIN] Saved 'train_processed_with_residuals.csv' (Ready for Fusion Model)")
print(f"Alpha Mean: {train_df['alpha'].mean():.4f}, Std: {train_df['alpha'].std():.4f}")

# ==========================================
# 6. PROCESS TEST DATA (SUBMISSION)
# ==========================================
# Ensemble: average the per-fold test predictions on the log scale,
# then map back to the price scale with expm1.
avg_test_preds_log = np.stack(test_preds_list).mean(axis=0)
test_preds_actual = np.expm1(avg_test_preds_log)

# Fusion-inference file: engineered features + averaged log prediction + id.
test_df = test_df.assign(
    xgb_pred_log=avg_test_preds_log,
    id=test_raw['id'],
)
test_df.to_csv('test_processed_for_fusion.csv', index=False)
print("[TEST] Saved 'test_processed_for_fusion.csv' (Ready for Fusion Inference)")

# Baseline submission: one predicted price per test id.
submission = test_raw[['id']].assign(price=test_preds_actual)

submission.to_csv('submission_final.csv', index=False)
print("[SUBMISSION] Saved 'submission_final.csv' (Ensembled XGBoost Baseline)")



KeyboardInterrupt: 

In [None]:
# Prepare fusion CSV by attaching IDs
import pandas as pd

df_proc = pd.read_csv('train_processed_with_residuals.csv')
df_raw = pd.read_csv('train_tabular.csv')

# Match the XGBoost logic - remove duplicates
df_raw = df_raw.drop_duplicates(subset=['id'], keep='first')

# Safety check
assert len(df_proc) == len(df_raw), f"Error: Row counts mismatch! Proc: {len(df_proc)}, Raw: {len(df_raw)}"

# Attach IDs. The attachment is positional, so when the processed file already
# carries an 'id' column, verify it against the raw order instead of silently
# overwriting it: a mismatch would mean residuals are paired with wrong houses.
raw_ids = df_raw['id'].values
if 'id' in df_proc.columns:
    assert (df_proc['id'].values == raw_ids).all(), \
        "Error: ID order mismatch between processed and raw training files!"
else:
    df_proc['id'] = raw_ids

# Save fusion-ready file
df_proc.to_csv('train_final_fusion.csv', index=False)
print("Success! Created 'train_final_fusion.csv'")
if 'alpha' in df_proc.columns:
    print(f"Alpha Mean: {df_proc['alpha'].mean():.4f} (Should be approx 1.0)")

# Preview only the columns that are present, so a file without 'alpha'
# (already tolerated by the check above) does not raise a KeyError here.
preview_cols = [c for c in ['id', 'residual_log', 'alpha'] if c in df_proc.columns]
print(df_proc[preview_cols].head())

Files loaded successfully.
Removed 99 duplicate IDs. New count: 16110
Preprocessing data...
Features used: 22

Starting 5-Fold Cross Validation...
Fold 1 RMSE (Log Scale): 0.16062
Fold 2 RMSE (Log Scale): 0.16083
Fold 3 RMSE (Log Scale): 0.16134
Fold 4 RMSE (Log Scale): 0.15755


KeyboardInterrupt: 

In [None]:
# Calculate R² score for diagnostics
import pandas as pd
from sklearn.metrics import r2_score

# R-squared of the XGBoost price predictions stored in the fusion file
# against the actual prices, both on the price scale.
df = pd.read_csv('train_final_fusion.csv')
r2 = r2_score(
    y_true=df['price'],
    y_pred=df['price_pred_xgb'],
)
print(f"R-squared: {r2:.5f}")

R-squared: 0.89463


# Optional: Visualize residual distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('train_processed_with_residuals.csv')

# Histogram (with KDE) of the log residuals on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(
    df['residual_log'],
    kde=True,
    binwidth=0.01,
    color='blue',
    edgecolor='black',
    alpha=0.6,
    ax=ax,
)

# Symmetric reference lines; only the positive line carries the legend label.
threshold_styles = (
    (0.03, 'red', '--', '+/- 0.03 Threshold'),
    (0.01, 'green', ':', '+/- 0.01 Threshold'),
)
for threshold, line_color, line_style, legend_label in threshold_styles:
    ax.axvline(x=threshold, color=line_color, linestyle=line_style, label=legend_label)
    ax.axvline(x=-threshold, color=line_color, linestyle=line_style)

ax.set_title('Distribution of Log Residuals', fontsize=15)
ax.set_xlabel('Log Residual (Actual - Predicted)', fontsize=12)
ax.set_ylabel('Count of Houses', fontsize=12)
ax.legend()
ax.grid(axis='y', alpha=0.3)
ax.set_xlim(-0.5, 0.5)
plt.show()


NameError: name 'plt' is not defined