<a href="https://colab.research.google.com/github/Augusto-Seixas-UFV/seixas-ufv-iac/blob/main/C%C3%B3pia_de_ELT_574_Atividade_01_Melbourne.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Melbourne Housing Snapshot – ML Preparation & Baseline Model
# Environment: Google Colab (Python 3.10+)
# Goal: Build a preprocessing pipeline + baseline model that achieves R² ≥ 0.85 (80/20 split)
# Author: Decisão Sistêmica – generated on 2025‑07‑02

# --- 0. OPTIONAL: DOWNLOAD DATASET DIRECTLY FROM KAGGLE ---------------------
# Requires a Kaggle API token (e.g. kaggle.json placed in ~/.kaggle/); the Google
# auth call below does not provide it. Skip if you'll upload the CSV manually via Colab.
# from google.colab import files, auth
# !pip install -q kaggle
# auth.authenticate_user()
# !kaggle datasets download -d dansbecker/melbourne-housing-snapshot -p /content
# !unzip -o /content/melbourne-housing-snapshot.zip -d /content

# --- 1. IMPORTS -------------------------------------------------------------
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns  # optional for nicer plots
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# --- 2. LOAD DATA -----------------------------------------------------------
# ↳ Adjust path if needed (e.g., "/content/melbourne_housing_full.csv")
data_path = Path("melbourne_housing_full.csv")

# Step 0 downloads the *snapshot* dataset, whose archive does not necessarily
# contain a file with the name above.
# NOTE(review): the snapshot CSV is presumably named 'melb_data.csv' – confirm
# against the unzipped archive and adjust if it differs.
snapshot_path = Path("melb_data.csv")
if not data_path.exists() and snapshot_path.exists():
    print(f"'{data_path}' not found – falling back to '{snapshot_path}'.")
    data_path = snapshot_path

# Fail early with an actionable message instead of a bare pandas I/O error.
if not data_path.exists():
    raise FileNotFoundError("Upload 'melbourne_housing_full.csv' to the Colab workspace or adjust 'data_path'.")

# Read the whole CSV into memory and show its size plus the first rows.
housing_df = pd.read_csv(data_path)
print(f"Dataset shape: {housing_df.shape}")
print(housing_df.head())

# --- 3. QUICK EDA -----------------------------------------------------------
# Count NaNs per column and list the ten columns with the most gaps.
print("\nMissing values per column (top 10):")
null_counts = housing_df.isna().sum()
missing = null_counts.sort_values(ascending=False)
print(missing.head(10))

# Histogram (with KDE overlay) of the target column, ignoring rows without a price.
observed_prices = housing_df["Price"].dropna()
plt.figure()
price_axes = sns.histplot(observed_prices, kde=True)
price_axes.set_title("Distribution of Sale Price")
price_axes.set_xlabel("Price (AUD)")
price_axes.set_ylabel("Frequency")
plt.show()

# --- 4. PREPROCESSING PIPELINE ---------------------------------------------
# Separate target
TARGET = 'Price'
X = housing_df.drop(columns=[TARGET])
y = housing_df[TARGET].dropna()

# Align X & y (drop rows where y is NaN)
X = X.loc[y.index]

# Split the feature columns by dtype: numeric columns are imputed and scaled,
# everything else is one-hot encoded.
numeric_features = X.select_dtypes(include=['number']).columns.tolist()
categorical_features = X.select_dtypes(exclude=['number']).columns.tolist()
print(f"Numeric cols: {len(numeric_features)} | Categorical cols: {len(categorical_features)}")

# StandardScaler leaves NaNs in place, so without imputation they reach the
# estimator, which rejects them on scikit-learn releases lacking native
# missing-value support in random forests. The median is used because it is
# insensitive to extreme values.
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

# Categories unseen during fit are encoded as all-zero rows rather than raising.
# NOTE(review): sparse_output=False materialises a dense matrix; if any
# non-numeric column is high-cardinality this may use a lot of memory – confirm
# against the actual data.
categorical_transformer = Pipeline(steps=[
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# Route each column group through its own transformer; columns not listed in
# either group are dropped (ColumnTransformer's default remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ]
)

# --- 5. TRAIN‑TEST SPLIT ----------------------------------------------------
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# --- 6. BASELINE MODEL ------------------------------------------------------
# Random forest with 500 fully grown trees, trained on all available cores.
forest_params = {
    "n_estimators": 500,
    "max_depth": None,
    "random_state": 42,
    "n_jobs": -1,
}
rf_model = RandomForestRegressor(**forest_params)

# Chain preprocessing and the regressor so both are fitted on the training
# split only. The step names are the ones make_pipeline would generate.
model_pipeline = Pipeline(steps=[
    ("columntransformer", preprocessor),
    ("randomforestregressor", rf_model),
]).fit(X_train, y_train)

# --- 7. EVALUATION ----------------------------------------------------------
# Score the fitted pipeline on the held-out split.
y_pred = model_pipeline.predict(X_test)
r2, mae = r2_score(y_test, y_pred), mean_absolute_error(y_test, y_pred)

print(f"\nR² on test set: {r2:.4f}")
print(f"MAE on test set: {mae:,.0f} AUD")

# --- 8. CHECKPOINT ----------------------------------------------------------
# Compare against the R² target stated in the header of this script (0.85).
criterion_met = r2 >= 0.85
if not criterion_met:
    print("⚠️  R² below 0.85 – consider hyperparameter tuning or alternative models (GBM, XGBoost, CatBoost, etc.)")
else:
    print("✅ Success: Criterion met!")

# --- 9. NEXT STEPS (TODO) ---------------------------------------------------
# • Hyperparameter tuning with GridSearchCV or Optuna
# • Feature engineering (e.g., distance to CBD, age of property)
# • Model comparison: GradientBoostingRegressor, XGBRegressor, CatBoostRegressor
# • Cross‑validation and ensemble stacking
# • Save trained model with joblib / sklearn‑onnx


FileNotFoundError: Upload 'melbourne_housing_full.csv' to the Colab workspace or adjust 'data_path'.