In [None]:
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the cleaned dataset (path is relative to the notebook's working directory).
df = pd.read_csv("../data/clean/cleaned_data.csv")

# Regression target.
y = df['log_price']

# Features: everything except the target and the raw 'price' column.
# NOTE(review): presumably log_price is derived from price, so keeping 'price'
# would hand the model the answer — confirm against the cleaning step.
X_all = df.drop(['log_price', 'price'], axis=1)

# The train/test membership is recovered by index label below, so labels must be unique.
assert X_all.index.is_unique, "row index must be unique to recover the split by label"

# Split BEFORE any data-dependent feature engineering so that statistics
# (here: location frequencies) are learned from the training rows only.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_all, y, test_size=0.2, random_state=42
)

# Handle high-cardinality 'location': keep the 10 most frequent locations seen
# in the TRAINING rows; every other value (including missing) becomes 'Other'.
top_locations = X_train_raw['location'].value_counts().nlargest(10).index
X = X_all.assign(
    location=X_all['location'].where(X_all['location'].isin(top_locations), 'Other')
)

# Same rows as the split above, now with the collapsed 'location' column.
X_train = X.loc[X_train_raw.index]
X_test = X.loc[X_test_raw.index]

# Identify categorical and numerical columns.
# NOTE(review): every column not listed in cat_cols is treated as numeric —
# confirm the cleaned CSV has no other string columns.
cat_cols = ['category', 'type', 'location']
num_cols = [c for c in X.columns if c not in cat_cols]

# Preprocessing: scale numeric features, one-hot encode categorical ones.
# handle_unknown='ignore' keeps predict() from raising when a category value
# appears only in the test rows; such rows are encoded as all zeros
# (combining it with drop='first' requires scikit-learn >= 1.0).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols)
    ]
)

# Model pipelines.
# Pipeline.fit() fits its steps in place, so each pipeline gets its own
# unfitted copy of the preprocessor; sharing one instance would let fitting
# one pipeline silently refit the transformer used by the other.
ridge_pipeline = Pipeline([
    ('preprocess', clone(preprocessor)),
    ('model', Ridge(alpha=1.0))
])

rf_pipeline = Pipeline([
    ('preprocess', clone(preprocessor)),
    ('model', RandomForestRegressor(
        n_estimators=200, max_depth=10, min_samples_leaf=5, random_state=42
    ))
])

def fit_and_report(title, pipeline, X_fit, y_fit, X_eval, y_eval):
    """Fit a regression pipeline and print its held-out MSE and R2.

    Parameters
    ----------
    title : str
        Heading printed above the metrics.
    pipeline : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X_fit, y_fit
        Training features and target passed to ``fit``.
    X_eval, y_eval
        Held-out features and target used for the reported metrics.

    Returns
    -------
    Predictions of the fitted pipeline on ``X_eval``.
    """
    pipeline.fit(X_fit, y_fit)
    y_pred = pipeline.predict(X_eval)

    print(f"{title}:")
    print("MSE:", mean_squared_error(y_eval, y_pred))
    print("R2:", r2_score(y_eval, y_pred))
    return y_pred


# Fit Ridge
y_pred_ridge = fit_and_report(
    "Ridge Regression", ridge_pipeline, X_train, y_train, X_test, y_test
)

# Blank line between the two reports.
print()

# Fit Random Forest
y_pred_rf = fit_and_report(
    "Random Forest Regression", rf_pipeline, X_train, y_train, X_test, y_test
)


Ridge Regression:
MSE: 0.16248867813737725
R2: 0.9033952867107469

Random Forest Regression:
MSE: 0.15734068608276452
R2: 0.9064559325486726


In [69]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Binary target: 1 when a row's price is strictly above the median price, else 0.
# NOTE(review): the median is taken over all rows (train and test); compute it
# from the training rows only if the label definition must not see test data.
threshold = df['price'].median()
y_binary = (df['price'] > threshold).astype(int)

# Reuse the regression train/test rows instead of re-splitting and overwriting
# X_train / X_test; selecting by index yields the same rows the original
# re-split (same data, same random_state) produced.
y_train_bin = y_binary.loc[X_train.index]
y_test_bin = y_binary.loc[X_test.index]

# Same preprocessing recipe, but a fresh unfitted copy: Pipeline.fit() fits its
# steps in place, so sharing the instance would refit the regressors' transformer.
clf = Pipeline([
    ('preprocess', clone(preprocessor)),
    ('model', RandomForestClassifier(n_estimators=200, random_state=42))
])

clf.fit(X_train, y_train_bin)
# Column 1 of predict_proba is the probability of the positive class (label 1).
y_pred_proba = clf.predict_proba(X_test)[:, 1]

auc = roc_auc_score(y_test_bin, y_pred_proba)
print("AUC:", auc)


AUC: 0.9704066913194556
