In [None]:
# Setup Python path to import src modules
import sys
from pathlib import Path

# Make the repository root importable so that `src.*` modules resolve.
# The root is taken to be the parent of the current working directory,
# i.e. the notebook is expected to be launched from a first-level subfolder.
project_root = Path(".").resolve().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    # Prepend (rather than append) so this entry is searched first.
    sys.path.insert(0, root_entry)

print(f"Project root added to path: {project_root}")

In [1]:
import pandas as pd
import numpy as np

In [None]:
# Import preprocessing function
from src.preprocessing import preprocess

In [None]:
# Load the raw Manhattan dataset. The path is relative to the notebook's
# working directory, so the kernel must be started from the notebook folder.
df = pd.read_csv('../data/raw/manhattan.csv')

In [None]:
# Run the project's preprocessing step (src.preprocessing.preprocess) on the raw data.
# NOTE(review): whether preprocess() mutates `df` in place is not visible from
# this notebook — confirm before reusing `df` further down.
df_preprocessed = preprocess(df)

In [None]:
# Sanity check: dimensions of the preprocessed data.
df_preprocessed.shape

In [None]:
# Column labels remaining after preprocessing.
df_preprocessed.columns

In [None]:
# Preview the first rows of the preprocessed data.
df_preprocessed.head()

In [None]:
import joblib
from src.modeling import train_test_split_df, train_linear_regression, train_random_forest, train_xgboost, evaluate_model, model_comparison_table

In [None]:
# Split the preprocessed data into train/test features and targets;
# 'rent' is passed as the second argument (presumably the target column name).
# NOTE(review): no test size or random seed is passed here — confirm that
# train_test_split_df fixes them internally, otherwise the split (and every
# metric below) will not be reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split_df(df_preprocessed, 'rent')

In [None]:
# Report the dimensions of each split, one per line.
print(
    f'X_train shape: {X_train.shape}',
    f'X_test shape: {X_test.shape}',
    f'y_train shape: {y_train.shape}',
    f'y_test shape: {y_test.shape}',
    sep='\n',
)

In [None]:
# Train the linear regression model on the training split.
model_lr = train_linear_regression(X_train, y_train)

In [None]:
# Train the random forest model on the same training split.
# NOTE(review): no seed is passed — confirm train_random_forest fixes one internally.
model_rf = train_random_forest(X_train, y_train)

In [None]:
# Train the XGBoost model on the same training split.
# NOTE(review): no seed is passed — confirm train_xgboost fixes one internally.
model_xgb = train_xgboost(X_train, y_train)

In [None]:
# Display the object returned by train_linear_regression (its notebook repr).
model_lr

In [None]:
# Display the object returned by train_random_forest (its notebook repr).
model_rf

In [None]:
# Display the object returned by train_xgboost (its notebook repr).
model_xgb

In [None]:
# Evaluate every trained model on the held-out test split, keyed by a
# human-readable model name (insertion order: LR, RF, XGBoost).
results = {
    label: evaluate_model(fitted, X_test, y_test)
    for label, fitted in (
        ("Linear Regression", model_lr),
        ("Random Forest", model_rf),
        ("XGBoost", model_xgb),
    )
}

In [None]:
# Turn the per-model evaluation results into a single comparison table
# (presumably a DataFrame, going by the variable name — confirm in src.modeling).
comparison_df = model_comparison_table(results)

In [None]:
# Show the model comparison table.
comparison_df

In [None]:
# Persist the trained XGBoost model to disk with joblib.
# NOTE(review): the model saved as "best" is hard-coded to XGBoost; confirm
# against comparison_df that it really is the top performer.
best_model_path = '../outputs/models/best_model.pkl'
# Create the output folder first: joblib.dump does not create missing parent
# directories, so without this the cell fails on a fresh checkout.
# (`Path` comes from the setup cell at the top of the notebook.)
Path(best_model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model_xgb, best_model_path)
# Echo the destination path as the cell output.
best_model_path

In [None]:
import shap

In [None]:
# Build a SHAP explainer for the XGBoost model (TreeExplainer is SHAP's
# explainer for tree-based models).
explainer = shap.TreeExplainer(model_xgb)

In [None]:
# Compute SHAP values for the held-out test features.
shap_values = explainer.shap_values(X_test)

In [None]:
# SHAP summary plot over the test set using the default plot type
# (a per-sample dot/beeswarm view for single-output models, per the SHAP docs).
shap.summary_plot(shap_values, X_test)

In [None]:
# Bar variant of the summary plot: one bar per feature, giving a global
# importance ranking (mean absolute SHAP value, per the SHAP docs).
shap.summary_plot(shap_values, X_test, plot_type='bar')

In [None]:
# List the feature names used for training, in column order.
X_train.columns.tolist()