In [3]:
import pandas as pd

df1 = pd.read_csv("Building_Permits_20250818.csv", low_memory=False)
df2 = pd.read_csv("Building_Permits_20250818 (1).csv", low_memory=False)

df_model = pd.concat([df1, df2], axis=0, ignore_index=True)
print("Combined shape:", df_model.shape)


Combined shape: (2082213, 167)


In [4]:
df_model['REPORTED_COST'] = (
    df_model['REPORTED_COST']
    .astype(str)
    .str.replace(r'[\$,]', '', regex=True)
)

df_model['REPORTED_COST'] = pd.to_numeric(df_model['REPORTED_COST'], errors='coerce')

df_model = df_model.dropna(subset=['REPORTED_COST'])


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Make sure REPORTED_COST exists
assert 'REPORTED_COST' in df_model.columns, "Target not found."

# Select numeric columns
num = df_model.select_dtypes(include=['number']).copy()

X = num.drop(columns=['REPORTED_COST'])
y = df_model['REPORTED_COST']

# Drop empty/constant columns
X = X.loc[:, X.notna().any()]
X = X.loc[:, X.nunique(dropna=True) > 1]

# Impute missing numeric values
X = X.fillna(X.median(numeric_only=True))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("MAE :", mean_absolute_error(y_test, y_pred))
print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
print("R2  :", r2_score(y_test, y_pred))

sample_pred = model.predict(X_test.head(1))[0]
print("Sample prediction:", sample_pred)


MAE : 406849.9335043175
RMSE: 17675185.90173577
R2  : -5.8649353458450904e-05
Sample prediction: 122860.8188970089




In [6]:
import joblib

joblib.dump(model, "cost_model.joblib")
joblib.dump(X.columns.tolist(), "model_features.joblib")

print("Model saved!")


Model saved!


In [None]:
import os
os.listdir()


: 