In [27]:
# === Imports === 
import streamlit as st
import joblib
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error, mean_squared_error, root_mean_squared_error
from collections import Counter
import scipy.stats as stats
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
import pandas as pd
from sklearn.pipeline import Pipeline
import seaborn as sns
from statsmodels.tsa.arima.model import ARIMA
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from prophet import Prophet
import matplotlib.pyplot as plt

# Data Imports


In [28]:
# === Prophet Modeling === 
home_prices_df = pd.read_csv("Zillow Home Data.csv")
home_sales_count = pd.read_csv('Sold_Homes_US.csv')
mortgage_data = pd.read_csv("mortgage_rates.csv")

# Data Cleaning

In [29]:
Fayetteville_home_price_DF = home_prices_df[(home_prices_df["StateName"]== 'NC') & (home_prices_df['RegionName'] == 'Fayetteville, NC')]
fayetteville_prices = Fayetteville_home_price_DF.drop(columns = ['RegionID', 'SizeRank', 'RegionName', 'RegionType', 'StateName']).T
fayetteville_prices.columns = ["Price"]
fayetteville_prices.index = pd.to_datetime(fayetteville_prices.index)

Fayetteville_home_sales_count = home_sales_count[(home_prices_df["StateName"] == "NC")]
Fay_mean_sales_by_month = Fayetteville_home_sales_count.mean(axis=0, numeric_only=True).to_frame().T
Fay_mean_sales_by_month = Fay_mean_sales_by_month.drop(["RegionID", "SizeRank"], axis=1)
Fay_mean_sales_by_month.columns = pd.to_datetime(Fay_mean_sales_by_month.columns, )
test_df = Fay_mean_sales_by_month.T
Fay_mean_sales_by_month_T = Fay_mean_sales_by_month.T
Fay_mean_sales_by_month_T = Fay_mean_sales_by_month_T.reset_index()
Fay_mean_sales_by_month_T.columns = ['Date', 'Sales Per Month']  # Rename columns

clean_df = Fayetteville_home_price_DF.drop(["RegionID", "SizeRank", "RegionName", "StateName", "RegionType"], axis=1).T
cleaner_df = clean_df.reset_index()
df = cleaner_df.rename(columns={106: 'price', 'index': 'Date'})
df['Date'] = pd.to_datetime(df['Date'])                   
df["year"] = df['Date'].dt.year

# annual_avg = df.groupby('year')['price'].mean().reset_index()
rates_and_cost = pd.concat([mortgage_data, fayetteville_prices], ignore_index=True)

Fay_mean_sales_by_month = Fay_mean_sales_by_month.T
Fay_mean_sales_by_month = Fay_mean_sales_by_month.reset_index()
Fay_mean_sales_by_month.columns = ['Date', 'Sales_Volume']

mortgage_data['observation_date'] = pd.to_datetime(mortgage_data['observation_date'])

mortgage_data.rename(columns={'observation_date': 'Date'}, inplace=True)

combined_df = df.merge(mortgage_data, on='Date', how='outer')\
                .merge(Fay_mean_sales_by_month, on='Date', how='outer')


# Make sure you have a 'year' column
combined_df['year'] = combined_df['Date'].dt.year

# Fill NaN in SalesVolume with mean SalesVolume of that year
combined_df['Sales_Volume'] = combined_df.groupby('year')['Sales_Volume']\
    .transform(lambda x: x.fillna(x.mean()))


combined_df['price'] = combined_df.groupby('year')['price'].transform(lambda x: x.fillna(x.mean()))

combined_df['MORTGAGE30US'] = combined_df.groupby('year')['MORTGAGE30US'].transform(lambda x: x.fillna(x.mean()))


#  1 = Economic 
#  2 = US War
#  3 = International Conlfict

conflict_periods_years = [
    (2008, 2009, 1),  # Great Recession → Economic
    (2008, 2011, 2),  # War on Terror → US War
    (2011, 2014, 3),  # Arab Spring → International Conflict
    (2014, 2014, 3),  # Crimea Annexation → International Conflict
    (2018, 2019, 1),  # US-China Trade War → Economic
    (2020, 2022, 1),  # COVID → Economic
    (2022, 2025, 3),  # Russia-Ukraine → International Conflict
    (2023, 2025, 3)   # Israel–Hamas → International Conflict
]


# Ensure you have a 'year' column
combined_df['year'] = combined_df['Date'].dt.year

# Initialize the conflict column with default value, e.g. 0 (no conflict)
combined_df['conflict_type'] = 0

# Iterate over conflict periods and assign codes
for start_year, end_year, code in conflict_periods_years:
    mask = (combined_df['year'] >= start_year) & (combined_df['year'] <= end_year)
    combined_df.loc[mask, 'conflict_type'] = code


combined_df_post_2008_with_sales_volume = combined_df[combined_df['Date'] >= '2008-06-30']

combined_df_no_sales_volume = combined_df[combined_df['Date'] <= '2008-06-30']
combined_df_no_sales_volume = combined_df_no_sales_volume.drop('Sales_Volume', axis = 1)

combined_df_post_2008_with_sales_volume['Month'] = combined_df_post_2008_with_sales_volume['Date'].dt.month
combined_df_post_2008_with_sales_volume['Day'] = combined_df_post_2008_with_sales_volume['Date'].dt.day

combined_df_post_2008_with_sales_volume = combined_df_post_2008_with_sales_volume.drop('Date', axis = 1)

  Fayetteville_home_sales_count = home_sales_count[(home_prices_df["StateName"] == "NC")]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df_post_2008_with_sales_volume['Month'] = combined_df_post_2008_with_sales_volume['Date'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df_post_2008_with_sales_volume['Day'] = combined_df_post_2008_with_sales_volume['Date'].dt.day


# Creating Prophet Dataframe

In [30]:
combined_df = df.merge(mortgage_data, on='Date', how='outer')\
                .merge(Fay_mean_sales_by_month, on='Date', how='outer')

# Prophet Modeling


In [31]:
prophet_df = combined_df[['Date', 'price']]

prophet_df['ds'] = prophet_df['Date']

prophet_df['y'] =  prophet_df['price']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prophet_df['ds'] = prophet_df['Date']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prophet_df['y'] =  prophet_df['price']


In [32]:
prophet_df = prophet_df.drop(['Date', 'price'], axis =1)

model = Prophet(yearly_seasonality=True, weekly_seasonality=True,  daily_seasonality=False)

model.fit(prophet_df)

10:44:56 - cmdstanpy - INFO - Chain [1] start processing
10:44:57 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x2016e702f90>

In [33]:
future = model.make_future_dataframe(periods=100, freq='M')
forecast = model.predict(future)
forecast['yhat'] = forecast['yhat']

  dates = pd.date_range(


In [34]:
import matplotlib.pyplot as plt

# Generate the Prophet forecast plot
fig = model.plot(forecast)

# Customize the plot
ax = fig.gca()
ax.set_title('Prophet Forecast of Home Prices', fontsize=16)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Home Price ($)', fontsize=12)
ax.grid(True, which='both', linestyle='--', alpha=0.5)
ax.legend(['Predicted', 'Trend', 'Upper Bound', 'Lower Bound'], fontsize=10)

# Optionally rotate dates for better visibility
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

  plt.show()


# Saving the model


In [38]:
joblib.dump(model, 'prophet_model.pkl')

['prophet_model.pkl']

In [39]:
joblib.dump(combined_df, 'combined_df.pkl')

['combined_df.pkl']