In [17]:
#importing libraries
import logging
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from prophet import Prophet
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [18]:
 # Read the state-level time series CSV from the Colab working directory.
df = pd.read_csv(filepath_or_buffer="/content/State_time_series.csv")

In [19]:

# Inspect how pandas parsed the 'Date' column on load (it is still a plain
# object/string column at this point and gets converted further down).
print(df.dtypes['Date'])

object


In [20]:
# Show every available column name as a plain Python list.
print(list(df.columns))


['Date', 'RegionName', 'DaysOnZillow_AllHomes', 'InventorySeasonallyAdjusted_AllHomes', 'InventoryRaw_AllHomes', 'MedianListingPricePerSqft_1Bedroom', 'MedianListingPricePerSqft_2Bedroom', 'MedianListingPricePerSqft_3Bedroom', 'MedianListingPricePerSqft_4Bedroom', 'MedianListingPricePerSqft_5BedroomOrMore', 'MedianListingPricePerSqft_AllHomes', 'MedianListingPricePerSqft_CondoCoop', 'MedianListingPricePerSqft_DuplexTriplex', 'MedianListingPricePerSqft_SingleFamilyResidence', 'MedianListingPrice_1Bedroom', 'MedianListingPrice_2Bedroom', 'MedianListingPrice_3Bedroom', 'MedianListingPrice_4Bedroom', 'MedianListingPrice_5BedroomOrMore', 'MedianListingPrice_AllHomes', 'MedianListingPrice_CondoCoop', 'MedianListingPrice_DuplexTriplex', 'MedianListingPrice_SingleFamilyResidence', 'MedianPctOfPriceReduction_AllHomes', 'MedianPctOfPriceReduction_CondoCoop', 'MedianPctOfPriceReduction_SingleFamilyResidence', 'MedianPriceCutDollar_AllHomes', 'MedianPriceCutDollar_CondoCoop', 'MedianPriceCutDollar

In [21]:
# Parse the string dates into datetime64; values that cannot be parsed become
# NaT (errors='coerce') instead of raising.
parsed_dates = pd.to_datetime(df['Date'], errors='coerce')
df['Date'] = parsed_dates

In [22]:
# Keep only the columns needed for forecasting and order the rows
# chronologically within each region (original row labels are preserved).
multi_df = (
    df.loc[:, ['Date', 'RegionName', 'ZHVI_AllHomes']]
    .sort_values(by=['RegionName', 'Date'])
)

# Fill gaps in ZHVI_AllHomes by linear interpolation between neighbouring
# observations. Grouping by RegionName keeps each region's series separate so
# values never bleed across region boundaries.
zhvi_by_region = multi_df.groupby('RegionName')['ZHVI_AllHomes']
multi_df['ZHVI_AllHomes'] = zhvi_by_region.transform(
    lambda region_values: region_values.interpolate(method='linear')
)


In [23]:
# Count the missing values that remain in each column after interpolation.
multi_df.isna().sum()


Unnamed: 0,0
Date,1
RegionName,1
ZHVI_AllHomes,691


In [24]:
#using rolling method to handle outliers
# here we replace the extreme outliers by using the below function with smoothed median values
def rolling_median(series, window=3, threshold=3):
    roll_med = series.rolling(window=window, center=True, min_periods=1).median()
    diff = (series - roll_med).abs()
    mad = diff.rolling(window=window, center=True, min_periods=1).median()
    return series.where(diff <= threshold * mad, roll_med)

# Run the outlier filter independently on every region's series so the
# rolling windows never straddle two regions.
smoothed_zhvi = multi_df.groupby('RegionName')['ZHVI_AllHomes'].transform(rolling_median)
multi_df['ZHVI_AllHomes'] = smoothed_zhvi


In [25]:

# Preview the first rows of the cleaned frame.
print(multi_df.head(20))
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
multi_df.info()

          Date RegionName  ZHVI_AllHomes
0   1996-04-30    Alabama        79500.0
44  1996-05-31    Alabama        79200.0
90  1996-06-30    Alabama        79500.0
136 1996-07-31    Alabama        79700.0
182 1996-08-31    Alabama        79700.0
228 1996-09-30    Alabama        79700.0
274 1996-10-31    Alabama        80100.0
320 1996-11-30    Alabama        80600.0
366 1996-12-31    Alabama        80800.0
412 1997-01-31    Alabama        80700.0
458 1997-02-28    Alabama        80900.0
504 1997-03-31    Alabama        81300.0
550 1997-04-30    Alabama        81300.0
596 1997-05-31    Alabama        80900.0
642 1997-06-30    Alabama        80900.0
688 1997-07-31    Alabama        81100.0
734 1997-08-31    Alabama        81400.0
780 1997-09-30    Alabama        81700.0
827 1997-10-31    Alabama        81800.0
874 1997-11-30    Alabama        81900.0
<class 'pandas.core.frame.DataFrame'>
Index: 11016 entries, 0 to 11015
Data columns (total 3 columns):
 #   Column         Non-Null Count  

In [26]:
# Re-applies the datetime conversion to df['Date'] (this assigns, it does not
# merely check). The column was already converted with errors='coerce' in an
# earlier cell, so on a top-to-bottom run this line is expected to be a no-op.
df['Date'] = pd.to_datetime(df['Date'])

In [27]:
# Directory layout for Prophet artefacts: a root folder plus a "plots"
# sub-folder. Both are created if missing; existing folders are left alone.
output_dir = "prophet_outputs"
plot_dir = os.path.join(output_dir, "plots")
for directory in (output_dir, plot_dir):
    os.makedirs(directory, exist_ok=True)

In [28]:
# Distinct region names in order of first appearance (a missing name, if any,
# shows up as a NaN entry in this array).
regions = pd.unique(df['RegionName'])

In [29]:
import pandas as pd
import matplotlib.pyplot as plt
from prophet import Prophet
import os
import joblib
import io

# Output locations, all directly under /content:
#   csv_file          - combined forecast table for every region, as CSV
#   data_joblib_file  - the same forecast table serialised with joblib
#   plots_joblib_file - per-region PNG plot bytes serialised with joblib
csv_file = "/content/all_regions_forecast.csv"
data_joblib_file = "/content/all_forecasts_data.joblib"
plots_joblib_file = "/content/all_forecasts_plots.joblib"

def _figure_to_png(fig):
    """Render a matplotlib figure to PNG bytes and always close the figure."""
    buffer = io.BytesIO()
    try:
        fig.savefig(buffer, format='png')
    finally:
        # Close even if rendering fails so figures do not pile up across regions.
        plt.close(fig)
    return buffer.getvalue()


def train_and_forecast(region_df, reg, periods=12, min_obs=24):
    """Fit a Prophet model to one region's series and forecast ahead.

    Parameters
    ----------
    region_df : pandas.DataFrame
        Frame with a 'Date' column and a 'ZHVI_AllHomes' column; they are
        renamed to Prophet's 'ds' / 'y' before fitting. The input frame is
        not modified.
    reg : str
        Region label, used in the skip message, the plot title and the
        'RegionName' column of the returned forecast.
    periods : int, default 12
        Number of future monthly steps to forecast.
    min_obs : int, default 24
        Minimum number of rows required; shorter inputs are skipped.

    Returns
    -------
    tuple
        ``(forecast, plots)`` where ``forecast`` has the columns
        'ds', 'yhat', 'yhat_lower', 'yhat_upper', 'RegionName' (history plus
        future rows) and ``plots`` is a dict with PNG bytes under the keys
        'forecast_plot' and 'components_plot'.
        ``(None, None)`` when ``region_df`` has fewer than ``min_obs`` rows.
    """
    if len(region_df) < min_obs:
        print(f"Skipping {reg} (not enough data)")
        return None, None

    # Prepare for Prophet
    train_df = region_df.rename(columns={'Date': 'ds', 'ZHVI_AllHomes': 'y'})
    model = Prophet(yearly_seasonality=True, weekly_seasonality=False, daily_seasonality=False)
    model.fit(train_df)

    # The observations shown in this notebook are stamped at month END
    # (e.g. 1996-04-30), so the future frame must continue on month-end dates.
    # Month-start ('MS') would place the first forecast one day after the last
    # observation and off the data's cadence. An offset object is used instead
    # of an alias string because the month-end alias differs between pandas
    # versions.
    future = model.make_future_dataframe(periods=periods, freq=pd.offsets.MonthEnd())
    forecast = model.predict(future)
    forecast['RegionName'] = reg

    # Title the axes of the returned figure explicitly rather than relying on
    # pyplot's "current figure" global state.
    forecast_fig = model.plot(forecast)
    forecast_fig.gca().set_title(f"Forecast for {reg}")
    forecast_png = _figure_to_png(forecast_fig)

    components_png = _figure_to_png(model.plot_components(forecast))

    return forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper', 'RegionName']], {
        'forecast_plot': forecast_png,
        'components_plot': components_png
    }

# Train for all regions
all_forecasts_list = []
all_plots_dict = {}
regions = df["RegionName"].dropna().unique()

for reg in regions:
    region_df = df[df["RegionName"] == reg][['Date', 'ZHVI_AllHomes']].dropna()
    result, plots = train_and_forecast(region_df, reg)
    if result is not None:
        all_forecasts_list.append(result)
        all_plots_dict[reg] = plots

all_forecasts = pd.concat(all_forecasts_list, ignore_index=True)

all_forecasts.to_csv(csv_file, index=False)
joblib.dump(all_forecasts, data_joblib_file)
joblib.dump(all_plots_dict, plots_joblib_file)

print(" All outputs saved successfully!")
print("Files available in the Files panel:")
print(csv_file)
print(data_joblib_file)
print(plots_joblib_file)

# Show the files in Colab Files section
!ls -lh /content


DEBUG:cmdstanpy:input tempfile: /tmp/tmpq0x8v9xa/7nxwzkyo.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpq0x8v9xa/viiomi1q.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=37152', 'data', 'file=/tmp/tmpq0x8v9xa/7nxwzkyo.json', 'init=/tmp/tmpq0x8v9xa/viiomi1q.json', 'output', 'file=/tmp/tmpq0x8v9xa/prophet_model341x8pki/prophet_model-20251005063331.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
06:33:31 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
06:33:31 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmpq0x8v9xa/sze_n7pv.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpq0x8v9xa/3qxk9y_d.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/

Skipping Louisiana (not enough data)


DEBUG:cmdstanpy:input tempfile: /tmp/tmpq0x8v9xa/7e76fxre.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpq0x8v9xa/ckyvg0s7.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.12/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=2443', 'data', 'file=/tmp/tmpq0x8v9xa/7e76fxre.json', 'init=/tmp/tmpq0x8v9xa/ckyvg0s7.json', 'output', 'file=/tmp/tmpq0x8v9xa/prophet_modelfiva2xz8/prophet_model-20251005063347.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
06:33:47 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
06:33:47 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmpq0x8v9xa/aosru8id.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpq0x8v9xa/j6y5pq3_.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/l

Skipping UnitedStates (not enough data)
 All outputs saved successfully!
Files available in the Files panel:
/content/all_regions_forecast.csv
/content/all_forecasts_data.joblib
/content/all_forecasts_plots.joblib
total 11M
-rw-r--r-- 1 root root 365K Oct  5 06:34 all_forecasts_data.joblib
-rw-r--r-- 1 root root 6.4M Oct  5 06:34 all_forecasts_plots.joblib
-rw-r--r-- 1 root root 813K Oct  5 06:34 all_regions_forecast.csv
drwxr-xr-x 2 root root 4.0K Oct  5 06:24 forecast_outputs
drwxr-xr-x 2 root root 4.0K Oct  5 06:24 forecast_plots
drwxr-xr-x 3 root root 4.0K Oct  5 06:19 prophet_outputs
drwxr-xr-x 1 root root 4.0K Oct  2 13:36 sample_data
-rw-r--r-- 1 root root 3.0M Oct  5 06:19 State_time_series.csv
