In [None]:
import pandas as pd
import numpy as np

from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error
from google.colab import files

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Read the raw retail and wholesale price exports, in that order.
retail_raw, wholesale_raw = [
    pd.read_csv(csv_path)
    for csv_path in ("retail_price_data.csv", "wholesale_price_data.csv")
]


In [None]:
def parse_repeating_blocks(df):
    """Flatten a stacked price report into one record per (date, state, commodity).

    The frame is read as a sequence of blocks. A block opens with four
    consecutive summary rows whose first-column labels are "Average Price",
    "Maximum Price", "Minimum Price" and "Modal Price" (in that order). Every
    row after them, up to the next summary label, is treated as a state row
    belonging to that block.

    NOTE(review): a later cell defines ``parse_repeating_blocks`` again and
    replaces this definition once it runs; keep only one of the two.

    Parameters
    ----------
    df : pandas.DataFrame
        First column holds the row label (summary label or state name), the
        last column holds the date, and the columns in between are commodities.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``state``, ``commodity`` and ``price``. ``date`` is
        parsed from the block's "Average Price" row and ``price`` is the value
        found in the state row for that commodity. The frame is empty, but
        still has these four columns, when no complete block is found.
    """
    state_col = df.columns[0]
    date_col = df.columns[-1]
    commodity_cols = df.columns[1:-1]

    summary_labels = ["Average Price", "Maximum Price", "Minimum Price", "Modal Price"]
    # Pull the label column out once instead of materialising a row per lookup.
    labels = df[state_col].tolist()
    n_rows = len(labels)
    rows = []

    i = 0
    while i < n_rows:
        # A block only starts where all four summary rows appear in order.
        # The slice is shorter than four near the end of the frame, so a
        # truncated header is rejected by the same comparison.
        if labels[i:i + 4] != summary_labels:
            i += 1
            continue

        # Iterating a DataFrame slice yields column labels, not rows, so the
        # header row is fetched explicitly by position.
        date = pd.to_datetime(df.iloc[i][date_col])
        i += 4

        while i < n_rows and labels[i] not in summary_labels:
            state_row = df.iloc[i]
            for c in commodity_cols:
                rows.append({
                    "date": date,
                    "state": labels[i],
                    "commodity": c,
                    # The state's own price; the block average would give every
                    # state the same value.
                    "price": state_row[c],
                })
            i += 1

    # Explicit columns keep downstream column lookups valid for empty results.
    return pd.DataFrame(rows, columns=["date", "state", "commodity", "price"])


In [None]:
import pandas as pd

def parse_repeating_blocks(df):
    """Flatten a stacked price report into one record per (date, state, commodity).

    The frame is read as a sequence of blocks. A block opens with four
    consecutive summary rows whose first-column labels are "Average Price",
    "Maximum Price", "Minimum Price" and "Modal Price" (in that order). Every
    row after them, up to the next summary label, is treated as a state row
    belonging to that block. An "Average Price" row that is not followed by the
    other three summary rows is skipped, and any state rows after it are
    ignored until the next well-formed block header.

    Parameters
    ----------
    df : pandas.DataFrame
        First column holds the row label (summary label or state name), the
        last column holds the date, and the columns in between are commodities.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``state``, ``commodity`` and ``price``. ``date`` is
        parsed from the block's "Average Price" row and ``price`` is the value
        found in the state row for that commodity. The frame is empty, but
        still has these four columns, when no complete block is found.
    """
    state_col = df.columns[0]
    date_col = df.columns[-1]
    commodity_cols = df.columns[1:-1]

    summary_labels = ["Average Price", "Maximum Price", "Minimum Price", "Modal Price"]
    # Pull the label column out once instead of materialising a row per lookup.
    labels = df[state_col].tolist()
    n_rows = len(labels)
    rows = []

    i = 0
    while i < n_rows:
        # A block only starts where all four summary rows appear in order.
        # The slice is shorter than four near the end of the frame, so a
        # truncated header is rejected by the same comparison.
        if labels[i:i + 4] != summary_labels:
            i += 1
            continue

        date = pd.to_datetime(df.iloc[i][date_col])
        i += 4  # Move past the four summary rows.

        # Consume state rows until the next summary label or the end of the frame.
        while i < n_rows and labels[i] not in summary_labels:
            state_row = df.iloc[i]
            for c in commodity_cols:
                rows.append({
                    "date": date,
                    "state": labels[i],
                    "commodity": c,
                    # The state's own price; the block average would give every
                    # state the same value.
                    "price": state_row[c],
                })
            i += 1

    # Explicit columns keep downstream column lookups valid for empty results.
    return pd.DataFrame(rows, columns=["date", "state", "commodity", "price"])

# Flatten each raw report and tag its records with the market they came from.
retail = parse_repeating_blocks(retail_raw).assign(market="Retail")
wholesale = parse_repeating_blocks(wholesale_raw).assign(market="Wholesale")

# Commodities kept for this bucket of the analysis.
BUCKET_1 = ["Rice", "Atta (Wheat)", "Milk", "Tea Loose"]

# Stack both markets, then keep only the bucket's commodities.
data = (
    pd.concat([retail, wholesale], ignore_index=True)
    .loc[lambda frame: frame["commodity"].isin(BUCKET_1)]
)

In [None]:
def directional_accuracy(actual, forecast):
    """Return the share of steps where forecast and actual move the same way.

    Each sequence is reduced to the sign of its consecutive differences
    (-1, 0 or +1), and the result is the mean of the element-wise equality of
    those signs. With fewer than two points there are no steps to compare and
    ``np.mean`` of the empty comparison yields NaN.
    """
    actual_moves = np.sign(np.diff(actual))
    forecast_moves = np.sign(np.diff(forecast))
    same_direction = actual_moves == forecast_moves
    return np.mean(same_direction)

def evaluate_walk_forward(series, window=180, horizon=14):
    """Score ARIMA(1,1,1) on ``series`` with a rolling-origin evaluation.

    For every origin ``i`` in ``range(window, len(series) - horizon)`` a model
    is fit on the ``window`` observations before ``i`` and asked for
    ``horizon`` steps, which are compared with the next ``horizon``
    observations. Origins whose fit or forecast raises are skipped.

    Parameters
    ----------
    series : pandas.Series
        Numeric observations in time order (sliced with ``.iloc``).
    window : int, default 180
        Number of trailing observations used to fit each model.
    horizon : int, default 14
        Number of steps forecast and scored at each origin.

    Returns
    -------
    dict
        Keys ``MAE``, ``Avg_Deviation`` and ``Directional_Accuracy``, each the
        mean over the origins that were scored, or NaN when none were.
        ``Avg_Deviation`` is the mean absolute deviation of forecast from
        actual, so it equals ``MAE``; the key is kept for existing consumers.
    """
    maes, devs, dirs = [], [], []

    for i in range(window, len(series) - horizon):
        train = series.iloc[i - window:i]
        test = series.iloc[i:i + horizon]

        try:
            model = ARIMA(train, order=(1, 1, 1)).fit()
            forecast = model.forecast(horizon)
        except Exception:
            # Best effort: skip origins the model cannot handle. Catching
            # Exception (not everything) keeps the loop interruptible.
            continue

        # Compare by position. The forecast's index need not match the test
        # dates, and label-aligned subtraction would then produce NaN.
        actual_values = np.asarray(test)
        forecast_values = np.asarray(forecast)

        maes.append(mean_absolute_error(actual_values, forecast_values))
        devs.append(np.mean(np.abs(actual_values - forecast_values)))
        dirs.append(directional_accuracy(actual_values, forecast_values))

    def _mean_or_nan(values):
        # np.mean([]) returns NaN but also emits a RuntimeWarning.
        return np.mean(values) if values else np.nan

    return {
        "MAE": _mean_or_nan(maes),
        "Avg_Deviation": _mean_or_nan(devs),
        "Directional_Accuracy": _mean_or_nan(dirs),
    }


In [None]:
# Minimum number of cleaned observations a series needs before it is modelled,
# and the number of daily steps forecast past the last observed date.
MIN_OBSERVATIONS = 250
FORECAST_DAYS = 30

forecast_rows, accuracy_rows = [], []

for (market, crop, state), grp in data.groupby(["market", "commodity", "state"]):
    # Records whose date could not be parsed carry NaT. They sort last, which
    # makes series.index[-1] NaT and breaks pd.date_range below, so drop them.
    dated = grp.dropna(subset=["date"]).sort_values("date")

    # ARIMA needs numeric input: coerce non-numeric prices to NaN and drop them.
    series = pd.to_numeric(dated.set_index("date")["price"], errors="coerce").dropna()

    if len(series) < MIN_OBSERVATIONS:
        continue

    acc = evaluate_walk_forward(series)

    try:
        model = ARIMA(series, order=(1, 1, 1)).fit()
        future = model.forecast(FORECAST_DAYS)

        # Label the forecasts with consecutive calendar days after the last observation.
        future_dates = pd.date_range(
            series.index[-1] + pd.Timedelta(days=1), periods=FORECAST_DAYS
        )
        for d, p in zip(future_dates, future):
            forecast_rows.append({
                "date": d,
                "commodity": crop,
                "state": state,
                "market": market,
                "forecast_price": p
            })

        # Only record accuracy when every metric was actually computed.
        if acc and not any(pd.isna(v) for v in acc.values()):
            accuracy_rows.append({
                "commodity": crop,
                "state": state,
                "market": market,
                **acc
            })
    except Exception as e:
        # Best effort: report the failing group and move on to the next one.
        print(f"ARIMA model fitting failed for {(market, crop, state)}: {e}")


ARIMA model fitting failed for ('Retail', 'Atta (Wheat)', 'Andaman and Nicobar'): Neither `start` nor `end` can be NaT
ARIMA model fitting failed for ('Retail', 'Atta (Wheat)', 'Andhra Pradesh'): Neither `start` nor `end` can be NaT
ARIMA model fitting failed for ('Retail', 'Atta (Wheat)', 'Arunachal pradesh'): Neither `start` nor `end` can be NaT
ARIMA model fitting failed for ('Retail', 'Atta (Wheat)', 'Assam'): Neither `start` nor `end` can be NaT
ARIMA model fitting failed for ('Retail', 'Atta (Wheat)', 'Bihar'): Neither `start` nor `end` can be NaT
ARIMA model fitting failed for ('Retail', 'Atta (Wheat)', 'Chandigarh'): Neither `start` nor `end` can be NaT
ARIMA model fitting failed for ('Retail', 'Atta (Wheat)', 'Chhattisgarh'): Neither `start` nor `end` can be NaT
ARIMA model fitting failed for ('Retail', 'Atta (Wheat)', 'DNH and DD'): Neither `start` nor `end` can be NaT
ARIMA model fitting failed for ('Retail', 'Atta (Wheat)', 'Delhi'): Neither `start` nor `end` can be NaT
ARI

In [None]:
# Write both result tables to CSV first, then hand each file to the browser.
exports = {
    "bucket1_forecasts.csv": forecast_rows,
    "bucket1_accuracy.csv": accuracy_rows,
}

for csv_name, records in exports.items():
    pd.DataFrame(records).to_csv(csv_name, index=False)

for csv_name in exports:
    files.download(csv_name)
