In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import joblib
from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Input CSV and the directory where fitted models are persisted.
DATA_PATH = Path('dataset.csv')
MODEL_DIR = Path('models')
# parents=True keeps this working if MODEL_DIR is later pointed at a nested path.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
df = pd.read_csv(DATA_PATH)
# Fail early with a readable message instead of a KeyError deep inside the groupby.
assert {'month_year', 'category', 'amount'} <= set(df.columns), \
    f"{DATA_PATH} must contain the columns month_year, category and amount"

In [2]:
def monthly_pivot(df):
    """Return a month x category table of summed amounts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'month_year', 'category' and 'amount'.

    Returns
    -------
    pandas.DataFrame
        One row per month (chronologically ordered, index rendered as the
        string form of a monthly period), one column per category.
        Month/category pairs with no rows are filled with 0.
    """
    totals = df.groupby(['month_year', 'category'])['amount'].sum()
    wide = (
        totals
        .reset_index()
        .pivot(index='month_year', columns='category', values='amount')
        .fillna(0)
    )
    # Sort on real monthly periods so the order is chronological, not lexical.
    wide.index = pd.to_datetime(wide.index).to_period('M')
    ordered = wide.sort_index()
    # Plain string labels for the index once ordering is settled.
    ordered.index = ordered.index.astype(str)
    return ordered

# Month-by-category totals table consumed by the training and export cells below.
pivot_df = monthly_pivot(df)


In [3]:
def prepare_series(pivot_df, category):
    """Extract one category as a float series plus a model-ready frame.

    Parameters
    ----------
    pivot_df : pandas.DataFrame
        Wide table with one column per category.
    category : hashable
        Column to extract. A category that is not a column of ``pivot_df``
        yields an all-zero series over the same index.

    Returns
    -------
    (pandas.Series, pandas.DataFrame)
        The float series, and a frame holding the former index as a column,
        the values under 'amount', and a 0-based integer time step 't'.
    """
    if category in pivot_df.columns:
        series = pivot_df[category].astype(float)
    else:
        series = pd.Series(0.0, index=pivot_df.index, name=category)

    frame = (
        series
        .reset_index()
        .rename(columns={category: 'amount'})
        .assign(t=lambda d: np.arange(len(d)))
    )
    return series, frame


In [4]:
def train_predict_lr(df_ts):
    """Fit a linear trend over the time step and forecast the next period.

    Parameters
    ----------
    df_ts : pandas.DataFrame
        Frame with an 'amount' column and an integer time-step column 't'
        (the shape produced by ``prepare_series``).

    Returns
    -------
    (float, LinearRegression or None)
        The forecast for ``t_last + 1`` and the fitted model. With fewer than
        two rows no model is fitted: the last amount (0.0 for an empty frame)
        is returned together with ``None``.
    """
    if len(df_ts) < 2:
        # Too short for a trend: fall back to repeating the last value.
        last = float(df_ts['amount'].iloc[-1]) if len(df_ts) else 0.0
        return last, None

    X = df_ts[['t']].to_numpy()
    y = df_ts['amount'].to_numpy()
    # Train on every observation. The forecast targets the period after the
    # last row, so dropping the newest point would only make the returned
    # (and persisted) model staler than the data it is given.
    model = LinearRegression().fit(X, y)
    next_t = df_ts['t'].iloc[-1] + 1
    pred = float(model.predict([[next_t]])[0])
    return pred, model

def train_predict_rf(df_ts):
    """Fit a random forest on the time step and forecast the next period.

    Parameters
    ----------
    df_ts : pandas.DataFrame
        Frame with an 'amount' column and an integer time-step column 't'
        (the shape produced by ``prepare_series``).

    Returns
    -------
    (float, RandomForestRegressor or None)
        The forecast for ``t_last + 1`` and the fitted model. With fewer than
        two rows no model is fitted: the last amount (0.0 for an empty frame)
        is returned together with ``None``.
    """
    if len(df_ts) < 2:
        # Too short to train on: fall back to repeating the last value.
        last = float(df_ts['amount'].iloc[-1]) if len(df_ts) else 0.0
        return last, None

    X = df_ts[['t']].to_numpy()
    y = df_ts['amount'].to_numpy()
    # Train on every observation. The forecast targets the period after the
    # last row, so the newest point must be part of the fitted model.
    # random_state is fixed so repeated runs give the same forest.
    model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X, y)
    next_t = df_ts['t'].iloc[-1] + 1
    pred = float(model.predict([[next_t]])[0])
    return pred, model

def predict_ma(df_ts, k=3):
    """Forecast the next period as the mean of the last ``k`` amounts.

    Parameters
    ----------
    df_ts : pandas.DataFrame
        Frame with an 'amount' column, oldest row first.
    k : int, default 3
        Window size; clipped to the number of available rows.

    Returns
    -------
    float
        Mean of the trailing window, or 0.0 when ``df_ts`` is empty.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1. Without this check ``k=0`` would slice
        ``iloc[-0:]`` and silently average the entire history, and a negative
        ``k`` would slice from the wrong end.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    n_rows = len(df_ts)
    if n_rows == 0:
        return 0.0
    window = min(k, n_rows)
    return float(df_ts['amount'].iloc[-window:].mean())


In [5]:
def evaluate_prediction(df_ts, pred_value):
    """Score a single prediction against the last observed amount.

    Parameters
    ----------
    df_ts : pandas.DataFrame
        Frame with an 'amount' column; only its final row is used.
    pred_value : float
        Prediction to compare with that final amount.

    Returns
    -------
    dict
        ``{'rmse': ..., 'mae': ...}`` as floats. Both are ``None`` when
        ``df_ts`` has no rows.
    """
    if len(df_ts) == 0:
        return {'rmse': None, 'mae': None}

    residual = df_ts['amount'].iloc[-1] - pred_value
    return {
        'rmse': float(np.sqrt(residual ** 2)),
        'mae': float(abs(residual)),
    }


In [6]:
def train_all_and_save(pivot_df, save_models=True):
    """Forecast the next period for every category and optionally persist models.

    For each column of ``pivot_df`` the series is prepared with
    ``prepare_series``. Series shorter than four rows use the moving average
    directly. Otherwise the three candidates (linear trend, random forest,
    moving average) are back-tested: each one forecasts the final observed
    period using only the rows before it, and the candidate with the smallest
    absolute error wins (ties resolve in the order linear, rf, ma). The winner
    then produces the reported forecast from the full series.

    Parameters
    ----------
    pivot_df : pandas.DataFrame
        Wide table, one row per period and one column per category.
    save_models : bool, default True
        When true, a fitted winning model is written with ``joblib.dump`` to
        ``MODEL_DIR / "<category>__<chosen>.pkl"``.

    Returns
    -------
    (dict, dict)
        ``predictions`` maps category -> float forecast. ``models_info`` maps
        category -> ``{'chosen': 'linear'|'rf'|'ma', 'model_file': str|None}``;
        ``model_file`` is only set when a file was actually written.
    """
    predictions = {}
    models_info = {}
    for cat in pivot_df.columns:
        _, df_ts = prepare_series(pivot_df, cat)

        if len(df_ts) < 4:
            # Too little history to compare models; use the moving average.
            pred, model_obj, chosen = predict_ma(df_ts), None, 'ma'
        else:
            # Hold out the last observation and let every candidate forecast
            # it from the earlier rows only. Scoring the forecast for the
            # *next* period against the last value would compare different
            # time steps and let the moving average see its own target.
            history = df_ts.iloc[:-1]
            actual = float(df_ts['amount'].iloc[-1])
            backtest = {
                'linear': train_predict_lr(history)[0],
                'rf': train_predict_rf(history)[0],
                'ma': predict_ma(history),
            }
            errs = {name: abs(actual - guess) for name, guess in backtest.items()}
            chosen = min(errs, key=errs.get)

            # The reported forecast comes from the winner run on the full series.
            if chosen == 'linear':
                pred, model_obj = train_predict_lr(df_ts)
            elif chosen == 'rf':
                pred, model_obj = train_predict_rf(df_ts)
            else:
                pred, model_obj = predict_ma(df_ts), None

        model_file = None
        if save_models and model_obj is not None:
            # Path separators in a category name would point into a
            # sub-directory that does not exist; flatten them.
            safe_name = str(cat).replace('/', '_').replace('\\', '_')
            target = MODEL_DIR / f"{safe_name}__{chosen}.pkl"
            joblib.dump(model_obj, target)
            # Stored as str so models_info stays JSON-serialisable.
            model_file = str(target)

        predictions[cat] = float(pred)
        models_info[cat] = {'chosen': chosen, 'model_file': model_file}
    return predictions, models_info

# Per-category forecasts plus model metadata; save_models defaults to True, so a
# selected fitted model is also written under MODEL_DIR.
preds, minfo = train_all_and_save(pivot_df)


In [7]:
# Produce a minimal JSON payload to share with the frontend: the forecasts,
# the monthly history as a list of row records, and the per-category model choice.
out = {
    'predictions': preds,
    'trend_data': pivot_df.reset_index().to_dict(orient='records'),
    'models': minfo,
}
with open('predictions_output.json', 'w') as f:
    json.dump(out, f, indent=2)

