In [14]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
import joblib
import os

K = 10

def load_data(path: str = "./stock_levels_by_sensors.csv"):
    df = pd.read_csv(path)
    df.drop(columns=["Unnamed: 0"], inplace=True, errors='ignore')
    return df
def create_target_and_predictors(data: pd.DataFrame = None, target: str = "estimated_stock_pct"):
    if target not in data.columns:
        raise Exception(f"Target: {target} is not present in the data")

    # gonna have to force convert for some reason i dont know why
    data['estimated_stock_pct'] = pd.to_numeric(data['estimated_stock_pct'], errors='coerce')

    numeric_columns = ['estimated_stock_pct'] 

   
   # print("Selected numeric columns:", numeric_columns)

    X = data[numeric_columns]
    y = data[target]

    return X, y

def train_algorithm_with_cross_validation(X: pd.DataFrame = None, y: pd.Series = None):
    accuracy = []
    kf = KFold(n_splits=K, shuffle=True, random_state=42)
    
    for fold, (train_index, test_index) in enumerate(kf.split(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        print(f"Before scaling - Fold {fold + 1}: X_train shape = {X_train.shape}, X_test shape = {X_test.shape}")

        model = RandomForestRegressor()
        scaler = StandardScaler()

        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        trained_model = model.fit(X_train, y_train)
        
        model_filename = f"trained_model_fold_{fold + 1}.joblib"
        joblib.dump(trained_model, model_filename)
        print(f"Trained model saved as {model_filename}")

        y_pred = trained_model.predict(X_test)
        mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
        accuracy.append(mae)
        print(f"Fold {fold + 1}: MAE = {mae:}")

    print(f"Average MAE: {(sum(accuracy) / len(accuracy)):.2f}")




def run():
    df = load_data()
    X, y = create_target_and_predictors(data=df)
    train_algorithm_with_cross_validation(X=X, y=y)

run()


Before scaling - Fold 1: X_train shape = (13500, 1), X_test shape = (1500, 1)
Trained model saved as trained_model_fold_1.joblib
Fold 1: MAE = 0.21983923
Before scaling - Fold 2: X_train shape = (13500, 1), X_test shape = (1500, 1)
Trained model saved as trained_model_fold_2.joblib
Fold 2: MAE = 0.22309023
Before scaling - Fold 3: X_train shape = (13500, 1), X_test shape = (1500, 1)
Trained model saved as trained_model_fold_3.joblib
Fold 3: MAE = 0.23490293
Before scaling - Fold 4: X_train shape = (13500, 1), X_test shape = (1500, 1)
Trained model saved as trained_model_fold_4.joblib
Fold 4: MAE = 0.25919232
Before scaling - Fold 5: X_train shape = (13500, 1), X_test shape = (1500, 1)
Trained model saved as trained_model_fold_5.joblib
Fold 5: MAE = 0.24198293
Before scaling - Fold 6: X_train shape = (13500, 1), X_test shape = (1500, 1)
Trained model saved as trained_model_fold_6.joblib
Fold 6: MAE = 0.231782738
Before scaling - Fold 7: X_train shape = (13500, 1), X_test shape = (1500, 