In [6]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [7]:
# Load data
def load_data(path: str = "/path/to/csv/"):
    """
    Read a CSV file into a DataFrame, discarding a leftover index column.

    :param      path: location of the CSV file; any input accepted by
                ``pd.read_csv`` (string path, ``os.PathLike`` object or an
                open file-like object) may be given

    :return     df: pd.DataFrame, contents of the CSV without the
                "Unnamed: 0" column, when that column was present
    """
    # Hand ``path`` to pandas untouched: formatting it into a string first
    # would replace a file-like object with its repr and make it unreadable.
    df = pd.read_csv(path)

    # "Unnamed: 0" is the label pandas gives a first column with an empty
    # header (typically an index written out by ``DataFrame.to_csv``).
    # errors="ignore" makes the drop a no-op when the column is absent.
    return df.drop(columns=["Unnamed: 0"], errors="ignore")

In [8]:
# Create target variable and predictor variables
def create_target_and_predictors(
        data: pd.DataFrame = None,
        target: str = "estimated_stock_pct"
):
    """
    Split a DataFrame into predictor variables and a target variable.

    :param      data: pd.DataFrame, table holding the predictors and the target
    :param      target: str (optional), name of the target column

    :return     X: pd.DataFrame, every column of ``data`` except ``target``
    :return     y: pd.Series, the ``target`` column of ``data``

    :raises     TypeError: if ``data`` is None
    :raises     ValueError: if ``target`` is not a column of ``data``
    """
    # ``data`` defaults to None; fail with a clear message instead of an
    # AttributeError on ``.columns`` further down.
    if data is None:
        raise TypeError("data must be a pandas DataFrame, got None")

    # ValueError derives from Exception, so handlers written as
    # ``except Exception`` keep catching this error.
    if target not in data.columns:
        raise ValueError(f"Target: {target} is not present in the data")

    # ``drop`` returns a new frame, so the caller's DataFrame is left intact
    X = data.drop(columns=[target])
    y = data[target]
    return X, y

In [11]:
# Train algorithm with K-Fold Cross Validation
def train_algorithm_with_cross_validation(
        X: pd.DataFrame = None,
        y: pd.Series = None,
        n_splits: int = None
):
    """
    This function takes the predictor and target variables and
    trains a Random Forest Regressor model across K folds. Using
    cross-validation, the mean absolute error (MAE) is printed for
    each fold, followed by the average MAE over all folds.

    Within every fold a fresh StandardScaler is fitted on the training
    split only and then applied to both splits, so no statistics from
    the held-out split reach the model.

    :param      X: pd.DataFrame, predictor variables
    :param      y: pd.Series, target variable
    :param      n_splits: int (optional), number of folds for
                cross-validation; 5 is used when omitted

    :return     model: RandomForestRegressor fitted during the final fold

    :raises     TypeError: if ``X`` or ``y`` is None

    NOTE(review): the returned model is the estimator from the last fold
    only. It was fitted on that fold's training split after
    standardisation, and the fitted StandardScaler is discarded here.
    Consider returning a scaler + model pipeline refit on all of the data
    if the result is meant to be used for inference.
    """
    if X is None or y is None:
        raise TypeError("X and y must both be provided")

    # KFold rejects n_splits=None, so an omitted value falls back to 5,
    # which is the default KFold itself uses.
    if n_splits is None:
        n_splits = 5

    # Create a list that will store the MAE of each fold
    maes = []

    # Shuffled folds with a fixed seed so the split is reproducible
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

    for fold, (train_index, test_index) in enumerate(kf.split(X)):
        # Split data (positional indices, hence .iloc)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Fresh algorithm and scaler per fold so nothing carries over
        model = RandomForestRegressor(random_state=42)
        scaler = StandardScaler()

        # Fit the scaler on the training split only, then reuse it on
        # the held-out split
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Train model
        model.fit(X_train, y_train)

        # Generate predictions on test sample
        y_pred = model.predict(X_test)

        # Compute MAE
        mae = mean_absolute_error(y_test, y_pred)
        maes.append(mae)
        print(f"Fold {fold + 1}: MAE = {mae:.3f}")

    # Finish by computing the average MAE across all folds
    avg_mae = sum(maes) / len(maes)
    print(f"Average MAE: {avg_mae:.2f}")

    return model

In [10]:
# Save the trained model
def save_model(model, file_path):
    """
    Persist a trained model to disk using joblib.

    Args:
    model: Trained model to serialise.
    file_path (str): Destination the model is written to.
    """
    joblib.dump(value=model, filename=file_path)

In [None]:
# Main execution
if __name__ == "__main__":
    # Input CSV and output model locations -- placeholders, edit before running
    data_file_path = "path/to/your/data.csv"
    model_file_path = "path/to/save/your/model.pkl"

    # Read the CSV into a DataFrame
    data = load_data(path=data_file_path)

    # Separate the target column from the predictor columns
    X, y = create_target_and_predictors(data=data, target="estimated_stock_pct")

    # Run 10-fold cross-validation and keep the model it returns
    model = train_algorithm_with_cross_validation(X=X, y=y, n_splits=10)

    # Write the model to disk
    save_model(model=model, file_path=model_file_path)