## Import Libraries, Load Data, and Clean

In [None]:
# Random Forest Regression
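# Install libraries (run once in the environment, e.g. with pip):

# pip install numpy
# pip install pandas
# pip install matplotlib
# pip install seaborn
# pip install scikit-learn   (imported as sklearn)
# pip install statsmodels
# pip install openpyxl       (Excel engine used by pandas.read_excel for .xlsx files)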

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error


# Apply seaborn's default theme so plain matplotlib plots pick up its styling.
sns.set()  #if you want to use seaborn themes with matplotlib functions
import statsmodels.api as sm

import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings raised by later
# cells — confirm that a blanket filter is really wanted.
warnings.filterwarnings('ignore')



# Load the raw workbook; the path is relative to the notebook's working directory.
dfUnclean=pd.read_excel('F 1021 to 1025.xlsx')

In [None]:
# Check that the data looks all right: preview the first rows of the raw sheet
dfUnclean.head()

In [None]:
# The sheet also contains dividend entries: they add extra dated rows whose
# "Open" cell holds the dividend amount rather than an opening price.
# info() lists the column dtypes and non-null counts of the raw frame.
dfUnclean.info()

In [None]:
# Show the rows in which any column mentions 'Dividend' (case-insensitive).
# The text match runs once per column through the vectorised .str accessor
# instead of once per row via apply(axis=1), which is the slow path in pandas.
dividend_hits = dfUnclean.astype(str).apply(
    lambda col: col.str.contains("Dividend", case=False, na=False)
)
div_check = dfUnclean[dividend_hits.any(axis=1)]

print(div_check)

In [None]:
# Separate real price rows from dividend entries, then attach each dividend
# amount to the trading day that shares its date.
# dfUnclean is only read here, never reassigned, so the cell can be re-run
# without tripping over an already-merged "Dividend" column.

# Identify dividend rows: their "Open" cell contains the word "Dividend"
div_mask = dfUnclean["Open"].astype(str).str.contains("Dividend", case=False, na=False)

# .copy() gives independent frames, so the column assignments below do not
# write into slices of dfUnclean (chained-assignment / SettingWithCopy issue).
div_rows = dfUnclean.loc[div_mask, ["Date", "Open"]].copy()
price_rows = dfUnclean.loc[~div_mask].copy()

# Extract dividend amount (first number found in the "Open" text)
div_rows["Dividend"] = (
    div_rows["Open"]
    .astype(str)
    .str.extract(r"([0-9]*\.?[0-9]+)", expand=False)
    .astype(float)
)

# Convert Date to datetime so both frames merge on comparable keys
div_rows["Date"] = pd.to_datetime(div_rows["Date"])
price_rows["Date"] = pd.to_datetime(price_rows["Date"])
div_rows = div_rows[["Date", "Dividend"]]

# "Open" shared a column with dividend text, so it may still be object dtype;
# make it numeric now that only price rows remain.
price_rows["Open"] = pd.to_numeric(price_rows["Open"])

# Left merge keeps every price row; days without a dividend get 0.
df = price_rows.merge(div_rows, on="Date", how="left")
df["Dividend"] = df["Dividend"].fillna(0)

#convert volume to int
df["Volume"] = df["Volume"].astype(int)

print(df.iloc[50])


In [None]:
# Missing-data check: column summary first, then the share of nulls per column
df.info()
df.isna().mean()

In [None]:
# Check for duplicates.
# Only the last expression of a cell is displayed, so the duplicate-row count
# is printed explicitly; otherwise it is computed and silently discarded.
print(f"Duplicate rows: {df.duplicated().sum()}")
df.nunique()

## Adding Autoregressive Variables

In [None]:
def create_lagged_features(df, lags, cols):
    """Add ``<col>_lag_<k>`` columns (k = 1..lags) to ``df`` for each column in ``cols``.

    Each new column is ``df[col].shift(k)``, i.e. the value found ``k`` rows
    above, so the first ``k`` rows of it are NaN. ``df`` is modified in place
    and also returned.

    NOTE(review): "k rows above" is an earlier observation only if the rows are
    in ascending date order — confirm the ordering of ``df`` before relying on
    these columns as true lags.
    """
    # Lay out every (column, offset) pair first, column by column.
    lag_plan = [(name, k) for name in cols for k in range(1, lags + 1)]

    for name, k in lag_plan:
        df[f'{name}_lag_{k}'] = df[name].shift(k)

    return df


In [None]:
# Keep an independent copy of df as it stands now; the lag columns are added
# to df itself in a later cell.
dfCopy =df.copy()
df.head()

In [None]:
# Columns that receive lagged copies; with lags=2 each one gets
# <col>_lag_1 and <col>_lag_2.
lag_features = ['Volume', 'Open', 'Close']
# The helper writes the new columns into df in place, so the return value is
# not captured here.
create_lagged_features(df, lags=2, cols=lag_features)
df.head()

In [None]:
# Preview the frame with its lag columns. head must be *called*: a bare
# `df.head` only shows the bound-method repr instead of the first rows.
df.head()

## Splitting Data into Training and Test

In [None]:
target = "Close"

# Features = every "_lag_" column, plus any other column that is neither a raw
# lagged source, a same-day price field, "Date", nor the target.
# The lag columns are filtered out of the second group: they would otherwise be
# selected twice, giving the model duplicated columns and splitting their
# importance scores.
lag_cols = [c for c in df.columns if "_lag_" in c]
excluded = set(lag_features) | {"Date", "Adj Close", "High", "Low", target}
other_cols = [c for c in df.columns if c not in excluded and c not in lag_cols]

# shift() leaves NaN in the leading rows of every lag column; drop those rows so
# incomplete feature rows are neither fitted nor scored.
model_rows = df.dropna(subset=lag_cols)

Y = model_rows[target]
X = model_rows[lag_cols + other_cols]

#20% of data to test
# NOTE(review): the FIRST 20% of rows become the test set. That is the newest
# data only if df is ordered newest-first — and with that ordering shift(i)
# pulls values from later dates into the "lag" columns. Confirm the row order.
split = int(len(model_rows) * 0.2)

X_train, X_test = X.iloc[split:], X.iloc[:split]
Y_train, Y_test = Y.iloc[split:], Y.iloc[:split]


In [None]:
# Sanity check on the split: row count of each part, one per line
print(
    f"X_train Length {len(X_train)}",
    f"X_test Length {len(X_test)}",
    f"Y_train Length {len(Y_train)}",
    f"Y_test Length {len(Y_test)}",
    sep="\n",
)

## Create the Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Hyperparameters gathered in one place; random_state fixes the forest's seed.
forest_params = {
    "n_estimators": 1000,
    "max_depth": 50,
    "random_state": 25565,
}
model = RandomForestRegressor(**forest_params)

model.fit(X_train, Y_train)


In [None]:
# Predictions on the training rows and on the held-out rows, in that order
Y_pred_train, Y_pred_test = (
    model.predict(features) for features in (X_train, X_test)
)

## Test the model's accuracy

In [None]:
def evaluate(Y_true, Y_pred):
    """Return ``(mae, rmse, r2)`` for predictions ``Y_pred`` against ``Y_true``.

    MAE and R² come straight from the scikit-learn metric functions; RMSE is
    the square root of the mean squared error.
    """
    return (
        mean_absolute_error(Y_true, Y_pred),
        np.sqrt(mean_squared_error(Y_true, Y_pred)),
        r2_score(Y_true, Y_pred),
    )

train_mae, train_rmse, train_r2 = evaluate(Y_train, Y_pred_train)
test_mae, test_rmse, test_r2 = evaluate(Y_test, Y_pred_test)

# One print call per report; sep="\n" puts every entry on its own line.
print(
    "TRAINING PERFORMANCE:",
    f"MAE:  {train_mae:.4f}",
    f"RMSE: {train_rmse:.4f}",
    f"R²:   {train_r2:.4f}",
    sep="\n",
)

print(
    "\nTEST PERFORMANCE:",
    f"MAE:  {test_mae:.4f}",
    f"RMSE: {test_rmse:.4f}",
    f"R²:   {test_r2:.4f}",
    sep="\n",
)

In [None]:
# Actual vs. predicted target on the test rows, drawn through matplotlib's
# explicit figure/axes interface.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(Y_test.values, label="Actual")
ax.plot(Y_pred_test, label="Predicted")
ax.set_title("Actual vs Predicted (Test Set)")
ax.legend()
plt.show()

In [None]:
# Importance score of each training column as reported by the fitted forest,
# shown from most to least important (top 20).
importance = pd.Series(data=model.feature_importances_, index=X_train.columns)
ranked_importance = importance.sort_values(ascending=False)
ranked_importance.head(20)


In [None]:
# Horizontal bar chart of the 20 largest importance scores
top_importance = importance.sort_values().tail(20)
top_importance.plot.barh(figsize=(8, 6))

## Walk-forward multi-horizon testing: 1, 5, 10, 20, 30, 60, and 90 days into the future

In [None]:
def walk_forward_horizon_test(model, X, Y,horizons):
    """Score the model's row-``i`` prediction against the target ``h`` rows later.

    For every horizon ``h`` the prediction made from feature row ``i`` is paired
    with ``Y.iloc[i + h]`` for ``i = 0 .. len(X) - h - 1``. The features are
    never rolled forward, so the prediction for a row is the same at every
    horizon.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    X : pandas.DataFrame of feature rows.
    Y : pandas.Series of targets, positionally aligned with ``X``.
    horizons : iterable of int row offsets (each >= 0).

    Returns
    -------
    dict
        ``{h: {"MAE": ..., "RMSE": ..., "R2": ...}}`` for every horizon.

    Raises
    ------
    ValueError
        If a horizon is negative or leaves no (prediction, actual) pairs.

    NOTE(review): "h rows later" only means "h days ahead" when the rows are in
    ascending date order — confirm the ordering of ``X`` and ``Y``.
    """
    n_rows = len(X)
    results = {}
    all_preds = None

    for h in horizons:
        if h < 0:
            raise ValueError(f"horizon must be non-negative, got {h}")
        n_pairs = n_rows - h
        if n_pairs <= 0:
            raise ValueError(
                f"horizon {h} leaves no rows to score: X has only {n_rows} row(s)"
            )

        # Predictions do not depend on the horizon, so predict all rows in one
        # batch on first use instead of one model.predict call per row per
        # horizon.
        if all_preds is None:
            all_preds = np.asarray(model.predict(X))

        preds = all_preds[:n_pairs]
        actuals = Y.iloc[h:h + n_pairs].to_numpy()

        mae = mean_absolute_error(actuals, preds)
        rmse = np.sqrt(mean_squared_error(actuals, preds))
        r2 = r2_score(actuals, preds)

        results[h] = {"MAE": mae, "RMSE": rmse, "R2": r2}

    return results

In [None]:
horizons = [1, 5]  # ,10,20,30,60,90]
results = walk_forward_horizon_test(model, X_test, Y_test, horizons)

# MAE and RMSE per horizon; the trailing "\n" leaves a blank line after each block
for h, metrics in results.items():
    print(
        f"{h}-DAY FORECAST:",
        f"MAE:  {metrics['MAE']:.4f}",
        f"RMSE: {metrics['RMSE']:.4f}\n",
        sep="\n",
    )


In [None]:
def evaluate_multi_step_forecast(model, X_test, Y_test, max_horizon=10):

    results = []

    for horizon in range(1, max_horizon + 1):

        preds = []
        actuals = []

        # For each starting point, iteratively forecast ahead
        for i in range(len(X_test) - horizon):

            X_input = X_test.iloc[i].copy()

            # Iteratively predict up to the chosen horizon
            for step in range(horizon):
                pred = model.predict(X_input.values.reshape(1, -1))[0]

                # shift autoregressive variables
                if "Close_lag_1" in X_input.index:
                    X_input["Close_lag_1"] = pred
                if "Open_lag_1" in X_input.index:
                    X_input["Open_lag_1"] = pred
                if "Volume_lag_1" in X_input.index:
                    X_input["Volume_lag_1"] = pred

            preds.append(pred)
            actuals.append(Y_test.iloc[i + horizon])

        preds = np.array(preds)
        actuals = np.array(actuals)

        mae = mean_absolute_error(actuals, preds)
        rmse = np.sqrt(mean_squared_error(actuals, preds))
        r2 = r2_score(actuals, preds)

        results.append([horizon, mae, rmse, r2])

    return pd.DataFrame(results, columns=["Horizon", "MAE", "RMSE", "R2"])

In [None]:
# Multi-step forecast metrics for horizons 1 to 10, one row per horizon
results = evaluate_multi_step_forecast(
    model=model,
    X_test=X_test,
    Y_test=Y_test,
    max_horizon=10,
)
print(results)