1. Import all libraries that we'll use

In [14]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import PartialDependenceDisplay
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV

2. Load all datasets (Perform essential combination if needed)

In [15]:
sentiment_df_path = "/Users/eric_p/Desktop/Fall 2025/MFIN 7036/Group Project/Text_Data/vader_score_daily.parquet"
volatility_df_path = "/Users/eric_p/Desktop/Fall 2025/MFIN 7036/Group Project/Text_Data/btc_usd_daily_price_change_cryptocompare.csv"

sentiment_df = pd.read_parquet(sentiment_df_path)
volatility_df = pd.read_csv(volatility_df_path)

print(sentiment_df.head(5))
print(volatility_df.head(5))

         date  vader_score  post_count
0  2009-05-08       0.2239           4
1  2009-07-18       0.4968           1
2  2009-09-24       0.4906           2
3  2009-12-31       0.0000           1
4  2010-03-28       0.0516           1
         Date    Open    High    Low   Close  VolumeFrom  VolumeTo  \
0  2011-01-02  0.3000  0.3000  0.289  0.3000     5352.11   1584.66   
1  2011-01-03  0.3000  0.3000  0.290  0.2950     1425.19    420.85   
2  2011-01-04  0.2950  0.2999  0.289  0.2989     1879.00    548.33   
3  2011-01-05  0.2989  0.2990  0.290  0.2990      357.16    106.19   
4  2011-01-06  0.2990  0.2990  0.290  0.2980     3456.49   1031.28   

   price_change  abs_price_change  
0      0.000000          0.000000  
1     -0.016667          0.016667  
2      0.013220          0.013220  
3      0.000335          0.000335  
4     -0.003344          0.003344  


3. Make essential transformations on the dataset so that the forms are the same

In [16]:
# Adjust the date time
sentiment_df["date"] = pd.to_datetime(sentiment_df["date"])
volatility_df["date"] = pd.to_datetime(volatility_df["Date"])

# Sort the dataset by date
sentiment_df = sentiment_df.sort_values("date")
volatility_df = volatility_df.sort_values("date")

# Merge two datasets
combined_df = volatility_df.merge(
    sentiment_df,
    on="date",
    how="left"
)

combined_df = combined_df.rename(columns = {
    "vader_score": "sentiment_score",
    "abs_price_change": "volatility"
})

df = combined_df[["date", "sentiment_score", "volatility"]]

# Filter dates with no posts
df["sentiment_score"] = df["sentiment_score"].fillna(0.0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sentiment_score"] = df["sentiment_score"].fillna(0.0)


4. Construct rolling windows for Random Forest Measurement

In [17]:
# Create lagged variables 
df["sentiment_lag1"] = df["sentiment_score"].shift(1)
df["volatility_lag1"] = df["volatility"].shift(1)

df_model = df.dropna().copy()

# Split df_model into X and Y
X = df_model[["sentiment_lag1", "volatility_lag1"]]
Y = df_model["volatility"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sentiment_lag1"] = df["sentiment_score"].shift(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["volatility_lag1"] = df["volatility"].shift(1)


In [18]:
# Split the entire dataset into training and testing subsets
split_idx = int(len(df_model) * 0.8)

X_train = X.iloc[:split_idx]
Y_train = Y.iloc[:split_idx]

X_test = X.iloc[split_idx:]
Y_test = Y.iloc[split_idx:]

5. Build Random Forest Regressor

In [19]:
random_forest_model = RandomForestRegressor(
    n_estimators = 800,
    max_depth = 6,
    min_samples_leaf = 10,
    random_state = 42,
    n_jobs = -1
)

random_forest_model.fit(X_train, Y_train)

6. Model Performance Evaluation

In [12]:
Y_pred = random_forest_model.predict(X_test)

rmse = mean_squared_error(Y_test, Y_pred, squared=False)
mae = mean_absolute_error(Y_test, Y_pred)
r2 = r2_score(Y_test, Y_pred)

print("Out-of-sample performance:")
print(f"RMSE: {rmse:.6f}")
print(f"MAE : {mae:.6f}")
print(f"R²  : {r2:.4f}")


Out-of-sample performance:
RMSE: 0.019308
MAE : 0.016276
R²  : -0.2139


7. Finetuning & Cross-validation

In [20]:
tscv = TimeSeriesSplit(n_splits=5)

rf = RandomForestRegressor(
    random_state=42,
    n_jobs=-1
)

param_dist = {
    "n_estimators": [300, 500, 800, 1000],
    "max_depth": [3, 4, 5, 6, 7, None],
    "min_samples_leaf": [5, 10, 20, 30],
    "max_features": ["sqrt", 0.5, 1.0]
}

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,  # 15–30 次足够
    cv=tscv,
    scoring="neg_root_mean_squared_error",
    random_state=42,
    n_jobs=-1,
    verbose=2
)

random_search.fit(X, Y)

Fitting 5 folds for each of 20 candidates, totalling 100 fits




[CV] END max_depth=3, max_features=1.0, min_samples_leaf=30, n_estimators=500; total time=   0.8s
[CV] END max_depth=6, max_features=sqrt, min_samples_leaf=30, n_estimators=500; total time=   1.0s
[CV] END max_depth=6, max_features=sqrt, min_samples_leaf=30, n_estimators=500; total time=   1.1s
[CV] END max_depth=3, max_features=1.0, min_samples_leaf=30, n_estimators=500; total time=   1.1s
[CV] END max_depth=6, max_features=sqrt, min_samples_leaf=30, n_estimators=500; total time=   1.3s
[CV] END max_depth=3, max_features=1.0, min_samples_leaf=30, n_estimators=500; total time=   1.3s
[CV] END max_depth=3, max_features=1.0, min_samples_leaf=30, n_estimators=500; total time=   1.4s
[CV] END max_depth=3, max_features=1.0, min_samples_leaf=30, n_estimators=500; total time=   1.5s
[CV] END max_depth=6, max_features=sqrt, min_samples_leaf=30, n_estimators=500; total time=   1.5s
[CV] END max_depth=6, max_features=sqrt, min_samples_leaf=30, n_estimators=500; total time=   1.7s
[CV] END max_de

In [21]:
print("Best parameters:")
print(random_search.best_params_)

print("Best CV RMSE:")
print(-random_search.best_score_)

Best parameters:
{'n_estimators': 800, 'min_samples_leaf': 20, 'max_features': 1.0, 'max_depth': 3}
Best CV RMSE:
0.04651327608363375


8. Construct the Optimal Model

In [None]:
best_rf = random_search.best_estimator_

best_rf.fit(X_train, Y_train)
Y_pred = best_rf.predict(X_test)

In [23]:
best_rmse = mean_squared_error(Y_test, Y_pred, squared=False)
print(f"Final OOS RMSE: {best_rmse:.6f}")

Final OOS RMSE: 0.019308
