In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
import joblib

In [3]:
# Adjust paths if needed
reddit_df = pd.read_csv("../data/raw/reddit_wsb.csv")
stock_df = pd.read_csv("../data/raw/stock_prices.csv")

# Parse dates
reddit_df["timestamp"] = pd.to_datetime(reddit_df["timestamp"])
stock_df["date"] = pd.to_datetime(stock_df["date"])

reddit_df.head(), stock_df.head()

(                                               title  score      id  \
 0  It's not about the money, it's about sending a...     55  l6ulcx   
 1  Math Professor Scott Steiner says the numbers ...    110  l6uibd   
 2                                    Exit the system      0  l6uhhn   
 3  NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...     29  l6ugk6   
 4  Not to distract from GME, just thought our AMC...     71  l6ufgy   
 
                                                  url  comms_num       created  \
 0                    https://v.redd.it/6j75regs72e61          6  1.611863e+09   
 1                    https://v.redd.it/ah50lyny62e61         23  1.611862e+09   
 2  https://www.reddit.com/r/wallstreetbets/commen...         47  1.611862e+09   
 3  https://sec.report/Document/0001193125-21-019848/         74  1.611862e+09   
 4                https://i.redd.it/4h2sukb662e61.jpg        156  1.611862e+09   
 
                                                 body           timestam

In [7]:
reddit_df["timestamp"] = pd.to_datetime(reddit_df["timestamp"])

# Normalize to midnight datetime (NOT .dt.date)
reddit_df["date"] = reddit_df["timestamp"].dt.floor("D")

reddit_daily = (
    reddit_df
    .groupby("date")
    .agg(
        reddit_posts_24h=("id", "count"),
        reddit_avg_score=("score", "mean"),
        reddit_total_score=("score", "sum"),
        reddit_avg_comments=("comms_num", "mean")
    )
    .reset_index()
)

reddit_daily.dtypes

date                   datetime64[ns]
reddit_posts_24h                int64
reddit_avg_score              float64
reddit_total_score              int64
reddit_avg_comments           float64
dtype: object

In [10]:
stock_df["date"] = pd.to_datetime(stock_df["date"]).dt.floor("D")
stock_df = stock_df.sort_values("date")

stock_df["target_close"] = stock_df.groupby("ticker")["close"].shift(-1)

stock_features = stock_df[
    ["date", "ticker", "close", "volume", "target_close"]
]

stock_features.dtypes

date            datetime64[ns]
ticker                  object
close                  float64
volume                 float64
target_close           float64
dtype: object

In [12]:
dataset = stock_features.merge(
    reddit_daily,
    on="date",
    how="left"
)

# Fill missing reddit days
reddit_cols = [
    "reddit_posts_24h",
    "reddit_avg_score",
    "reddit_total_score",
    "reddit_avg_comments"
]

dataset[reddit_cols] = dataset[reddit_cols].fillna(0)

dataset = dataset.dropna(subset=["target_close"])

dataset.head()

Unnamed: 0,date,ticker,close,volume,target_close,reddit_posts_24h,reddit_avg_score,reddit_total_score,reddit_avg_comments
0,2020-09-29,AAPL,111.243,102615800.0,112.918,1.0,4.0,4.0,11.0
1,2020-09-29,AMC,48.5809,252275.6,47.0815,1.0,4.0,4.0,11.0
2,2020-09-29,BB,4.63,5846179.0,4.59,1.0,4.0,4.0,11.0
3,2020-09-29,GME,2.5875,20977360.0,2.55,1.0,4.0,4.0,11.0
4,2020-09-29,NOK,3.91604,25285650.0,3.87638,1.0,4.0,4.0,11.0


In [13]:
FEATURES = [
    "close",
    "volume",
    "reddit_posts_24h",
    "reddit_avg_score",
    "reddit_total_score",
    "reddit_avg_comments"
]

X = dataset[FEATURES]
y = dataset["target_close"]

In [14]:
tscv = TimeSeriesSplit(n_splits=5)
model = LinearRegression()

errors = []

for train_idx, test_idx in tscv.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    errors.append(mean_absolute_error(y_test, preds))

print("Mean MAE:", np.mean(errors))

Mean MAE: 17469.204301086273


In [15]:
model.fit(X, y)
joblib.dump(model, "price_predictor_v1.joblib")
print("Model saved as price_predictor_v1.joblib")

Model saved as price_predictor_v1.joblib


In [19]:
predictions = []

for _, row in dataset.iterrows():
    X_row = pd.DataFrame([row[FEATURES]], columns=FEATURES)
    predicted_price = model.predict(X_row)[0]

    current_close = row["close"]

    prediction_doc = {
        "ticker": row["ticker"],
        "timestamp": pd.Timestamp(row["date"]),
        "prediction_date": pd.Timestamp(row["date"]) + timedelta(days=1),
    
        "predicted_price": float(predicted_price),
        "actual_price": float(current_close),
    
        "prediction_error": float(predicted_price - current_close),
        "prediction_pct_error": float(
            (predicted_price - current_close) / current_close * 100
        ),
    
        "model_type": "linear_regression_v1",
    
        "features_used": {
            "current_close": float(current_close),
            "volume": int(row["volume"]),
            "reddit_posts_24h": int(row["reddit_posts_24h"]),
            "reddit_avg_score": float(row["reddit_avg_score"]),
            "reddit_total_score": float(row["reddit_total_score"]),
            "reddit_avg_comments": float(row["reddit_avg_comments"])
        },
    
        # confidence derived from relative error
        "confidence": float(
            max(0.5, 1 - abs(predicted_price - current_close) / current_close)
        ),
    
        "created_at": pd.Timestamp.now()
    }

    predictions.append(prediction_doc)

predictions[1]

{'ticker': 'AMC',
 'timestamp': Timestamp('2020-09-29 00:00:00'),
 'prediction_date': Timestamp('2020-09-30 00:00:00'),
 'predicted_price': 49.64007112744577,
 'actual_price': 48.5809,
 'prediction_error': 1.0591711274457722,
 'prediction_pct_error': 2.1802212957062803,
 'model_type': 'linear_regression_v1',
 'features_used': {'current_close': 48.5809,
  'volume': 252275,
  'reddit_posts_24h': 1,
  'reddit_avg_score': 4.0,
  'reddit_total_score': 4.0,
  'reddit_avg_comments': 11.0},
 'confidence': 0.9781977870429373,
 'created_at': Timestamp('2026-01-19 23:47:12.535069')}