In [1]:
import sys
from dotenv import load_dotenv
import os

load_dotenv()

sys.path.append(os.getenv("ROOT"))

In [2]:
import polars as pl
import numpy as np
from datetime import date
from silverfund.datasets.crsp_daily import CRSPDaily
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load dataset
df = (
    CRSPDaily(start_date=date(2006, 1, 1), end_date=date(2024, 12, 31))
    .load_all()
    .select(["permno", "ticker", "date", "shrout", "prc", "ret", "vol"])
)

df

Loading CRSP Daily years:   0%|          | 0/19 [00:00<?, ?it/s]

Loading CRSP Daily years: 100%|██████████| 19/19 [00:01<00:00, 13.88it/s]


permno,ticker,date,shrout,prc,ret,vol
i64,str,date,f64,f64,f64,f64
10001,"""EWST""",2006-01-03,2930.0,9.69,-0.001031,700.0
10001,"""EWST""",2006-01-04,2930.0,9.8,0.011352,970.0
10001,"""EWST""",2006-01-05,2930.0,9.55,-0.02551,6682.0
10001,"""EWST""",2006-01-06,2930.0,9.399,-0.015812,12303.0
10001,"""EWST""",2006-01-09,2930.0,8.92,-0.050963,4723.0
…,…,…,…,…,…,…
93436,"""TSLA""",2024-12-24,3.21006e6,462.28,0.073572,5.9351506e7
93436,"""TSLA""",2024-12-26,3.21006e6,454.13,-0.01763,7.6392273e7
93436,"""TSLA""",2024-12-27,3.21006e6,431.66,-0.049479,8.2370345e7
93436,"""TSLA""",2024-12-30,3.21006e6,417.41,-0.033012,6.4705452e7


In [8]:
# Form reversal signal

# Log returns
df = df.with_columns(pl.col("ret").log1p().alias("logret"))


df = df.with_columns(pl.col("vol").pct_change().over("permno").alias("diff_vol"))

df = df.with_columns((pl.col("diff_vol") >= 1).cast(pl.Float64).alias("goodnews"))
print(df["goodnews"].sum())
df = df.with_columns((pl.col("diff_vol") <= -1).cast(pl.Float64).alias("badnews"))
print(df["badnews"].sum())

df = df.with_columns((pl.col("goodnews") - pl.col("badnews")).alias("news"))
print(df["news"].sum())

# df = df.with_columns((abs(pl.col('diff_vol'))>=2).cast(pl.Float64).alias('news2'))
# print(df['news'].sum())
# print(df['news2'].sum())

# # Rolling sum from t-23 to t-1
# df = df.with_columns(
#     pl.col("logret")
#     .rolling_sum(window_size=22, min_periods=22)
#     .shift(1)
#     .over("permno")
#     .alias("rev")
# )


# lagged news
df = df.with_columns(
    [pl.col("news").shift(i).over("permno").alias(f"news_lag{i}") for i in range(1, 31)]
)


df

2696671.0
166799.0
2529872.0


permno,ticker,date,shrout,prc,ret,vol,logret,diff_vol,news,goodnews,badnews,news_lag1,news_lag2,news_lag3,news_lag4,news_lag5,news_lag6,news_lag7,news_lag8,news_lag9,news_lag10,news_lag11,news_lag12,news_lag13,news_lag14,news_lag15,news_lag16,news_lag17,news_lag18,news_lag19,news_lag20,news_lag21,news_lag22,news_lag23,news_lag24,news_lag25,news_lag26,news_lag27,news_lag28,news_lag29,news_lag30
i64,str,date,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
10001,"""EWST""",2006-01-03,2930.0,9.69,-0.001031,700.0,-0.001032,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10001,"""EWST""",2006-01-04,2930.0,9.8,0.011352,970.0,0.011288,0.385714,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10001,"""EWST""",2006-01-05,2930.0,9.55,-0.02551,6682.0,-0.025841,5.88866,1.0,1.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10001,"""EWST""",2006-01-06,2930.0,9.399,-0.015812,12303.0,-0.015938,0.841215,0.0,0.0,0.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,
10001,"""EWST""",2006-01-09,2930.0,8.92,-0.050963,4723.0,-0.052307,-0.61611,0.0,0.0,0.0,0.0,1.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
93436,"""TSLA""",2024-12-24,3.21006e6,462.28,0.073572,5.9351506e7,0.070991,-0.180596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93436,"""TSLA""",2024-12-26,3.21006e6,454.13,-0.01763,7.6392273e7,-0.017787,0.287116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93436,"""TSLA""",2024-12-27,3.21006e6,431.66,-0.049479,8.2370345e7,-0.050745,0.078255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93436,"""TSLA""",2024-12-30,3.21006e6,417.41,-0.033012,6.4705452e7,-0.033569,-0.214457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Filters

# Price greater than 5
df = df.with_columns(pl.col("prc").shift(1).over("permno").alias("prclag"))
df = df.filter(pl.col("prclag") > 5)

df = df.drop_nulls(subset=["news_lag30"])

# # Non-null reversal signal
# df = df.drop_nulls(subset=["rev"])

df

permno,ticker,date,shrout,prc,ret,vol,logret,diff_vol,news,goodnews,badnews,news_lag1,news_lag2,news_lag3,news_lag4,news_lag5,news_lag6,news_lag7,news_lag8,news_lag9,news_lag10,news_lag11,news_lag12,news_lag13,news_lag14,news_lag15,news_lag16,news_lag17,news_lag18,news_lag19,news_lag20,news_lag21,news_lag22,news_lag23,news_lag24,news_lag25,news_lag26,news_lag27,news_lag28,news_lag29,news_lag30,prclag
i64,str,date,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
10001,"""EWST""",2006-02-16,2931.0,9.5,-0.003754,2385.0,-0.003761,1.773256,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,9.5358
10001,"""EWST""",2006-02-17,2931.0,9.6054,0.011095,2216.0,0.011034,-0.07086,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,9.5
10001,"""EWST""",2006-02-21,2931.0,9.6,-0.000562,4611.0,-0.000562,1.080776,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,9.6054
10001,"""EWST""",2006-02-22,2931.0,9.5,-0.010417,500.0,-0.010472,-0.891564,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9.6
10001,"""EWST""",2006-02-23,2931.0,9.3,-0.021053,3335.0,-0.021278,5.67,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,9.5
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
93436,"""TSLA""",2024-12-24,3.21006e6,462.28,0.073572,5.9351506e7,0.070991,-0.180596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,430.60001
93436,"""TSLA""",2024-12-26,3.21006e6,454.13,-0.01763,7.6392273e7,-0.017787,0.287116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,462.28
93436,"""TSLA""",2024-12-27,3.21006e6,431.66,-0.049479,8.2370345e7,-0.050745,0.078255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,454.13
93436,"""TSLA""",2024-12-30,3.21006e6,417.41,-0.033012,6.4705452e7,-0.033569,-0.214457,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,431.66


In [10]:
import statsmodels.api as sm

In [11]:
df = df.with_columns(pl.lit(1).alias("constant"))

In [12]:
X = df.select(["news"] + [f"news_lag{i}" for i in range(1, 31)] + ["constant"]).to_numpy()
y = df["logret"].to_numpy()
model = sm.OLS(y, X).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                         nan
Model:                            OLS   Adj. R-squared:                    nan
Method:                 Least Squares   F-statistic:                       nan
Date:                Wed, 29 Jan 2025   Prob (F-statistic):                nan
Time:                        17:11:53   Log-Likelihood:                    nan
No. Observations:            14140062   AIC:                               nan
Df Residuals:                14140030   BIC:                               nan
Df Model:                          31                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
x1                nan        nan        nan        n