In [4]:
import numpy as np
import pandas as pd
import os
from matplotlib import pyplot as plt
from settings import config
from pathlib import Path
from pandas_datareader.famafrench import get_available_datasets
import pandas_datareader.data as web

# Resolve project settings through `config`.
# Directory-valued settings are wrapped in Path objects.
DATA_DIR, DATA_MANUAL, OUTPUT_DIR = (
    Path(config(setting))
    for setting in ("DATA_DIR", "LOCAL_MANUAL_DATA_DIR", "OUTPUT_DIR")
)
# The remaining settings are kept exactly as `config` returns them.
WRDS_USERNAME, START_DATE, END_DATE = (
    config(setting)
    for setting in ("WRDS_USERNAME", "START_DATE", "END_DATE")
)

In [5]:
from dask.distributed import Client
import dask.dataframe as dd
import sys 
import math
from numba import njit

# Optional address of an already-running Dask scheduler (None when not configured).
scheduler = config("DASK_SCHEDULER_ADDRESS", default=None)

# Attach to the configured scheduler when there is one; otherwise Client()
# with no arguments falls back to launching a local cluster.
client = Client(scheduler) if scheduler else Client()
print("Connected to scheduler at:", client.scheduler.address)

Connected to scheduler at: tcp://13.220.85.47:8786



+-------------+-----------------+-----------------+-----------------+
| Package     | Client          | Scheduler       | Workers         |
+-------------+-----------------+-----------------+-----------------+
| cloudpickle | 3.0.0           | 3.1.1           | 3.1.1           |
| lz4         | 4.3.2           | 4.3.3           | 4.3.3           |
| msgpack     | 1.0.3           | 1.1.0           | 1.1.0           |
| python      | 3.10.16.final.0 | 3.10.12.final.0 | 3.10.12.final.0 |
| toolz       | 1.0.0           | 0.12.0          | 0.12.0          |
| tornado     | 6.5             | 6.4.2           | 6.4.2           |
+-------------+-----------------+-----------------+-----------------+


In [8]:
!pip install wrds 

Collecting wrds
  Downloading wrds-3.3.0-py3-none-any.whl.metadata (5.7 kB)
Collecting psycopg2-binary<2.10,>=2.9 (from wrds)
  Using cached psycopg2_binary-2.9.10-cp310-cp310-win_amd64.whl.metadata (5.0 kB)
Downloading wrds-3.3.0-py3-none-any.whl (13 kB)
Downloading psycopg2_binary-2.9.10-cp310-cp310-win_amd64.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   ---------------------------------------- 1.2/1.2 MB 8.3 MB/s eta 0:00:00
Installing collected packages: psycopg2-binary, wrds

   ---------------------------------------- 2/2 [wrds]

Successfully installed psycopg2-binary-2.9.10 wrds-3.3.0


In [12]:
import os
from pathlib import Path
from decouple import config
import pandas as pd
from dask import delayed, compute
from dask.diagnostics import ProgressBar
import regressions

# Folder that receives the generated LaTeX summary tables (created on demand).
summary_tex_dir = Path("..") / "reports" / "tables"
summary_tex_dir.mkdir(parents=True, exist_ok=True)

# Settings shared by every regression run below.
# `groups` maps the row label used in the summary tables to the dataset name
# handed to the `regressions` routines; insertion order fixes the row order.
groups = {
    "6-Portfolios": "6_Portfolios_2x3",
    "25-Portfolios": "25_Portfolios_5x5",
    "100-Portfolios": "100_Portfolios_10x10",
}
WEIGHTING = "BE_FYt-1_to_ME_June_t"
H = 1  # forwarded as `h` to the regression routines (presumably the horizon -- confirm)

# In-sample cutoff: a far-future date still inside pandas' Timestamp range
# (max 2262-04-11), intended to keep the full sample rather than truncate it.
# NOTE(review): this rebinds the END_DATE loaded from config in the first cell.
END_DATE = "2262-04-11"

# Date bounds passed to the monthly recursive forecast.
START_TRAIN = "1930-01-01"
END_TRAIN = "1980-01-01"
END_FORECAST = "2011-01-01"

@delayed
def process_monthly(label, dataset_name):
    """Run the monthly in-sample and recursive-forecast regressions for one dataset.

    Decorated with ``dask.delayed``: calling it only builds a lazy task, and the
    work happens when the task is computed.

    Parameters
    ----------
    label : key handed back unchanged as the first element of the result.
    dataset_name : dataset identifier forwarded to the ``regressions`` routines.

    Returns
    -------
    tuple
        ``(label, in_sample_r2, oos_r2)``. ``in_sample_r2`` is the ``rsquared``
        of the in-sample result's ``'third_model'`` entry, or NaN when that
        entry is None; ``oos_r2`` is the recursive result's ``'R2_oos'`` entry.
    """
    # Keyword arguments common to both regression calls.
    shared = dict(dataset_name=dataset_name, weighting=WEIGHTING, h=H)

    fit = regressions.run_in_sample_pls(**shared, end_date=END_DATE)  # keep full sample
    oos = regressions.run_recursive_forecast(
        **shared,
        start_train_date=START_TRAIN,
        end_train_date=END_TRAIN,
        end_forecast_date=END_FORECAST,
    )

    third = fit['third_model']
    r2_in = float('nan') if third is None else third.rsquared
    return label, r2_in, oos['R2_oos']

@delayed
def process_annual(label, dataset_name):
    """Run the annual in-sample and recursive-forecast regressions for one dataset.

    Decorated with ``dask.delayed``: calling it only builds a lazy task, and the
    work happens when the task is computed.

    Parameters
    ----------
    label : key handed back unchanged as the first element of the result.
    dataset_name : dataset identifier forwarded to the ``regressions`` routines.

    Returns
    -------
    tuple
        ``(label, in_sample_r2, oos_r2)``. ``in_sample_r2`` is the ``rsquared``
        of the in-sample result's ``'third_model'`` entry, or NaN when that
        entry is None; ``oos_r2`` is the third value returned by
        ``run_recursive_forecast_annual``.
    """
    # Keyword arguments common to both regression calls.
    shared = dict(dataset_name=dataset_name, weighting=WEIGHTING, h=H)

    fit = regressions.run_in_sample_pls_annual(**shared, end_date=END_DATE)  # keep full sample

    # The first two returned values (forecast and actual series) are not needed here.
    _, _, oos_r2 = regressions.run_recursive_forecast_annual(
        **shared,
        start_train_year=1930,
        end_train_year=1979,
        end_forecast_year=2010,
        n_components=1,
    )

    third = fit['third_model']
    r2_in = float('nan') if third is None else third.rsquared
    return label, r2_in, oos_r2


def _r2_rows(results):
    """Turn (label, in-sample R2, out-of-sample R2) triples into {label: row-dict}."""
    return {
        tag: {'R2 In-Sample': fit_r2, 'R2 Out-of-Sample': oos_r2}
        for tag, fit_r2, oos_r2 in results
    }


# Build one lazy task per portfolio group for each frequency.
monthly_tasks = [process_monthly(tag, dataset) for tag, dataset in groups.items()]
annual_tasks = [process_annual(tag, dataset) for tag, dataset in groups.items()]

# scheduler='threads' selects Dask's local threaded scheduler for these calls;
# the ProgressBar context reports progress for each compute() in turn.
with ProgressBar():
    monthly_results = compute(*monthly_tasks, scheduler='threads')
    annual_results = compute(*annual_tasks, scheduler='threads')

# Monthly summary: one row per portfolio group, written as a LaTeX table.
monthly_dict = _r2_rows(monthly_results)
summary_df_monthly = pd.DataFrame(monthly_dict).transpose()
summary_df_monthly.to_latex(
    summary_tex_dir / 'summary_table_monthly.tex',
    index=True,
    float_format='%.6f',
)

# Annual summary: same layout, separate file.
annual_dict = _r2_rows(annual_results)
summary_df_annual = pd.DataFrame(annual_dict).transpose()
summary_df_annual.to_latex(
    summary_tex_dir / 'summary_table_annual.tex',
    index=True,
    float_format='%.6f',
)

print('Regression tasks complete. Summary tables saved.')


[########################################] | 100% Completed | 10.83 s
[########################################] | 100% Completed | 3.59 ss
Regression tasks complete. Summary tables saved.
