# EPS preprocessor
This program takes a CSV file of quarterly EPS data for companies (date, estimated EPS, reported EPS, % surprise) as input and calculates: the estimated EPS growth for the next quarter, the average EPS growth over the last two quarters, the average surprise over the last two quarters, and the YoY EPS growth (current TTM vs. the TTM a year ago). The metrics are calculated for every company and for every month in a given range of years.

### 1. Imports and helper function

In [6]:
import pandas as pd
import numpy as np

def get_latest_data_index(ticker, date, grouped_data):
    """Return the position of the latest report published strictly before *date*.

    Args:
        ticker: Key into ``grouped_data`` selecting one company's table.
        date: Cut-off date; a report dated exactly on ``date`` does not count.
        grouped_data: Mapping of ticker to a table with a ``'Date'`` column,
            expected to be in ascending order by report date.

    Returns:
        The zero-based row position of the most recent report dated before
        ``date``, or ``np.nan`` when the first report is not earlier than
        ``date``.
    """
    report_dates = grouped_data[ticker]['Date'].values

    # Count the leading run of reports dated before the cut-off; because the
    # dates are ascending, the scan can stop at the first one that is not.
    earlier_reports = 0
    for report_date in report_dates:
        if not report_date < date:
            break
        earlier_reports += 1

    if earlier_reports == 0:
        return np.nan

    return earlier_reports - 1  # index of most recent publicly available EPS data


### 2. Main function

In [11]:
def calculate_eps_data(eps_filename, start_year, end_year):
    """Build a monthly table of EPS-derived metrics for every ticker.

    The CSV must provide the columns ``symbol``, ``earnings_date``,
    ``eps_estimate``, ``eps_reported`` and ``eps_surprise``. For each ticker
    and each month start between January of ``start_year`` and December of
    ``end_year``, the metrics of the latest report dated strictly before that
    month start are used (the same rule as ``get_latest_data_index``).

    Metrics, computed per ticker on reports in chronological order:
        * ``Forecast Growth``: (next report's estimate - reported) / reported.
        * ``Avg 2Q Growth``: mean of the last two quarter-over-quarter
          percentage changes of reported EPS.
        * ``Avg 2Q Surprise``: mean of the last two ``eps_surprise`` values.
        * ``YoY Growth``: relative change between the sum of the last four
          reported EPS values and the sum of the four before them.

    Args:
        eps_filename: Path of the input CSV file.
        start_year: First year (inclusive) of the monthly grid.
        end_year: Last year (inclusive) of the monthly grid.

    Returns:
        A DataFrame with the columns ``Ticker``, ``Year``, ``Month`` and the
        four metrics; rows containing any missing value are dropped.

    Note:
        The ratios divide by reported EPS, so a zero denominator yields
        ``inf``, which ``dropna`` does not remove.
    """
    metric_columns = ['Forecast Growth', 'Avg 2Q Growth', 'Avg 2Q Surprise', 'YoY Growth']

    eps_data = pd.read_csv(eps_filename)
    print(f"Number of rows for EPS: {len(eps_data)}")

    eps_data['earnings_date'] = pd.to_datetime(eps_data['earnings_date'])
    for column in ('eps_estimate', 'eps_reported', 'eps_surprise'):
        eps_data[column] = pd.to_numeric(eps_data[column], errors='coerce')
    # The file is expected newest-first; reversing it keeps the historical
    # ticker order and tie order. Chronology itself is enforced per ticker below.
    eps_data = eps_data.iloc[::-1]

    metrics = {}
    for symbol, group in eps_data.groupby('symbol'):
        # Sort explicitly so the shift/rolling windows are chronological even
        # when the file is not strictly newest-first. A stable sort keeps the
        # reversed-file order for reports sharing a date.
        group = group.sort_values('earnings_date', kind='stable')
        reported = group['eps_reported']
        ttm_now = reported.rolling(window=4, min_periods=4).sum()
        ttm_year_ago = reported.shift(4).rolling(window=4, min_periods=4).sum()
        metrics[symbol] = pd.DataFrame({
            'Date': group['earnings_date'],
            'Forecast Growth': (group['eps_estimate'].shift(-1) - reported) / reported,
            'Avg 2Q Growth': reported.pct_change(fill_method=None).rolling(window=2, min_periods=2).mean(),
            'Avg 2Q Surprise': group['eps_surprise'].rolling(window=2, min_periods=2).mean(),
            'YoY Growth': (ttm_now - ttm_year_ago) / ttm_year_ago,
        })

    # Creating dataset for every month for the given year period
    all_months = pd.date_range(start=f'{start_year}-01-01', end=f'{end_year}-12-31', freq='MS')
    # groupby() skips rows with a missing symbol, so skip them here as well;
    # otherwise the metrics lookup below would raise KeyError.
    tickers = eps_data['symbol'].dropna().unique()
    month_grid = pd.MultiIndex.from_product([tickers, all_months], names=['Ticker', 'Date'])
    monthly_data = pd.DataFrame(index=month_grid).reset_index()

    monthly_data['Year'] = monthly_data['Date'].dt.year
    monthly_data['Month'] = monthly_data['Date'].dt.month

    # The grid is ticker-major: each ticker owns one contiguous run of
    # len(all_months) rows, filled with a single vectorized lookup.
    month_values = all_months.values
    months_per_ticker = len(month_values)
    filled = {name: np.full(len(monthly_data), np.nan) for name in metric_columns}
    for position, ticker in enumerate(tickers):
        ticker_metrics = metrics[ticker]
        # side='left' counts reports dated strictly before each month start;
        # subtracting one gives the latest such report, or -1 when none exists.
        latest = np.searchsorted(ticker_metrics['Date'].values, month_values, side='left') - 1
        has_report = latest >= 0
        rows = slice(position * months_per_ticker, (position + 1) * months_per_ticker)
        for name in metric_columns:
            values = ticker_metrics[name].values
            # latest == -1 would wrap to the last report, so mask it to NaN.
            filled[name][rows] = np.where(has_report, values[latest], np.nan)

    for name in metric_columns:
        monthly_data[name] = filled[name]

    monthly_data = monthly_data.dropna()
    print(f"Final number of rows: {len(monthly_data)}")

    return monthly_data[['Ticker', 'Year', 'Month'] + metric_columns]

### 3. Data and program execution

In [12]:
# Run configuration: input CSV read by calculate_eps_data, the inclusive
# range of years for the monthly grid, and the destination CSV.
eps_filename = "datasets/EPS.csv"
start_year = 2017
end_year = 2024
output_filename = 'datasets/EPS_growth_data.csv'

# Compute the monthly metrics table and save it without the DataFrame index.
eps_data = calculate_eps_data(eps_filename, start_year, end_year)
eps_data.to_csv(output_filename, index=False)

Number of rows for EPS: 43898
Final number of rows: 44208
