# Data

## Setup

In [15]:
# Import necessary libraries
import os
import pandas as pd

# Data source abstraction
class DataSource:
    """Base class describing a locally stored raw data file and its origin.

    Subclasses implement ``_read_data`` to load and clean the file found at
    ``file_name``.

    Args:
        name: Human-readable name of the data set.
        source_name: Label identifying the provider of the data.
        url: Address the raw data can be obtained from; quoted in the error
            raised when the local file is missing.
        file_name: Path of the local raw data file.
    """

    def __init__(self, name, source_name, url, file_name):
        self.name = name
        self.source_name = source_name
        self.url = url
        self.file_name = file_name

    def fetch_data(self):
        """Return the result of ``_read_data`` for the local data file.

        Raises:
            FileNotFoundError: If ``file_name`` does not exist. The message
                names both the missing path and the URL to fetch it from.
            NotImplementedError: If the subclass does not override
                ``_read_data``.
        """
        if not os.path.exists(self.file_name):
            # Name the expected path so the user knows where to save the
            # downloaded file, not only where to download it from.
            raise FileNotFoundError(
                f"Data file not found: {self.file_name}. "
                f"Please fetch the data sheet from: {self.url}"
            )
        return self._read_data()

    def _read_data(self):
        """Load and clean the file at ``file_name``; subclasses must override."""
        raise NotImplementedError("Please implement the _read_data method")


In [16]:
# Ensure the output directory for cleaned data exists.
# makedirs(exist_ok=True) avoids the check-then-create race of
# os.path.exists() followed by os.mkdir().
os.makedirs('clean', exist_ok=True)


## Data Sources

For each source:

1. define it
2. initialize it
3. fetch data
4. save clean data

### S&P 500 Earnings Yield

Source: <https://www.multpl.com/s-p-500-earnings-yield>

In [17]:
class SP500EarningsYield(DataSource):
    """Data source that loads and cleans the S&P 500 earnings yield CSV."""

    def _read_data(self):
        """Read the CSV at ``self.file_name`` and normalise its columns.

        The ``Date`` column is parsed to datetimes, ``Year`` and ``Month``
        columns are derived from it, and ``Earnings Yield`` is converted from
        a percentage string (trailing ``%``) to a float fraction.

        Returns:
            The cleaned ``pandas.DataFrame``.

        Raises:
            ValueError: If a value cannot be converted; the underlying error
                is chained as the cause.
        """
        data = pd.read_csv(self.file_name)
        try:
            # Convert 'Date' column to datetime and extract the year and month
            data['Date'] = pd.to_datetime(data['Date'])
            data['Year'] = data['Date'].dt.year
            data['Month'] = data['Date'].dt.month

            # Convert 'Earnings Yield' from percentage string to float fraction
            data['Earnings Yield'] = data['Earnings Yield'].str.rstrip('%').astype('float') / 100.0
        except ValueError as e:
            # Chain explicitly so the traceback shows the original failure as
            # the direct cause of this error.
            raise ValueError(f"An error occurred while processing the data: {e}") from e
        return data

# Describe the S&P 500 earnings yield source and its local raw file
sp500_earnings_yield = SP500EarningsYield(
    name="S&P 500 Earnings Yield",
    source_name="Multipl.com 2023",
    url="https://www.multpl.com/s-p-500-earnings-yield",
    file_name="raw/sp500_earnings_yield.csv",
)

# Load the raw file and apply the source's cleaning steps
sp500_data = sp500_earnings_yield.fetch_data()

# Write the cleaned table to the clean/ directory, omitting the index
sp500_data.to_csv('clean/sp500_earnings_yield.csv', index=False)


### US 10-Year Treasury Yield

Source: <https://www.multpl.com/10-year-treasury-rate>

In [18]:
class US10YTreasuryYield(DataSource):
    """Data source that loads and cleans the US 10-year Treasury yield CSV."""

    def _read_data(self):
        """Read the CSV at ``self.file_name`` and normalise its columns.

        The ``Date`` column is parsed to datetimes, ``Year`` and ``Month``
        columns are derived from it, and ``Treasury Yield`` is converted from
        a percentage string (trailing ``%``) to a float fraction.

        Returns:
            The cleaned ``pandas.DataFrame``.

        Raises:
            ValueError: If a value cannot be converted; the underlying error
                is chained as the cause.
        """
        data = pd.read_csv(self.file_name)
        try:
            # Convert 'Date' column to datetime and extract the year and month
            data['Date'] = pd.to_datetime(data['Date'])
            data['Year'] = data['Date'].dt.year
            data['Month'] = data['Date'].dt.month

            # Convert 'Treasury Yield' from percentage string to float fraction
            data['Treasury Yield'] = data['Treasury Yield'].str.rstrip('%').astype('float') / 100.0
        except ValueError as e:
            # Chain explicitly so the traceback shows the original failure as
            # the direct cause of this error.
            raise ValueError(f"An error occurred while processing the data: {e}") from e
        return data

# Describe the US 10-year Treasury yield source and its local raw file
us10y_yield = US10YTreasuryYield(
    name="US Treasury 10-Year Yield",
    source_name="Multipl.com 2023",
    url="https://www.multpl.com/10-year-treasury-rate",
    file_name="raw/us10y_yield.csv",
)

# Load the raw file and apply the source's cleaning steps
us10y_data = us10y_yield.fetch_data()

# Write the cleaned table to the clean/ directory, omitting the index
us10y_data.to_csv('clean/us10y_yield.csv', index=False)
