In [3]:
import os
from src import config
import pandas as pd

folder_path = config.YAHOO_DATA_DIR
all_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]


test_data = pd.read_csv(os.path.join(folder_path, all_files[0]))
print(test_data.head())


         Date     Close      High       Low      Open   Volume  Log_Return
0  2024-01-01  0.003734  0.003809  0.003549  0.003678  9169185         NaN
1  2024-01-02  0.003706  0.003780  0.003660  0.003734  6344244   -0.007527
2  2024-01-03  0.003374  0.003740  0.003322  0.003705  8036622   -0.093854
3  2024-01-04  0.003450  0.003458  0.003327  0.003375  5588532    0.022275
4  2024-01-05  0.003270  0.003451  0.003206  0.003449  6058802   -0.053584


In [4]:
test_data = test_data.iloc[:-2]
print(test_data.head())

         Date     Close      High       Low      Open   Volume  Log_Return
0  2024-01-01  0.003734  0.003809  0.003549  0.003678  9169185         NaN
1  2024-01-02  0.003706  0.003780  0.003660  0.003734  6344244   -0.007527
2  2024-01-03  0.003374  0.003740  0.003322  0.003705  8036622   -0.093854
3  2024-01-04  0.003450  0.003458  0.003327  0.003375  5588532    0.022275
4  2024-01-05  0.003270  0.003451  0.003206  0.003449  6058802   -0.053584


In [None]:
for file in all_files:
    try:
        data = pd.read_csv(os.path.join(folder_path, file))
        # Remove the last two row which is often incomplete
        if not data.empty:
            data = data.iloc[:-2]
            data.to_csv(os.path.join(folder_path, file))
    except Exception as e:
        print(f"Could not process file {file}: {e}")

In [5]:
import requests
import pandas as pd

def check_ticker(coin_id):
    url = f"https://finance.yahoo.com/quote/{coin_id.upper()}-USD"
    return requests.get(url).status_code == 200

df = pd.read_csv(config.GECKO_TICKERS_FILE)  # column: 'id'
mapped, unmapped = [], []

for coin in df["coin_id"]:
    ticker = f"{coin.upper()}-USD"
    if check_ticker(coin):
        mapped.append({"coin_id": coin, "yahoo_ticker": ticker})
    else:
        unmapped.append(coin)

# pd.DataFrame(mapped).to_csv("mapped.csv", index=False)
# pd.DataFrame(unmapped, columns=["coin_id"]).to_csv("unmapped.csv", index=False)
print(mapped)



[]


In [10]:
import requests
import pandas as pd
from datetime import datetime

def get_ohlcv_history(coin_id, days='365'):
    """
    Fetches historical OHLCV (Open, High, Low, Close, Volume) data for a given coin from CoinGecko.
    The 'days' parameter can be an integer (e.g., '30', '90', '365') or 'max'.
    """
    url = f"https://api.coingecko.com/api/v3/coins/{coin_id}/market_chart"
    params = {
        'vs_currency': 'usd',
        'days': days,
        'interval': 'daily'
    }
    
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()
        data = response.json()

        # Extract prices and volume
        prices = data.get('prices', [])
        total_volumes = data.get('total_volumes', [])

        if not prices or not total_volumes:
            return None

        # Build base DataFrame
        df = pd.DataFrame({
            'date': pd.to_datetime([p[0] for p in prices], unit='ms').normalize(),
            'close': [p[1] for p in prices],
            'volume': [v[1] for v in total_volumes]
        })

        # Sort by date just in case
        df = df.sort_values('date').reset_index(drop=True)

        df['coin_id'] = coin_id

        # Remove today's partial data if present
        if df['date'].iloc[-1].date() == datetime.utcnow().date():
            df = df.iloc[:-1]

        return df[['date', 'coin_id', 'close', 'volume']]

    except Exception as e:
        print(f"Error fetching data for {coin_id}: {e}")
        return None

In [11]:
test_data = get_ohlcv_history('bitcoin')
print(test_data.head())

        date  coin_id         close        volume
0 2024-06-28  bitcoin  61562.645326  1.885514e+10
1 2024-06-29  bitcoin  60317.096979  2.439942e+10
2 2024-06-30  bitcoin  60864.443162  1.143511e+10
3 2024-07-01  bitcoin  62734.393839  1.737359e+10
4 2024-07-02  bitcoin  62819.822269  2.557307e+10


In [16]:
import pandas_datareader.data as web

def get_risk_free_rate(start_date, end_date):
    """
    Fetches the 3-Month Treasury Bill Rate from FRED as the risk-free rate.
    """
    print("Fetching risk-free rate from FRED...")
    try:
        # The series for 3-Month Treasury Bill Secondary Market Rate
        rf = web.DataReader('DTB3', 'fred', start_date, end_date)
        # Forward-fill missing values and convert to daily rate
        rf = rf.ffill() / 100 / 360
        return rf
    except Exception as e:
        print(f"Could not fetch risk-free rate: {e}. Returning zero series.")
        return pd.Series(0, index=pd.date_range(start=start_date, end=end_date), name='DTB3')

In [14]:
import numpy as np

pca_index_data = pd.read_csv(config.INDEX_FILE_PATH, index_col='Date', parse_dates=True)
pca_index_data['log_return'] = np.log(pca_index_data['Value'] / pca_index_data['Value'].shift(1))
print(pca_index_data.head())

                  Value  log_return
Date                               
2024-05-28  1000.000000         NaN
2024-05-29   988.448863   -0.011618
2024-05-30   974.915221   -0.013786
2024-05-31   977.997462    0.003157
2024-06-01   972.938874   -0.005186


In [57]:
rf_rate = get_risk_free_rate(pca_index_data.index.min(), pca_index_data.index.max())
print(pca_index_data.index.min())
print(rf_rate.head())
# Align risk-free rate with our data's dates
rf = rf_rate['DTB3'].reindex(pca_index_data.index, method='ffill')
print(rf.head())
# Market return is the equal-weighted average of all crypto returns
market_return = pca_index_data['log_return']
mkt = market_return - rf
print(mkt.head())


Fetching risk-free rate from FRED...
2024-05-28 00:00:00
                DTB3
DATE                
2024-05-28  0.000146
2024-05-29  0.000146
2024-05-30  0.000146
2024-05-31  0.000146
2024-06-03  0.000146
Date
2024-05-28    0.000146
2024-05-29    0.000146
2024-05-30    0.000146
2024-05-31    0.000146
2024-06-01    0.000146
Name: DTB3, dtype: float64
Date
2024-05-28         NaN
2024-05-29   -0.011764
2024-05-30   -0.013933
2024-05-31    0.003011
2024-06-01   -0.005332
dtype: float64


In [58]:
daily_return_data = pd.read_csv(config.DAILY_RETURN_FILE_PATH, index_col='Date', parse_dates=True)
market_cap_data = pd.read_csv(config.MARKET_CAP_FILE_PATH, index_col='Date', parse_dates=True)

In [23]:
mcaps_today = market_cap_data.loc['2024-05-28'].dropna()
mcaps_today = mcaps_today[mcaps_today > 0]
print(mcaps_today)

goldfinch                2.663202e+08
tars-protocol            1.751760e+07
chain-key-bitcoin        1.752771e+07
civic                    1.328616e+08
dkargo                   1.106146e+08
                             ...     
ethereum-name-service    7.903534e+08
kishu-inu                2.872678e+07
truefi                   2.068764e+08
energy-web-token         1.710840e+08
bitmart-token            1.202714e+08
Name: 2024-05-28 00:00:00, Length: 772, dtype: float64


In [28]:
print(mcaps_today.shape[0])

772


In [59]:
returns_today = daily_return_data.loc['2024-05-28'].dropna()
print(returns_today.shape)
print(returns_today)

KeyError: '2024-05-28'

In [40]:
sorted_market_caps = mcaps_today.sort_values(ascending=False)
mcaps_today.sort_values(ascending=False)


bitcoin          1.366931e+12
ethereum         4.676140e+11
tether           1.118769e+11
binancecoin      9.279450e+10
solana           7.629053e+10
                     ...     
metfi-2          4.648668e+06
edge             4.628990e+06
undeads-games    4.538313e+06
messier          4.073071e+06
web3shot         2.060256e+06
Name: 2024-05-28 00:00:00, Length: 772, dtype: float64

In [41]:
sorted_index = sorted_market_caps.index

In [None]:
# --- 2. Calculate SMB (Small Minus Big) Factor ---
# This loop calculates SMB for each day
smb_list = []
portfolio_size = 100 # Define the size of our portfolios

for date in pca_index_data.index:
    # Get market caps and returns for the current day, dropping NaNs
    mcaps_today = market_cap_data.loc[date].dropna()
    mcaps_today = mcaps_today[mcaps_today > 0]
    
    returns_today = daily_return_data.loc[date].dropna()
    
    sorted_market_caps = mcaps_today.sort_values(ascending=False)
    
    if len(mcaps_today) < 2*portfolio_size:
        smb_list.append(0)
        continue
    
    # --- Identify Portfolios ---
    # Get the tickers for the top 100 (Big) and bottom 100 (Small)
    big_portfolio_tickers = sorted_market_caps.head(portfolio_size).index
    small_portfolio_tickers = sorted_market_caps.tail(portfolio_size).index
    
    # --- Get Returns for Each Portfolio ---
    # Select the daily returns for the assets in each portfolio
    returns_today = returns_today.loc[date]
    big_portfolio_returns = returns_today.loc[big_portfolio_tickers]
    small_portfolio_returns = returns_today.loc[small_portfolio_tickers]
        
    # --- Calculate SMB Value for the Day ---
    # Calculate the equal-weighted average return for each portfolio
    big_return = big_portfolio_returns.mean()
    small_return = small_portfolio_returns.mean()
    
    # SMB is the return of the small portfolio minus the return of the big one
    smb_value = small_return - big_return
    smb_list.append(smb_value)

smb = pd.Series(smb_list, index=pca_index_data.index)

In [55]:
# --- 3. Calculate WML (Winners Minus Losers) Momentum Factor ---
# This is computationally intensive. We rebalance monthly for performance.
wml_list = []
# Get month-end dates for rebalancing
rebalance_dates = daily_return_data.resample('M').last().index

# Using a 11-month lookback, holding for 1 month
for i in range(12, len(rebalance_dates)):
    # Define portfolio formation period
    formation_date = rebalance_dates[i - 1]
    start_lookback = rebalance_dates[i - 12]

    # Calculate past returns (momentum)
    past_returns = (daily_return_data.loc[start_lookback:formation_date] + 1).prod() - 1
    # Clean any infinities that might arise from the calculation
    past_returns = past_returns.replace([np.inf, -np.inf], np.nan).dropna()
    
    if len(past_returns) < 10:  # Need enough assets to form portfolios
        continue

    # Define quantile-based portfolios (e.g., top and bottom 30%)
    q30 = past_returns.quantile(0.3)
    q70 = past_returns.quantile(0.7)
    losers = past_returns[past_returns <= q30].index
    winners = past_returns[past_returns >= q70].index

    # Calculate returns for the NEXT month
    holding_period_start = formation_date
    holding_period_end = rebalance_dates[i]

    # This is your new line that drops columns with any NaN in the holding period
    returns_next_month = daily_return_data.loc[holding_period_start:holding_period_end].dropna(axis=1, how='any')
    
    # ========================================================================
    # --- FIX: Find the intersection of desired tickers and available tickers ---
    # ========================================================================
    safe_low_tickers = losers.intersection(returns_next_month.columns)
    safe_high_tickers = winners.intersection(returns_next_month.columns)
    
    # Now, use these "safe" lists for the calculation
    low_ret_returns = returns_next_month[safe_low_tickers].mean(axis=1)
    high_ret_returns = returns_next_month[safe_high_tickers].mean(axis=1)
    
    monthly_hml = low_ret_returns - high_ret_returns
    wml_list.append(monthly_hml)

wml = pd.concat(wml_list)
print(wml)

Date
2024-12-31   -0.021107
2025-01-01   -0.007247
2025-01-02   -0.003675
2025-01-03    0.016596
2025-01-04    0.000955
                ...   
2025-06-21   -0.021128
2025-06-22   -0.011429
2025-06-23    0.046712
2025-06-24    0.003774
2025-06-25   -0.010946
Length: 182, dtype: float64


  rebalance_dates = daily_return_data.resample('M').last().index
