In [None]:
import warnings

import numpy as np
import pandas as pd
import statsmodels.api as sm
from google.colab import files
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Data Preprocessing

In [None]:
# Locations of the raw CSV exports.
#   file_path1: Lithium Carbonate 99%Min China spot series (target individual return)
#   file_path2: Global X Lithium & Battery Tech ETF (LIT)          - aggregate return 1
#   file_path3: Sprott Lithium Miners ETF (LITP)                   - aggregate return 2
#   file_path4: iShares Lithium Miners And Producers ETF (ILIT)    - aggregate return 3
file_path1 = '/content/Lithium Carbonate 99%Min China Spot Historical Data (2).csv'
file_path2 = '/content/LIT ETF Stock Price History.csv'
file_path3 = '/content/LITP ETF Stock Price History.csv'
file_path4 = '/content/ILIT ETF Stock Price History.csv'

# Read every file into its own raw DataFrame.
LTCB_99 = pd.read_csv(file_path1)
LIT_ETF = pd.read_csv(file_path2)
LITP_ETF = pd.read_csv(file_path3)
ILIT_ETF = pd.read_csv(file_path4)

In [None]:
def data_preprocess(dataset):
    """
    Preprocess the return time series data for a given dataset.

    The input is copied first, so the caller's DataFrame is never modified and
    the function can safely be re-run on the same raw frame. On the copy, the
    'Date' column is parsed to datetimes, rows are sorted chronologically,
    'Date' becomes the index, and the 'Change %' column is renamed to
    'Obs_Return' and converted from percentage strings (e.g. '1.25%') to
    decimal floats (0.0125). Rows whose return is missing are dropped.

    Parameters:
    dataset (pd.DataFrame): DataFrame containing a 'Date' column and a
        'Change %' column of percentage strings.

    Returns:
    pd.DataFrame: Preprocessed return time series data, indexed by 'Date'.

    Side effects:
    Prints the number of missing return values found before dropping them.
    """
    # Work on a copy: assigning the parsed dates directly onto `dataset`
    # would silently change the caller's raw frame.
    prepared = dataset.copy()
    prepared['Date'] = pd.to_datetime(prepared['Date'])
    prepared = (
        prepared
        .sort_values('Date')
        .set_index('Date')
        .rename(columns={'Change %': 'Obs_Return'})
    )

    # regex=False: '%' is a literal character here, and being explicit avoids
    # depending on the pandas-version-specific default of `regex`.
    prepared['Obs_Return'] = (
        prepared['Obs_Return'].str.replace('%', '', regex=False).astype(float) / 100
    )

    return_na_count = prepared['Obs_Return'].isna().sum()
    print("Number of missing values:", return_na_count)
    if return_na_count > 0:
        prepared = prepared.dropna(subset=['Obs_Return'])
    return prepared

In [None]:
# Run the shared preprocessing over the spot series and the three ETF series,
# in the same order as they were loaded.
LTCB_99, LIT_ETF, LITP_ETF, ILIT_ETF = [
    data_preprocess(raw_frame)
    for raw_frame in (LTCB_99, LIT_ETF, LITP_ETF, ILIT_ETF)
]

Number of missing values: 0
Number of missing values: 0
Number of missing values: 0
Number of missing values: 0


In [None]:
# Some date missed in LTCB_99 dataset, some date missed in ETF dataset
LTCB_LIT = pd.merge(LTCB_99, LIT_ETF, left_index=True, right_index=True, how='inner')
LTCB_LITP = pd.merge(LTCB_99, LITP_ETF, left_index=True, right_index=True, how='inner')
LTCB_ILIT = pd.merge(LTCB_99, ILIT_ETF, left_index=True, right_index=True, how='inner')

In [None]:
# After the merge only the two return columns are needed:
# '_x' comes from the left frame (LTCB_99), '_y' from the right frame (the ETF).
columns_to_keep = ['Obs_Return_x', 'Obs_Return_y']
LTCB_LIT, LTCB_LITP, LTCB_ILIT = [
    merged_frame[columns_to_keep]
    for merged_frame in (LTCB_LIT, LTCB_LITP, LTCB_ILIT)
]

In [None]:
# Column legend for the merged frame shown below:
#   'Obs_Return_x' is the observed individual return (LTCB_99)
#   'Obs_Return_y' is the observed aggregate return (the ETF)
print(LTCB_LIT.tail(5))

            Obs_Return_x  Obs_Return_y
Date                                  
2024-04-03        0.0095        0.0099
2024-04-08        0.0282        0.0169
2024-04-09        0.0000        0.0199
2024-04-10        0.0091       -0.0287
2024-04-11       -0.0090        0.0106


# 3-step Unsmoothing

In [None]:
def fit_ma_model(series, max_ma_order=5):
    """
    Fits MA models to a time series and selects the best lag using the Akaike Information Criterion (AIC).

    Every order from MA(0) up to MA(max_ma_order) is fitted as ARIMA(0, 0, q)
    and the fit with the lowest AIC is kept. An order whose fit raises an
    ordinary exception is skipped with a RuntimeWarning, so a single failing
    order does not abort the search; KeyboardInterrupt and SystemExit are
    not intercepted.

    Parameters:
    series (pd.Series): Time series data for which the MA model is to be fitted.
    max_ma_order (int): Highest MA order to try (inclusive). Defaults to 5,
        i.e. one trading week of daily lags.

    Returns:
    ARIMAResultsWrapper or None: Fitted ARIMA model with the best MA order based
        on AIC, or None if no candidate order could be fitted.
    """
    best_aic = np.inf
    best_model_res = None

    # Candidate orders are MA(0) through MA(max_ma_order), inclusive.
    for ma_order in range(max_ma_order + 1):
        try:
            results = ARIMA(series, order=(0, 0, ma_order)).fit()
            if results.aic < best_aic:
                best_aic = results.aic
                best_model_res = results
        except Exception as exc:
            # Report the failure instead of hiding it, then try the next order.
            warnings.warn(
                f"MA({ma_order}) fit failed and was skipped: {exc!r}",
                RuntimeWarning,
            )
            continue
    return best_model_res

In [None]:
def fit_sarimax_covariate(series, covariate, max_ma_order=5):
    """
    Fits SARIMAX models to excess returns with unsmoothed aggregate returns as a covariate,
    selecting the best MA order based on AIC.

    Every order from MA(0) up to MA(max_ma_order) is fitted as
    SARIMAX(0, 0, q) with `covariate` as the exogenous regressor and with the
    stationarity and invertibility constraints disabled. The fit with the
    lowest AIC is kept. An order whose fit raises an ordinary exception is
    skipped with a RuntimeWarning; KeyboardInterrupt and SystemExit are not
    intercepted.

    Parameters:
    series (pd.Series): Dependent time series data (excess returns).
    covariate (pd.Series): Covariate variable (unsmoothed aggregate returns).
    max_ma_order (int): Highest MA order to try (inclusive). Defaults to 5,
        i.e. one trading week of daily lags.

    Returns:
    SARIMAXResultsWrapper or None: Fitted SARIMAX model with the best MA order
        based on AIC, or None if no candidate order could be fitted.
    """
    best_aic = np.inf
    best_model_res = None

    # Candidate orders are MA(0) through MA(max_ma_order), inclusive.
    for ma_order in range(max_ma_order + 1):
        try:
            model = SARIMAX(
                series,
                exog=covariate,
                order=(0, 0, ma_order),
                enforce_stationarity=False,
                enforce_invertibility=False,
            )
            results = model.fit()
            if results.aic < best_aic:
                best_aic = results.aic
                best_model_res = results
        except Exception as exc:
            # Report the failure instead of hiding it, then try the next order.
            warnings.warn(
                f"SARIMAX MA({ma_order}) fit failed and was skipped: {exc!r}",
                RuntimeWarning,
            )
            continue
    return best_model_res

In [None]:
def three_step_unsmooth(df):
    """
    Function to apply a 3-step MA Unsmoothing Method on a merged DataFrame.

    All intermediate columns are built on a private copy, so the DataFrame
    passed in is not modified.

    Parameters:
    df (DataFrame): DataFrame containing columns 'Obs_Return_x' refers to the observed individual return
                             'Obs_Return_y' refers to the observed aggregate return
                             'Date' as mutual index of two return series.

    Returns:
    DataFrame: single-column frame 'true_unsmoothed_return' holding the final unsmoothed
               true returns of the individual return after 3-step MA Unsmoothing Method

    Raises:
    ValueError: if no model could be fitted in one of the steps.
    """
    # Copy first: the input is typically a column slice of a merged frame, and
    # adding columns to it would both alter the caller's data and trigger
    # pandas' SettingWithCopyWarning.
    work = df.copy()

    # Step 1: Unsmooth observed aggregate returns
    aggregate_results = fit_ma_model(work['Obs_Return_y'])
    work['unsmoothed_aggregate_return'] = _unsmoothed_from_fit(
        aggregate_results, 'aggregate returns'
    )

    # Step 2: Unsmooth observed individual returns
    # NOTE(review): this series is not used in the final combination below;
    # it is kept so the procedure matches the original notebook step by step.
    individual_results = fit_ma_model(work['Obs_Return_x'])
    work['unsmoothed_individual_return'] = _unsmoothed_from_fit(
        individual_results, 'individual returns'
    )

    # Step 3: Unsmooth excess returns (individual minus aggregate), using the
    # unsmoothed aggregate return from step 1 as the covariate
    work['excess_return'] = work['Obs_Return_x'] - work['Obs_Return_y']
    excess_results = fit_sarimax_covariate(
        work['excess_return'], work['unsmoothed_aggregate_return']
    )
    work['unsmoothed_excess_return'] = _unsmoothed_from_fit(
        excess_results, 'excess returns'
    )

    # Step 4: Combine unsmoothed returns to get final unsmoothed true returns of the individual return
    work['true_unsmoothed_return'] = (
        work['unsmoothed_aggregate_return'] + work['unsmoothed_excess_return']
    )

    # Only keep essential columns
    return work[['true_unsmoothed_return']]


def _unsmoothed_from_fit(results, label):
    """Return the fit's residuals plus its 'const' parameter (0 when the model has none)."""
    if results is None:
        raise ValueError(f"No model could be fitted for the {label}; cannot unsmooth.")
    return results.resid + results.params.get('const', 0)

In [None]:
# Apply the 3-step unsmoothing once per ETF proxy (LIT, LITP, ILIT).
LTCB_LIT_unsmoothed, LTCB_LITP_unsmoothed, LTCB_ILIT_unsmoothed = [
    three_step_unsmooth(merged_returns)
    for merged_returns in (LTCB_LIT, LTCB_LITP, LTCB_ILIT)
]

In [None]:
# Last five rows of each unsmoothed series, one table after another.
print(
    LTCB_LIT_unsmoothed.tail(),
    LTCB_LITP_unsmoothed.tail(),
    LTCB_ILIT_unsmoothed.tail(),
    sep='\n',
)

            true_unsmoothed_return
Date                              
2024-04-03                0.012817
2024-04-08                0.037495
2024-04-09                0.002938
2024-04-10               -0.020379
2024-04-11               -0.006540
            true_unsmoothed_return
Date                              
2024-04-03                0.023130
2024-04-08                0.063860
2024-04-09                0.000175
2024-04-10               -0.006738
2024-04-11               -0.004863
            true_unsmoothed_return
Date                              
2024-04-03                0.026356
2024-04-08                0.027744
2024-04-09                0.028011
2024-04-10               -0.023662
2024-04-11                0.017213


In [None]:
# First five rows of each unsmoothed series, one table after another.
print(
    LTCB_LIT_unsmoothed.head(),
    LTCB_LITP_unsmoothed.head(),
    LTCB_ILIT_unsmoothed.head(),
    sep='\n',
)

            true_unsmoothed_return
Date                              
2017-05-10            4.848006e-03
2017-05-11            2.770289e-03
2017-05-12            1.163031e-15
2017-05-15            1.019071e-02
2017-05-16            4.749067e-03
            true_unsmoothed_return
Date                              
2023-02-03               -0.038187
2023-02-06               -0.031710
2023-02-07                0.014460
2023-02-08               -0.012423
2023-02-09               -0.017818
            true_unsmoothed_return
Date                              
2023-06-26                0.003174
2023-06-27                0.004093
2023-06-28               -0.008143
2023-06-29                0.003353
2023-06-30                0.017352


In [None]:
# For each ETF proxy in turn: write the unsmoothed series to a CSV file
# (index and header included), then pass that file to Colab's files.download.
for csv_name, unsmoothed_frame in (
    ('LTCB99_3Step_trueMA_byLIT.csv', LTCB_LIT_unsmoothed),
    ('LTCB99_3Step_trueMA_byLITP.csv', LTCB_LITP_unsmoothed),
    ('LTCB99_3Step_trueMA_byILIT.csv', LTCB_ILIT_unsmoothed),
):
    unsmoothed_frame.to_csv(csv_name, index=True, header=True)
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>