# Time series - Li2CO3 zero-return calculations and statistics

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm
from tabulate import tabulate

# P1. Data Preprocess

In [None]:
# Source file covers 2017-05-10 through 2024-04-19.
li2co3 = pd.read_csv(r'/content/Lithium Carbonate 99%Min China Spot Historical Data (5).csv')
li2co3['Date'] = pd.to_datetime(li2co3['Date'])
# Reorder the rows chronologically (earliest first), index them by date
# and keep only the price column.
li2co3 = li2co3.sort_values('Date').set_index('Date')
li2co3 = li2co3["Price"].to_frame()

# Prices are handled as text here: remove the "," separators, then convert to float.
li2co3['Price'] = li2co3['Price'].str.replace(",", "").astype(float)
na_count = li2co3['Price'].isna().sum()
print("Number of missing values:", na_count)
if na_count > 0:
    li2co3 = li2co3.dropna(subset=['Price'])

# Daily log return = first difference of the log price; the first row has no
# previous price, so its NaN return is removed.
li2co3['log_ret'] = np.log(li2co3['Price']).diff()
li2co3 = li2co3.dropna(subset=['log_ret'])

FileNotFoundError: [Errno 2] No such file or directory: '/content/Lithium Carbonate 99%Min China Spot Historical Data (5).csv'

In [None]:
# Preview the first ten rows of the prepared daily frame.
li2co3.head(10)

In [None]:
# Preview the last ten rows of the prepared daily frame.
li2co3.tail(10)

# P2. Weekly log return series and zero count series

In [None]:
# One independent, empty frame per weekday anchor; the columns are added in a later cell.
Fridays, Thursdays, Wednesdays, Tuesdays, Mondays = (pd.DataFrame() for _ in range(5))

In [None]:
def weekly_returns(data, chosen_day):
    """
    Aggregate daily log returns into weekly log returns.

    Parameters:
    - data: DataFrame with a DatetimeIndex and a daily log return column 'log_ret'.
    - chosen_day: weekly anchored frequency alias, one of
      'W-MON', 'W-TUE', 'W-WED', 'W-THU', 'W-FRI'.

    Returns:
    - Series of weekly log returns (sum of the daily log returns in each week).
      Weeks that contain no daily observation are omitted.
    """
    # With the default min_count=0, sum() returns 0.0 for an empty week, which
    # would pass through dropna() and look like a genuine zero-return week.
    # min_count=1 makes empty weeks NaN so that dropna() actually removes them.
    weekly_log_return = (
        data.groupby(pd.Grouper(freq=chosen_day))['log_ret'].sum(min_count=1)
    )
    return weekly_log_return.dropna()

In [None]:
def count_zero(df, chosen_day):
    """
    Count zero-return days over trailing 22-day and 5-day windows and
    report those counts on one chosen weekday.

    Parameters:
    - df: DataFrame with a DatetimeIndex and a daily log return column 'log_ret'.
      The input is not modified.
    - chosen_day: English weekday name, one of
      'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'.

    Returns:
    - DataFrame indexed by the dates that fall on `chosen_day`, with integer
      columns 'zero_count_22' and 'zero_count_5'.
    """
    data = df.copy()
    # Zero dummy: 1 if 'log_ret' is exactly zero, 0 otherwise
    data['zero_dummy'] = (data['log_ret'] == 0).astype(int)
    # Rolling sums over the last 22 and the last 5 rows (current row included)
    data['zero_count_22'] = data['zero_dummy'].rolling(window=22).sum()
    data['zero_count_5'] = data['zero_dummy'].rolling(window=5).sum()
    # Rows without a full 22-row window (or with any other NaN) are discarded
    data = data.dropna()

    # Select the chosen weekday and the two count columns in a single .loc call,
    # then cast; this returns a fresh frame instead of assigning into a slice,
    # which avoids pandas' SettingWithCopyWarning.
    on_chosen_day = data.index.day_name() == chosen_day
    return data.loc[on_chosen_day, ['zero_count_22', 'zero_count_5']].astype(int)

In [None]:
# Fill each weekday frame with the weekly log return and the 22-day / 5-day
# zero-return counts observed on that weekday.
# count_zero() is evaluated once per weekday and both of its columns are reused.
for _frame, _anchor, _weekday in (
    (Fridays, 'W-FRI', 'Friday'),            # Friday to Friday
    (Thursdays, 'W-THU', 'Thursday'),        # Thursday to Thursday
    (Wednesdays, 'W-WED', 'Wednesday'),      # Wednesday to Wednesday
    (Tuesdays, 'W-TUE', 'Tuesday'),          # Tuesday to Tuesday
    (Mondays, 'W-MON', 'Monday'),            # Monday to Monday
):
    _zero_counts = count_zero(li2co3, _weekday)
    _frame['Log_Return'] = weekly_returns(li2co3, _anchor)
    _frame['Zero_Count_22'] = _zero_counts['zero_count_22']
    _frame['Zero_Count_5'] = _zero_counts['zero_count_5']
    # Column assignment aligns on the date index, so weeks with no count for
    # the chosen weekday hold NaN and are removed here.
    _frame.dropna(inplace=True)

In [None]:
# Show the ten most recent weekly rows for every weekday anchor.
print("Friday to Friday")
print(Fridays.tail(10))
print("Thursday to Thursday")
print(Thursdays.tail(10))
print("Wednesday to Wednesday")
print(Wednesdays.tail(10))
print("Tuesday to Tuesday")
print(Tuesdays.tail(10))
print("Monday to Monday")
print(Mondays.tail(10))

# P3. Summary statistics

In [None]:
# Mean, minimum, maximum and standard deviation of every column of each weekly frame.
summary_statistics_Fridays = Fridays.describe().loc[['mean', 'min', 'max', 'std']]
summary_statistics_Thursdays = Thursdays.describe().loc[['mean', 'min', 'max', 'std']]
summary_statistics_Wednesdays = Wednesdays.describe().loc[['mean', 'min', 'max', 'std']]
summary_statistics_Tuesdays = Tuesdays.describe().loc[['mean', 'min', 'max', 'std']]
summary_statistics_Mondays = Mondays.describe().loc[['mean', 'min', 'max', 'std']]

print("Friday to Friday")
print(summary_statistics_Fridays)
print("Thursday to Thursday")
print(summary_statistics_Thursdays)
print("Wednesday to Wednesday")
print(summary_statistics_Wednesdays)
print("Tuesday to Tuesday")
print(summary_statistics_Tuesdays)
print("Monday to Monday")
print(summary_statistics_Mondays)

# P4. Autocorrelogram and partial autocorrelogram for daily return

In [None]:
# Bar chart of the daily log returns with a thin reference line at zero
fig, axs = plt.subplots(1, 1, figsize=(10, 5))
axs.bar(li2co3.index, li2co3['log_ret'], width=2.5)
axs.axhline(0, linewidth=0.8, color='k')
axs.set_xlabel('Date')
axs.set_ylabel('Log Returns')
axs.set_title('Li2CO3 99% Daily Log Return')
plt.show()

# Correlograms of the daily return for lags 0-20 (alpha=0.1 gives 90% confidence bands)
acf = plot_acf(
    li2co3['log_ret'],
    lags=20,
    alpha=0.1,
    title='Li2CO3 99% Autocorrelation for Daily Return',
)
pacf = plot_pacf(
    li2co3['log_ret'],
    lags=20,
    alpha=0.1,
    method='ywm',
    title='Li2CO3 99% Partial Autocorrelation for Daily Return',
)

# P5. Autocorrelogram and partial autocorrelogram for weekly return (use Fridays as example)

In [None]:
# Bar chart of the weekly log returns (Friday-to-Friday series as the example)
fig, axs = plt.subplots(1, 1, figsize=(10, 5))
axs.bar(Fridays.index, Fridays['Log_Return'], width=2.5)
axs.axhline(0, linewidth=0.8, color='k')
axs.set_xlabel('Date')
axs.set_ylabel('Log Returns')
axs.set_title('Li2CO3 99% Weekly Log Return')
plt.show()

# Correlograms of the weekly return for lags 0-20 (alpha=0.1 gives 90% confidence bands)
acf = plot_acf(
    Fridays['Log_Return'],
    lags=20,
    alpha=0.1,
    title='Li2CO3 99% Autocorrelation for Weekly Return',
)
pacf = plot_pacf(
    Fridays['Log_Return'],
    lags=20,
    alpha=0.1,
    method='ywm',
    title='Li2CO3 99% Partial Autocorrelation for Weekly Return',
)

# P6. Two AR(2) models for daily return

In [None]:
def estimate_ar2_model(df, lags):
    """
    Fit a plain AR(2) regression on the daily log returns.

    The column 'log_ret' is regressed by OLS on an intercept and on its own
    first and second lags; standard errors use the HAC covariance estimator.

    Parameters:
    - df: DataFrame containing the log return series under 'log_ret'.
      The input is left untouched.
    - lags: value passed as 'maxlags' to the HAC covariance estimator.

    Returns:
    - model: fitted statsmodels OLS results object.
    """
    returns = df['log_ret']
    # assign() works on a copy; rows containing any NaN (including the first
    # two rows, which have no lagged values) are discarded.
    frame = df.assign(
        log_ret_lag1=returns.shift(1),
        log_ret_lag2=returns.shift(2),
    ).dropna()

    # Regressors: intercept (alpha) plus the two lagged returns
    regressors = sm.add_constant(frame[['log_ret_lag1', 'log_ret_lag2']])
    target = frame['log_ret']

    return sm.OLS(target, regressors).fit(cov_type='HAC', cov_kwds={'maxlags': lags})

In [None]:
# lags=5 according to the daily pacf
daily_return_ar2_model = estimate_ar2_model(li2co3, lags=5)

In [None]:
# Regression table of the plain daily AR(2) model
print(daily_return_ar2_model.summary())

In [None]:
def estimate_ar2_model_with_zero_dummy(df, lags):
    """
    Fit an AR(2) regression on daily log returns extended with a zero-return dummy.

    Regressors: intercept, the zero dummy, the first and second lag of
    'log_ret', and each lag multiplied by the zero dummy. Standard errors
    use the HAC covariance estimator.

    Parameters:
    - df: DataFrame containing the log returns under 'log_ret'.
      The input is left untouched.
    - lags: value passed as 'maxlags' to the HAC covariance estimator.

    Returns:
    - model: fitted statsmodels OLS results object.
    """
    returns = df['log_ret']
    # NOTE(review): the dummy is built from the same-day return that is also the
    # dependent variable, so the dependent variable is exactly 0 whenever the
    # dummy is 1 - confirm this is the intended specification.
    is_zero = (returns == 0).astype(int)
    lag1 = returns.shift(1)
    lag2 = returns.shift(2)

    # assign() works on a copy; rows containing any NaN (including the first
    # two rows, which have no lagged values) are discarded.
    frame = df.assign(
        zero_dummy=is_zero,
        log_ret_lag1=lag1,
        log_ret_lag2=lag2,
        log_ret_lag1_zero=lag1 * is_zero,
        log_ret_lag2_zero=lag2 * is_zero,
    ).dropna()

    regressor_names = [
        'zero_dummy',
        'log_ret_lag1',
        'log_ret_lag2',
        'log_ret_lag1_zero',
        'log_ret_lag2_zero',
    ]
    regressors = sm.add_constant(frame[regressor_names])
    target = frame['log_ret']

    return sm.OLS(target, regressors).fit(cov_type='HAC', cov_kwds={'maxlags': lags})

In [None]:
# lags=5 according to the daily pacf
daily_return_ar2_model_with_zero_dummy = estimate_ar2_model_with_zero_dummy(li2co3, lags = 5)

In [None]:
# Regression table of the daily AR(2) model with the zero dummy and its interactions
print(daily_return_ar2_model_with_zero_dummy.summary())

# **New** P7. Four AR(2) models for weekly return (use Fridays as example)

## Model1: Standard AR(2) model

In [None]:
def estimate_ar2_model(df, lags, return_col=None):
    """
    Estimate a basic AR(2) model for returns.

    The return column is regressed by OLS on an intercept and on its own
    first and second lags; standard errors use the HAC covariance estimator.

    Parameters:
    - df: DataFrame containing the log return series. The input is not modified.
    - lags: Number of lags to use for HAC standard errors.
    - return_col: Name of the return column. When omitted, 'Log_Return'
      (weekly frames) is used if present, otherwise 'log_ret' (daily frame).

    Returns:
    - model: OLS regression results containing the fitted model.

    Raises:
    - KeyError: if the return column is not present in `df`.
    """
    if return_col is None:
        # This definition replaces the earlier daily helper of the same name.
        # Falling back to 'log_ret' keeps the daily call
        # estimate_ar2_model(li2co3, lags=5) working when that cell is re-run
        # after this one.
        return_col = 'Log_Return' if 'Log_Return' in df.columns else 'log_ret'

    data = df.copy()
    data['log_ret_lag1'] = data[return_col].shift(1)
    data['log_ret_lag2'] = data[return_col].shift(2)

    # Rows with any NaN (including the first two, which have no lags) are dropped
    data.dropna(inplace=True)

    # Define the variables and add a constant term for the intercept (alpha)
    X = sm.add_constant(data[['log_ret_lag1', 'log_ret_lag2']])
    Y = data[return_col]

    # Fit the AR(2) model
    return sm.OLS(Y, X).fit(cov_type='HAC', cov_kwds={'maxlags': lags})

In [None]:
# lags=4 according to the weekly pacf
weekly_return_ar2_model = estimate_ar2_model(Fridays, lags=4)

print(weekly_return_ar2_model.summary())

## Model2: AR(2) model using interaction with the weekly zero count series

In [None]:
def estimate_ar2_model_with_weekly_zero(df, lags):
    """
    Fit an AR(2) regression on weekly log returns in which both lags are
    interacted with the weekly (5-day) zero count.

    Regressors: intercept, 'Zero_Count_5', the first and second lag of
    'Log_Return', and each lag multiplied by 'Zero_Count_5'. Standard errors
    use the HAC covariance estimator.

    Parameters:
    - df: DataFrame containing the weekly log returns under 'Log_Return'
      and the weekly zero count series 'Zero_Count_5'. The input is left untouched.
    - lags: value passed as 'maxlags' to the HAC covariance estimator.

    Returns:
    - model: fitted statsmodels OLS results object with HAC standard errors.
    """
    zero_count = df['Zero_Count_5']
    lag1 = df['Log_Return'].shift(1)
    lag2 = df['Log_Return'].shift(2)

    # assign() works on a copy; rows containing any NaN (including the first
    # two rows, which have no lagged values) are discarded.
    frame = df.assign(
        Log_Return_Lag1=lag1,
        Log_Return_Lag2=lag2,
        Log_Return_Lag1_Zero5=lag1 * zero_count,
        Log_Return_Lag2_Zero5=lag2 * zero_count,
    ).dropna()

    regressor_names = [
        'Zero_Count_5',
        'Log_Return_Lag1',
        'Log_Return_Lag2',
        'Log_Return_Lag1_Zero5',
        'Log_Return_Lag2_Zero5',
    ]
    regressors = sm.add_constant(frame[regressor_names])
    target = frame['Log_Return']

    return sm.OLS(target, regressors).fit(cov_type='HAC', cov_kwds={'maxlags': lags})

In [None]:
# lags=4 according to the weekly pacf
ar2_model_with_weekly_zero = estimate_ar2_model_with_weekly_zero(Fridays, lags=4)

print(ar2_model_with_weekly_zero.summary())

## Model3: AR(2) model using interaction with the monthly zero count series

In [None]:
def estimate_ar2_model_with_monthly_zero(df, lags):
    """
    Fit an AR(2) regression on weekly log returns in which both lags are
    interacted with the monthly (22-day) zero count.

    Regressors: intercept, 'Zero_Count_22', the first and second lag of
    'Log_Return', and each lag multiplied by 'Zero_Count_22'. Standard errors
    use the HAC covariance estimator.

    Parameters:
    - df: DataFrame containing the weekly log returns under 'Log_Return'
      and the monthly zero count series 'Zero_Count_22'. The input is left untouched.
    - lags: value passed as 'maxlags' to the HAC covariance estimator.

    Returns:
    - model: fitted statsmodels OLS results object with HAC standard errors.
    """
    zero_count = df['Zero_Count_22']
    lag1 = df['Log_Return'].shift(1)
    lag2 = df['Log_Return'].shift(2)

    # assign() works on a copy; rows containing any NaN (including the first
    # two rows, which have no lagged values) are discarded.
    frame = df.assign(
        Log_Return_Lag1=lag1,
        Log_Return_Lag2=lag2,
        Log_Return_Lag1_Zero22=lag1 * zero_count,
        Log_Return_Lag2_Zero22=lag2 * zero_count,
    ).dropna()

    regressor_names = [
        'Zero_Count_22',
        'Log_Return_Lag1',
        'Log_Return_Lag2',
        'Log_Return_Lag1_Zero22',
        'Log_Return_Lag2_Zero22',
    ]
    regressors = sm.add_constant(frame[regressor_names])
    target = frame['Log_Return']

    return sm.OLS(target, regressors).fit(cov_type='HAC', cov_kwds={'maxlags': lags})

In [None]:
# lags=4 according to the weekly pacf
ar2_model_with_monthly_zero = estimate_ar2_model_with_monthly_zero(Fridays, lags=4)

print(ar2_model_with_monthly_zero.summary())

## Model4: AR(2) model using **separate** interactions with the short-term and long-term illiquidity variables

In [None]:
def estimate_ar2_model_separate_illiquidity(df, lags):
    """
    Fit an AR(2) regression on weekly log returns in which each lag is
    interacted with a different zero count measure.

    The first lag is multiplied by the 5-day zero count and the second lag by
    the 22-day zero count. Regressors: intercept, 'Zero_Count_5',
    'Zero_Count_22', both lags of 'Log_Return' and the two interaction terms.
    Standard errors use the HAC covariance estimator.

    Parameters:
    - df: DataFrame containing the log returns under 'Log_Return' and the
      two zero count series 'Zero_Count_5' and 'Zero_Count_22'.
      The input is left untouched.
    - lags: value passed as 'maxlags' to the HAC covariance estimator.

    Returns:
    - model: fitted statsmodels OLS results object with HAC standard errors.
    """
    lag1 = df['Log_Return'].shift(1)
    lag2 = df['Log_Return'].shift(2)

    # assign() works on a copy; rows containing any NaN (including the first
    # two rows, which have no lagged values) are discarded.
    frame = df.assign(
        Log_Return_Lag1=lag1,
        Log_Return_Lag2=lag2,
        Log_Return_Lag1_Zero5=lag1 * df['Zero_Count_5'],
        Log_Return_Lag2_Zero22=lag2 * df['Zero_Count_22'],
    ).dropna()

    regressor_names = [
        'Zero_Count_5',
        'Zero_Count_22',
        'Log_Return_Lag1',
        'Log_Return_Lag2',
        'Log_Return_Lag1_Zero5',
        'Log_Return_Lag2_Zero22',
    ]
    regressors = sm.add_constant(frame[regressor_names])
    target = frame['Log_Return']

    return sm.OLS(target, regressors).fit(cov_type='HAC', cov_kwds={'maxlags': lags})

In [None]:
# lags=4 according to the weekly pacf
ar2_model_with_separate_illiquidity = estimate_ar2_model_separate_illiquidity(Fridays, lags=4)

print(ar2_model_with_separate_illiquidity.summary())

## Comparison among the last three models

In [None]:
# The three interaction models being compared, in table order
model1_results, model2_results, model3_results = (
    ar2_model_with_weekly_zero,
    ar2_model_with_monthly_zero,
    ar2_model_with_separate_illiquidity,
)

# One row per model, one column per fit statistic read from the fitted results
summary_stats = pd.DataFrame({
    'Model': ['Model with Weekly Zero', 'Model with Monthly Zero', 'Model with Separate Illiquidity'],
    **{
        column: [getattr(result, attribute)
                 for result in (model1_results, model2_results, model3_results)]
        for column, attribute in (
            ('R-squared', 'rsquared'),
            ('Adj. R-squared', 'rsquared_adj'),
            ('AIC', 'aic'),
            ('BIC', 'bic'),
            ('F-statistic', 'fvalue'),
        )
    },
})

# Render the comparison as a text table
print(tabulate(summary_stats, headers='keys', tablefmt='pretty', showindex=False))

## Interpretation

**R-squared** & **Adj. R-squared**: The proportion of the variance in the dependent variable that is predictable from the independent variables. The higher these values, the better the model explains the variability of the response variable. From the table, the *Model with Separate Illiquidity* has both the highest R-squared and the highest Adjusted R-squared, which means that, among the last three models, it explains the largest proportion of the variance in the weekly log return series.


**AIC** & **BIC**: Both criteria help in model selection where lower values generally indicate a better model. AIC shows the goodness of fit with a penalty for the number of parameters to avoid overfitting, and BIC considers a larger penalty for models with more parameters. The *Model with Separate Illiquidity* has both the lowest AIC and the lowest BIC, which means it is the best model from a complexity-fit trade-off perspective.


**F-statistic**: Indicates the overall significance of the regression AR(2) model. The higher the F-statistic, the more significant the model is. The *Model with Separate Illiquidity* has the highest F-statistic, which means it is statistically the most significant model in terms of the contribution of the explanatory variables used in the model.


## Conclusion
*Model with Separate Illiquidity* comes out ahead in all three respects:
it explains the most variance in the weekly log return; it strikes the best balance between model complexity and fit; and its explanatory variables have the highest statistical significance, which makes it potentially more reliable for predictions or inferences.

In addition, *Model with Weekly Zero* is the second-best model: its
explanatory power and complexity-fit balance are both very close
to those of the *Model with Separate Illiquidity*.


This conclusion holds only for the Friday-to-Friday return series; it might
be different for the other weekdays.