In [3]:
import fastparquet 
import pandas as pd, os
import numpy as np

In [22]:
# Location of the homework parquet file on the author's machine.
input_file = 'C:/M4 courses/Quantitative Trading/assignment2/hw2_mfin7037_data.parquet'
# Read the panel and keep only rows dated 1994-01-01 or later; the author
# notes that the earlier part of the sample looks unreliable.
df = pd.read_parquet(input_file).query("date>='1994-01-01'")
# Reminder: the data still has to be filtered before it is analysed.
df.head()

Unnamed: 0,permno,date,ret,intraday_ret_month,overnight_ret_month,mcap_lag1,prc_lag1,mom_intraday,mom,mom_overnight,mcap_bin
148,10094.0,1994-01-31,0.181818,-0.234356,0.54356,42515.0,5.5,-0.30743,-0.087011,0.229986,4.0
149,10094.0,1994-02-28,-0.115385,0.222005,-0.276094,50245.0,6.5,-0.923227,-0.646627,0.286166,5.0
150,10094.0,1994-03-31,-0.152174,0.187957,-0.286318,44447.5,5.75,-1.428256,-0.712195,0.725624,4.0
151,10094.0,1994-04-29,-0.076923,0.236076,-0.25322,37756.875,4.875,-1.398879,-0.834798,0.564079,4.0
152,10094.0,1994-05-31,0.027778,-0.138717,0.193312,34852.5,4.5,-0.764079,-0.640503,0.123572,4.0


## Q1

In [25]:
# Summary statistics (count, mean, std, quantiles) of the three momentum signals.
description = df.loc[:, ['mom_intraday', 'mom_overnight', 'mom']].describe()
print("Descriptive statistical results：", description, sep="\n")

Descriptive statistical results：
       mom_intraday  mom_overnight           mom
count  1.681488e+06   1.681001e+06  1.671976e+06
mean  -5.036289e-02   7.490227e-03 -2.178160e-02
std    5.786294e-01   4.867831e-01  3.924463e-01
min   -1.989288e+01  -3.039382e+01 -1.053870e+01
25%   -2.152428e-01  -1.336102e-01 -1.719058e-01
50%    9.127390e-03   1.398969e-02  1.481509e-02
75%    1.916950e-01   1.588196e-01  1.723000e-01
max    3.006532e+01   1.781343e+01  4.218667e+00


In [27]:
# Pairwise correlations between the three momentum signals.
correlation_matrix = df.loc[:, ['mom_intraday', 'mom_overnight', 'mom']].corr()
print(
    "The correlation of intraday, overnight momentum and regular momentum：\n",
    correlation_matrix,
    sep="\n",
)

The correlation of intraday, overnight momentum and regular momentum：

               mom_intraday  mom_overnight       mom
mom_intraday       1.000000      -0.691953  0.591998
mom_overnight     -0.691953       1.000000  0.105443
mom                0.591998       0.105443  1.000000


## Q2

In [30]:
import pandas as pd
from scipy.stats import ttest_1samp

def process_data(dataset):
    """Build per-bin portfolio returns and the long-short (bin 10 minus bin 1) series.

    Rows with ``prc_lag1 < 5`` are dropped. The remaining rows are grouped by
    ``date`` and ``mcap_bin``, and for each of the three return columns
    (``ret``, ``intraday_ret_month``, ``overnight_ret_month``) an equal-weighted
    mean and a ``mcap_lag1``-weighted mean are computed.

    NOTE(review): portfolios are formed on the ``mcap_bin`` column, while the
    written answer further down the notebook interprets the bins as momentum
    deciles -- confirm that ``mcap_bin`` is the intended sorting variable.

    Parameters:
        dataset (pd.DataFrame): must contain the columns ``date``, ``mcap_bin``,
            ``prc_lag1``, ``mcap_lag1``, ``ret``, ``intraday_ret_month`` and
            ``overnight_ret_month``.

    Returns:
        dict: ``{'bins': DataFrame, 'pnl': DataFrame}``.
            - ``'bins'`` has columns ``date, bin, ew, vw, ew_intraday,
              vw_intraday, ew_overnight, vw_overnight`` (one row per date/bin).
            - ``'pnl'`` has columns ``date, ew, vw, ew_intraday, vw_intraday,
              ew_overnight, vw_overnight, bin`` (one row per date, ``bin`` = 11)
              holding bin 10 minus bin 1. If either extreme bin is absent the
              spread columns are NaN.
    """
    # (output-column suffix, source return column) for each return measure.
    measures = (
        ('', 'ret'),
        ('_intraday', 'intraday_ret_month'),
        ('_overnight', 'overnight_ret_month'),
    )
    out_cols = ['ew', 'vw', 'ew_intraday', 'vw_intraday', 'ew_overnight', 'vw_overnight']
    keys = ['date', 'mcap_bin']

    # kick out data with prc_lag1 < 5
    filtered = dataset[dataset['prc_lag1'] >= 5]

    # Pre-compute, row by row, the pieces needed for the weighted means so the
    # aggregation is a plain vectorised groupby instead of a Python-level apply.
    weights = filtered['mcap_lag1']
    work = filtered[keys].copy()
    for suffix, ret_col in measures:
        rets = filtered[ret_col]
        # A row contributes to the weighted mean only when BOTH its return and
        # its weight are present; otherwise the weight of a missing return
        # would inflate the denominator and bias the average toward zero.
        usable = rets.notna() & weights.notna()
        work['r' + suffix] = rets
        work['w' + suffix] = weights.where(usable)
        work['wr' + suffix] = (rets * weights).where(usable)

    stat_cols = [c for c in work.columns if c not in keys]
    grouped = work.groupby(keys)[stat_cols]
    sums = grouped.sum()
    means = grouped.mean()

    bins_df = pd.DataFrame(index=means.index)
    for suffix, _ in measures:
        bins_df['ew' + suffix] = means['r' + suffix]
        # 0/0 (no usable observation in the group) evaluates to NaN here.
        bins_df['vw' + suffix] = sums['wr' + suffix] / sums['w' + suffix]
    bins_df = bins_df.reset_index().rename(columns={'mcap_bin': 'bin'})

    # construct Long-Short portfolio
    pivot_df = bins_df.pivot(index='date', columns='bin', values=out_cols)

    pnl_data = {}
    for col_type in out_cols:
        try:
            pnl_data[col_type] = pivot_df[(col_type, 10)] - pivot_df[(col_type, 1)]
        except KeyError:
            # An extreme bin is missing: report the spread as undefined (NaN)
            # per date rather than fabricating a zero return. Using a Series
            # also keeps the DataFrame constructor below from failing on an
            # all-scalar dict.
            pnl_data[col_type] = pd.Series(float('nan'), index=pivot_df.index)

    pnl_df = pd.DataFrame(pnl_data, index=pivot_df.index).rename_axis('date').reset_index()
    pnl_df['bin'] = 11

    return {
        'bins': bins_df[['date', 'bin'] + out_cols],
        'pnl': pnl_df[['date'] + out_cols + ['bin']]
    }

def produce_table(input_data, subsetting=lambda df: df):
    """
    Summarise portfolio returns as a table of means and t-statistics.

    The 'bins' and 'pnl' frames are stacked (the pnl rows are tagged as
    portfolio 11), optionally subset, and then, for every portfolio and every
    return measure, the mean (x100, rounded to 3 decimals) and the one-sample
    t-statistic against zero are computed.

    Parameters:
        input_data (dict): Dictionary with keys:
            - 'bins': a DataFrame with columns ['date','bin','ew','vw',
                      'ew_intraday','vw_intraday','ew_overnight','vw_overnight']
            - 'pnl': a DataFrame with columns ['date','ew','vw',
                      'ew_intraday','vw_intraday','ew_overnight','vw_overnight']
                      (used as the long-short portfolio)
        subsetting (function): A function applied to the stacked DataFrame
                               before aggregation. Defaults to the identity.

    Returns:
        pd.DataFrame: indexed by 'index_label'. Each return measure contributes
                      two consecutive rows: "<MEASURE>" with the means and
                      "<MEASURE> (t-stat)" with the t-statistics formatted as
                      "(x.xxx)" strings. Columns are the portfolios among 1..10
                      that are present, plus "10-1" for the long-short
                      portfolio. Missing cells are filled with ''.
    """
    # List of return measure columns
    cols = ['ew', 'vw', 'ew_intraday', 'vw_intraday', 'ew_overnight', 'vw_overnight']

    # Extract the required columns from each DataFrame.
    bins_df = input_data['bins'][['date', 'bin'] + cols].copy()
    pnl_df = input_data['pnl'][['date'] + cols].copy()
    # Designate the pnl data as portfolio "11" (renamed to "10-1" below)
    pnl_df['bin'] = 11

    # Combine the two datasets and apply any subsetting
    combined = pd.concat([bins_df, pnl_df], ignore_index=True)
    combined = subsetting(combined)

    # Mean multiplied by 100 and rounded to 3 decimals
    def meanna(x):
        return round(x.mean() * 100, 3)

    # One-sample t-statistic (null: mean=0), rounded to 3 decimals and wrapped
    # in parentheses; None when there is no non-missing observation.
    def ttesting(x):
        x = x.dropna()
        if len(x) == 0:
            return None
        stat, _ = ttest_1samp(x, popmean=0)
        return f"({round(stat, 3)})"

    # Aggregate by portfolio and reshape to one row per measure, one column per
    # portfolio; `order` records whether the row is a mean (1) or t-stat (2).
    def summarise(stat_func, order):
        per_bin = combined.groupby('bin')[cols].agg(stat_func).reset_index()
        long_form = per_bin.melt(id_vars='bin', var_name='variable', value_name='value')
        wide = long_form.pivot(index='variable', columns='bin', values='value').reset_index()
        wide['order'] = order
        return wide

    # Stack means and t-stats, then interleave them measure by measure.
    table = pd.concat([summarise(meanna, 1), summarise(ttesting, 2)], ignore_index=True)
    table = table.sort_values(by=['variable', 'order'])

    def create_index_label(row):
        base_name = row['variable'].upper().replace('_', ' ')
        return f"{base_name}" if row['order'] == 1 else f"{base_name} (t-stat)"

    table['index_label'] = table.apply(create_index_label, axis=1)
    table = table.drop(columns=['variable', 'order']).set_index('index_label')

    # Rename portfolio 11 to "10-1" (i.e. the long-short portfolio)
    if 11 in table.columns:
        table = table.rename(columns={11: '10-1'})
    # Keep only portfolios 1 through 10 and the long-short column.
    valid_cols = [c for c in table.columns if (c in range(1, 11)) or (c == '10-1')]
    return table[valid_cols].fillna('')

# Build the portfolio returns and summarise them.
processed_data = process_data(df)
result_table = produce_table(processed_data)

# Fix the column order: portfolios 1..10 followed by the long-short column.
result_table = result_table[[*range(1, 11), '10-1']]

# Show whole-number float labels (e.g. 1.0) as ints; leave every other label alone.
def _as_int_label(label):
    if isinstance(label, float) and label.is_integer():
        return int(label)
    return label

result_table.columns = [_as_int_label(label) for label in result_table.columns]

print(result_table)

                              1         2         3        4        5  \
index_label                                                             
EW                        0.648      0.69     0.828    0.873    0.913   
EW (t-stat)             (2.542)   (3.367)   (3.535)  (3.145)  (2.887)   
EW INTRADAY               1.956     1.801     1.608    1.001    0.584   
EW INTRADAY (t-stat)    (9.763)  (11.125)   (8.679)  (4.291)  (2.192)   
EW OVERNIGHT             -1.509    -0.691    -0.111     0.47    0.815   
EW OVERNIGHT (t-stat)  (-6.724)  (-5.462)  (-0.834)  (3.383)  (5.214)   
VW                        0.669     0.689     0.818    0.866    0.899   
VW (t-stat)             (2.793)   (3.353)   (3.478)  (3.115)  (2.847)   
VW INTRADAY               1.936     1.774     1.566     0.96    0.576   
VW INTRADAY (t-stat)   (10.084)  (10.813)   (8.316)  (4.083)  (2.154)   
VW OVERNIGHT             -1.449    -0.633    -0.085    0.503    0.816   
VW OVERNIGHT (t-stat)  (-6.612)  (-4.945)  (-0.645)

### Answer:

Based on the provided tables, we can analyze the predictive relationships as follows:

### 1. **Intraday Momentum Predictiveness**
**a. Intraday Future Returns**  
- The "EW INTRADAY" and "VW INTRADAY" rows show that portfolios with higher intraday momentum (e.g., portfolio 10) earn **much lower intraday returns** than low-momentum portfolios: portfolio 10 earns only 0.23 (EW) and 0.209 (VW), with t-stats of about 1.13 in absolute value (statistically insignificant), versus 1.956 (EW) and 1.936 (VW) for portfolio 1.  
- The "Long-Short (10-1)" portfolio also shows strongly **negative intraday returns** (-1.725 for EW, -1.727 for VW) with highly significant t-stats (-7.039 and -7.254).  
- **Conclusion**: Intraday momentum **inversely predicts intraday future returns** (higher momentum → lower future intraday returns).

**b. Overnight Returns**  
- The "EW OVERNIGHT" and "VW OVERNIGHT" rows show **positive returns** for high intraday momentum portfolios (e.g., portfolio 10: 0.761 for EW, 0.732 for VW) with significant t-stats (5.121 and 5.051).  
- The Long-Short portfolio further confirms this with **strongly positive overnight returns** (2.271 for EW, 2.18 for VW) and high t-stats (9.662 and 9.172).  
- **Conclusion**: Intraday momentum **positively predicts overnight future returns**.

**c. Total Returns**  
- The "EW" and "VW" rows (total returns) show **mixed results**:  
  - For high intraday momentum portfolios (10), total returns are modest in size (0.875 for EW, 0.848 for VW) but statistically significant (t-stats: 3.497 and 3.620).  
  - The correlation matrix shows intraday momentum (`mom_intraday`) is **positively correlated with total momentum** (`mom`: 0.59).  
- **Conclusion**: Intraday momentum has **weak predictive power for total returns**, likely driven by its offsetting effects on intraday vs. overnight returns.

---

### 2. **Overnight Momentum Predictiveness**
**a. Intraday Future Returns**  
- The "EW INTRADAY" and "VW INTRADAY" rows show **low, statistically insignificant returns** for the top portfolio sorted by overnight momentum (portfolio 10: 0.23 for EW, 0.209 for VW).  
- The Long-Short portfolio ("10-1") reinforces this with **negative intraday returns** (-1.725 for EW, -1.727 for VW).  
- **Conclusion**: Overnight momentum **does not predict intraday future returns** (no clear relationship).

**b. Overnight Returns**  
- The "EW OVERNIGHT" and "VW OVERNIGHT" rows show **strong positive returns** for high overnight momentum portfolios (e.g., portfolio 10: 0.761 for EW, 0.732 for VW) with highly significant t-stats (5.121 and 5.051).  
- The Long-Short portfolio confirms this with **strongly positive returns** (2.271 for EW, 2.18 for VW) and significant t-stats.  
- **Conclusion**: Overnight momentum **strongly predicts future overnight returns**.

**c. Total Returns**  
- The "EW" and "VW" rows (total returns) show **moderate positive returns** for high overnight momentum portfolios (portfolio 10: 0.875 for EW, 0.848 for VW) with borderline significance.  
- The correlation matrix shows overnight momentum (`mom_overnight`) has **weak correlation with total momentum** (`mom`: 0.105).  
- **Conclusion**: Overnight momentum has **limited predictive power for total returns**, primarily driven by its overnight component.

---

### **Key Summary**
| Momentum Type       | Predicts Intraday Returns? | Predicts Overnight Returns? | Predicts Total Returns? |
|---------------------|----------------------------|------------------------------|--------------------------|
| **Intraday**        | ❌ Inverse relationship     | ✅ Strong positive           | ⚠️ Weak/offsetting       |
| **Overnight**       | ❌ No relationship          | ✅ Strong positive           | ⚠️ Limited               |

### **Interpretation**  
- **Intraday momentum** acts as a contrarian signal for intraday returns but strongly predicts overnight returns.  
- **Overnight momentum** is a persistent factor for overnight returns but has no meaningful link to intraday performance.  
- Both effects partially offset each other in total returns, explaining their weaker predictive power.  
- The negative correlation between intraday and overnight momentum (-0.69) further highlights their divergent roles in return prediction.