In [4]:
import pandas as pd
import os

def concat_csvs_by_date(file_list, output_file='input.d.csv', cutoff_date='2024-10-28'):
    """Combine several CSV files side by side and save them as one CSV.

    Every file is read with ``pd.read_csv`` and the frames are concatenated
    column-wise. Repeated column names are collapsed to their first
    occurrence, the ``Open``/``Close``/``Low``/``High`` columns are dropped,
    ``EWJ.Percent.Price`` is renamed to ``EWJ.Price.Percent.Today``, and rows
    dated after ``cutoff_date`` are removed before the result is written.

    Args:
        file_list: Paths (or file-like objects) of the CSV files to combine.
            The combined data must contain a ``Date`` column.
        output_file: Where the combined CSV is written.
        cutoff_date: Last calendar day to keep, inclusive. It is interpreted
            in UTC, matching the UTC conversion applied to ``Date``.

    Raises:
        ValueError: If ``file_list`` is empty (raised by ``pd.concat``).
        KeyError: If none of the files provides a ``Date`` column.
    """
    frames = [pd.read_csv(path) for path in file_list]

    # NOTE(review): axis=1 concatenation lines rows up on the default integer
    # index produced by read_csv, i.e. by row position and not by Date value.
    # That is only correct if every file lists the same dates in the same
    # order -- confirm against the inputs.
    combined_df = pd.concat(frames, axis=1)

    # Keep the first occurrence of each repeated column name (the repeated
    # Date columns). NOTE(review): this also discards any other column whose
    # name appears in more than one file -- verify that is intended.
    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

    # errors='ignore' skips whichever of these columns are absent.
    combined_df = combined_df.drop(
        columns=['Open', 'Close', 'Low', 'High'], errors='ignore'
    )

    # rename() silently ignores the mapping when the source column is missing.
    combined_df = combined_df.rename(
        columns={'EWJ.Percent.Price': 'EWJ.Price.Percent.Today'}
    )

    # Convert Date to timezone-aware UTC timestamps.
    combined_df['Date'] = pd.to_datetime(combined_df['Date'], utc=True)

    # Keep everything up to and including the cutoff day. Comparing with
    # "<= midnight of the cutoff" would drop rows on the cutoff day that carry
    # a time of day (e.g. 00:00-04:00 becomes 04:00 UTC), so compare against
    # the start of the following day instead.
    cutoff_day = pd.to_datetime(cutoff_date, utc=True).normalize()
    day_after_cutoff = cutoff_day + pd.Timedelta(days=1)
    combined_df = combined_df[combined_df['Date'] < day_after_cutoff]

    combined_df.to_csv(output_file, index=False)
    print(f"Combined data saved to {output_file}")

def create_time_lagged_features(input_file='input.d.csv', time_lags=(10, 2, 1, 0),
                                output_file='input.engineered.csv'):
    """Add time-lagged copies of every feature column and save the result.

    The input CSV is sorted by ``Date`` and, for each lag, every column other
    than ``Date`` and ``EWJ.Price.Percent.Today`` is shifted down by that many
    rows and renamed ``<column>_lag_<lag>``. ``Date`` and
    ``EWJ.Price.Percent.Today`` appear once, unshifted. The first ``lag`` rows
    of each lagged column are NaN; they are not dropped.

    Args:
        input_file: CSV to read; it must contain a ``Date`` column.
        time_lags: Row offsets to generate, in the order their columns should
            appear in the output.
        output_file: Where the engineered CSV is written.

    Raises:
        ValueError: If ``time_lags`` is empty.
    """
    lags = list(time_lags)
    if not lags:
        raise ValueError("time_lags must contain at least one lag")

    df = pd.read_csv(input_file)

    # Convert Date to timezone-aware UTC timestamps, then order
    # chronologically so that shift(lag) means "lag rows earlier in time".
    df['Date'] = pd.to_datetime(df['Date'], utc=True)
    df = df.sort_values('Date')

    # Date and EWJ.Price.Percent.Today are left unshifted; every other column
    # is lagged. The selection does not depend on the lag, so compute it once.
    unshifted = ('Date', 'EWJ.Price.Percent.Today')
    columns_to_lag = [col for col in df.columns if col not in unshifted]

    lagged_dfs = []
    for lag in lags:
        # Work on a full copy so the original column order is preserved.
        lagged_df = df.copy()
        lagged_df[columns_to_lag] = lagged_df[columns_to_lag].shift(lag)
        lagged_df = lagged_df.rename(
            columns={col: f'{col}_lag_{lag}' for col in columns_to_lag}
        )
        lagged_dfs.append(lagged_df)

    # All frames share the sorted frame's index, so this joins row for row.
    final_df = pd.concat(lagged_dfs, axis=1)

    # Each copy carries its own unshifted columns; keep only the first of each.
    final_df = final_df.loc[:, ~final_df.columns.duplicated()]

    final_df.to_csv(output_file, index=False)
    print(f"Engineered features saved to {output_file}")

# Input CSVs in the order their columns should appear in the combined file.
# The per-company files all follow the "<name>_price_change.csv" pattern.
files = [
    'etf_momentum_delta.csv',
    'NYSE.market.vol.csv',
    'US.JPN.macro.indicators.csv',
    *(
        f'{company}_price_change.csv'
        for company in (
            'Toyota',
            'Mitsubishi_UFJ_Financial_Group',
            'Sony',
            'Hitachi',
            'R_Holdings',
            'Sumitomo_Mitsui_Financial_Group',
            'Keyence',
            'Tokio_Marine_Holdings',
            'Tokyo_Electron',
            'Shin_Etsu_Chemical',
        )
    ),
    'Nikkei_225_price_metrics.csv',
]

# Build input.d.csv first; the feature step reads that file by default.
concat_csvs_by_date(files)
create_time_lagged_features()

Combined data saved to input.d.csv
Engineered features saved to input.engineered.csv
