# In [1]:
import pandas as pd
import os
import glob

def concat_csvs_by_date():
    """Combine the price CSVs in the current directory into 'input.d.csv'.

    Every file matching '*_price_change.csv' or '*_price_metrics.csv' is read
    and its columns are placed side by side, with rows aligned on the value
    of the 'Date' column (outer join), not on row position. When the same
    column name appears in more than one file, only the first occurrence is
    kept. The output has 'Date' as its first column.

    Raises:
        ValueError: if no matching file is found, or if a matching file has
            no 'Date' column.

    Note:
        pandas cannot align frames whose 'Date' values differ between files
        when a file repeats a date; in that case pd.concat raises.
    """
    # glob returns entries in arbitrary directory order; sort each group so
    # the column order of the output is reproducible between runs.
    csv_files = (sorted(glob.glob('*_price_change.csv'))
                 + sorted(glob.glob('*_price_metrics.csv')))
    if not csv_files:
        raise ValueError(
            "No '*_price_change.csv' or '*_price_metrics.csv' files found "
            "in the current directory"
        )

    dfs = []
    for file in csv_files:
        df = pd.read_csv(file)
        if 'Date' not in df.columns:
            raise ValueError(f"{file} has no 'Date' column to align on")
        # Index by Date so the axis=1 concat below joins rows on the date
        # value instead of on their position within each file.
        dfs.append(df.set_index('Date'))

    combined_df = pd.concat(dfs, axis=1)

    # Drop repeated column names, keeping the first occurrence
    combined_df = combined_df.loc[:, ~combined_df.columns.duplicated()]

    # Turn the Date index back into an ordinary (first) column
    combined_df = combined_df.rename_axis('Date').reset_index()

    combined_df.to_csv('input.d.csv', index=False)
    print("Combined data saved to input.d.csv")

def create_time_lagged_features(input_file='input.d.csv',
                                output_file='input.engineered.csv',
                                time_lags=(10, 2, 1, 0)):
    """Write a CSV holding time-lagged copies of every non-Date column.

    The input is read, its 'Date' column is parsed as datetimes and the rows
    are sorted by it. For each lag in ``time_lags`` every column other than
    'Date' is shifted down by that many rows and renamed '<column>_lag_<lag>'.
    All lagged columns are placed side by side next to a single 'Date'
    column; rows with no value ``lag`` rows earlier are left empty (NaN).

    Args:
        input_file: path of the CSV to read; must contain a 'Date' column.
        output_file: path of the CSV to write.
        time_lags: iterable of row offsets, processed in the given order
            (which determines the column order of the output).

    Raises:
        ValueError: if ``time_lags`` is empty.
    """
    time_lags = list(time_lags)
    if not time_lags:
        raise ValueError("time_lags must contain at least one lag")

    df = pd.read_csv(input_file)
    df['Date'] = pd.to_datetime(df['Date'])

    # Stable sort: rows sharing a date keep their file order, so the lagged
    # values are reproducible from run to run.
    df = df.sort_values('Date', kind='mergesort')

    # Everything except Date gets shifted; this does not depend on the lag,
    # so compute it once outside the loop.
    feature_cols = df.columns[df.columns != 'Date']

    lagged_dfs = []
    for lag in time_lags:
        lagged_df = df.copy()
        lagged_df[feature_cols] = df[feature_cols].shift(lag)
        lagged_df = lagged_df.rename(
            columns={col: f'{col}_lag_{lag}' for col in feature_cols}
        )
        lagged_dfs.append(lagged_df)

    final_df = pd.concat(lagged_dfs, axis=1)

    # Each lagged frame carries its own Date column; keep only the first
    final_df = final_df.loc[:, ~final_df.columns.duplicated()]

    final_df.to_csv(output_file, index=False)
    print(f"Engineered features saved to {output_file}")

# Run both steps in order: concat_csvs_by_date() writes 'input.d.csv', which
# is the default input file read by create_time_lagged_features().
concat_csvs_by_date()
create_time_lagged_features()

# Combined data saved to input.d.csv
# Engineered features saved to input.engineered.csv
