## Consolidate Financial and Macroeconomic Data

In [2]:
import pandas as pd
import os

# --- Configuration ---
# Directory that holds every raw CSV read by this notebook.
DATA_FOLDER = "test"

# Daily index price files; each key doubles as the column prefix after merging.
FINANCIAL_FILES = {
    key: os.path.join(DATA_FOLDER, filename)
    for key, filename in (
        ('sp500', 's&p_500_daily.csv'),
        ('nasdaq', 'nasdaq_daily.csv'),
        ('ftse100', 'ftse_100_daily.csv'),
    )
}

# Macroeconomic series; each key becomes the suffix of a `macro_<key>` column.
MACRO_FILES = {
    key: os.path.join(DATA_FOLDER, filename)
    for key, filename in (
        ('gdp', 'macro_gdp.csv'),
        ('inflation', 'macro_inflation_cpi.csv'),
        ('unemployment', 'macro_unemploymentrate.csv'),
        ('vix', 'macro_vix.csv'),
    )
}

# --- Load and Combine Data ---

# Load the primary financial index (S&P 500); its dates define the master index.
# CSV layout (0-based rows, as `skiprows` counts them):
#   row 0 = field headers, row 1 = tickers, row 2 = "Date" placeholder, rows 3+ = data.
# Rows 1 and 2 are skipped and column 0 (headed "Price") is parsed as the date index.
df_master = pd.read_csv(FINANCIAL_FILES['sp500'], skiprows=[1, 2], index_col=0, parse_dates=True)

# Rename the index to something more meaningful
df_master.index.name = 'Date'

# Select and rename key columns to avoid confusion
df_master = df_master[['Open', 'High', 'Low', 'Close', 'Volume']].add_prefix('sp500_')

# Load and merge other financial indices (Close/Volume only), aligned on S&P 500 dates
for name, path in FINANCIAL_FILES.items():
    if name != 'sp500':
        df_temp = pd.read_csv(path, skiprows=[1, 2], index_col=0, parse_dates=True)
        df_temp.index.name = 'Date'
        df_master = df_master.merge(
            df_temp[['Close', 'Volume']].add_prefix(f'{name}_'),
            left_index=True,
            right_index=True,
            how='left'
        )

# Load and merge macroeconomic data (missing files are skipped with a warning)
for name, path in MACRO_FILES.items():
    try:
        df_macro = pd.read_csv(path, index_col='DATE', parse_dates=True)
    except FileNotFoundError:
        print(f"Warning: {path} not found, skipping {name} data")
        continue

    # Rename the first column to be specific
    df_macro = df_macro.rename(columns={df_macro.columns[0]: f'macro_{name}'})

    # Align to trading days with a backward-looking fill BEFORE merging.
    # An exact-date left merge silently drops any observation stamped on a
    # non-trading day (e.g. a period starting on Jan 1 or on a weekend), after
    # which a later forward-fill would carry the *previous* period's value.
    # reindex(method='ffill') needs a unique, sorted source index.
    usable_rows = df_macro.index.notna() & ~df_macro.index.duplicated(keep='last')
    df_macro = df_macro[usable_rows].sort_index()
    df_macro = df_macro.reindex(df_master.index, method='ffill')

    df_master = df_master.merge(df_macro, left_index=True, right_index=True, how='left')

# --- Clean and Preprocess ---

# Carry each macro reading forward until the next one appears.
macro_cols = list(filter(lambda column: 'macro_' in column, df_master.columns))
if macro_cols:
    df_master[macro_cols] = df_master[macro_cols].ffill()

# Time-weighted interpolation for whatever gaps remain, then discard rows that
# are still incomplete (nothing earlier exists to fill from at the very start).
# NOTE(review): interpolation draws on the *next* observation as well as the
# previous one -- confirm that is acceptable for forecasting use.
df_master = df_master.interpolate(method='time')
df_master = df_master.dropna()

# --- Report ---
print("‚úÖ Master DataFrame created successfully!")
print("Shape of the data:", df_master.shape)
for heading, payload in (
    ("\nColumn names:", df_master.columns.tolist()),
    ("\nFirst 5 rows:", df_master.head()),
    ("\nLast 5 rows:", df_master.tail()),
    ("\nData types:", df_master.dtypes),
):
    print(heading)
    print(payload)
print("\nDate range:")
print(f"From: {df_master.index.min()} To: {df_master.index.max()}")

‚úÖ Master DataFrame created successfully!
Shape of the data: (331, 13)

Column names:
['sp500_Open', 'sp500_High', 'sp500_Low', 'sp500_Close', 'sp500_Volume', 'nasdaq_Close', 'nasdaq_Volume', 'ftse100_Close', 'ftse100_Volume', 'macro_gdp', 'macro_inflation', 'macro_unemployment', 'macro_vix']

First 5 rows:
             sp500_Open   sp500_High    sp500_Low  sp500_Close  sp500_Volume  \
Date                                                                           
2024-04-01  5257.970215  5263.950195  5229.200195  5243.770020    3325930000   
2024-04-02  5204.290039  5208.339844  5184.049805  5205.810059    3886590000   
2024-04-03  5194.370117  5228.750000  5194.370117  5211.490234    3703250000   
2024-04-04  5244.049805  5256.589844  5146.060059  5147.209961    4075680000   
2024-04-05  5158.950195  5222.180176  5157.209961  5204.339844    3386780000   

            nasdaq_Close  nasdaq_Volume  ftse100_Close  ftse100_Volume  \
Date                                                   

## Process GDELT Data to Create Sentiment Features

In [2]:
import pandas as pd
import numpy as np

# --- Load GDELT Data ---
# Path to the raw GDELT export, relative to the notebook's working directory.
gdelt_path = 'test/gdelt_usa_10year_raw.csv'
print(f"Loading GDELT data from {gdelt_path}...")
# Read with pandas defaults; the following steps use the 'V2Tone' and 'DATE' columns.
df_gdelt = pd.read_csv(gdelt_path)

# Report (rows, columns) as a quick sanity check on the load.
print(f"GDELT data shape: {df_gdelt.shape}")

# --- Parse Sentiment from V2Tone ---
def parse_v2tone_quick(v2tone_str):
    """Return the first comma-separated field of a V2Tone value as a float.

    The input is stringified first, so non-string values (numbers, None, NaN)
    are accepted. Anything whose leading field is not a valid float literal
    yields NaN instead of raising.

    Args:
        v2tone_str: Raw V2Tone cell value, e.g. "-2.5,1.0,3.5,...".

    Returns:
        float: The leading field (the average tone), or ``np.nan`` when it
        cannot be parsed.
    """
    leading_field, _, _ = str(v2tone_str).partition(',')
    try:
        tone = float(leading_field)
    except ValueError:
        tone = np.nan  # unparseable -> missing
    return tone

# Average tone per GDELT record (NaN where the V2Tone value cannot be parsed).
df_gdelt['tone_avg'] = df_gdelt['V2Tone'].apply(parse_v2tone_quick)

# --- Aggregate Sentiment by Day ---
# DATE is a YYYYMMDDHHMMSS stamp; keep only the calendar day for grouping.
df_gdelt['date_only'] = pd.to_datetime(df_gdelt['DATE'], format='%Y%m%d%H%M%S').dt.date

# Calculate the mean sentiment for each day (NaN tones are ignored by mean()).
daily_sentiment = df_gdelt.groupby('date_only')['tone_avg'].mean().to_frame(name='gdelt_sentiment')
daily_sentiment.index = pd.to_datetime(daily_sentiment.index)

print(f"Daily sentiment shape: {daily_sentiment.shape}")
print(f"Daily sentiment date range: {daily_sentiment.index.min()} to {daily_sentiment.index.max()}")

# --- Clean existing sentiment columns if they exist ---
# Makes the cell safe to re-run: a second merge would otherwise create _x/_y columns.
sentiment_cols = [col for col in df_master.columns if 'gdelt_sentiment' in col]
if sentiment_cols:
    print(f"Removing existing sentiment columns: {sentiment_cols}")
    df_master = df_master.drop(columns=sentiment_cols)

# --- Merge into Master DataFrame ---
# Left merge: only sentiment dated on a master (trading-day) row is kept.
print(f"Master DataFrame shape before merge: {df_master.shape}")
df_master = df_master.merge(daily_sentiment, left_index=True, right_index=True, how='left')
print(f"Master DataFrame shape after merge: {df_master.shape}")

# Surface stale coverage before filling: every master row dated after the last
# GDELT day would otherwise silently receive the same forward-filled constant.
last_sentiment_date = daily_sentiment.index.max()
stale_rows = int((df_master.index > last_sentiment_date).sum())
if stale_rows:
    print(
        f"Warning: {stale_rows} rows fall after the last GDELT date "
        f"({last_sentiment_date:%Y-%m-%d}); their sentiment is a forward-filled constant."
    )

# Fill rows without a same-day sentiment value with the last known value
df_master['gdelt_sentiment'] = df_master['gdelt_sentiment'].ffill()

# Fill any remaining initial NaNs with 0 (neutral)
df_master['gdelt_sentiment'] = df_master['gdelt_sentiment'].fillna(0)

print("‚úÖ GDELT sentiment successfully merged into master DataFrame!")
print("Final shape:", df_master.shape)
print("\nColumn names:")
print(df_master.columns.tolist())
print("\nSample data with sentiment:")
print(df_master[['sp500_Close', 'gdelt_sentiment']].tail())
print(f"\nSentiment statistics:")
print(f"Mean: {df_master['gdelt_sentiment'].mean():.3f}")
print(f"Std: {df_master['gdelt_sentiment'].std():.3f}")
print(f"Min: {df_master['gdelt_sentiment'].min():.3f}")
print(f"Max: {df_master['gdelt_sentiment'].max():.3f}")
print(f"Missing values: {df_master['gdelt_sentiment'].isna().sum()}")

Loading GDELT data from test/gdelt_usa_10year_raw.csv...
GDELT data shape: (16202, 37)
Daily sentiment shape: (521, 1)
Daily sentiment date range: 2024-01-01 00:00:00 to 2025-06-12 00:00:00
Master DataFrame shape before merge: (331, 13)
Master DataFrame shape after merge: (331, 14)
‚úÖ GDELT sentiment successfully merged into master DataFrame!
Final shape: (331, 14)

Column names:
['sp500_Open', 'sp500_High', 'sp500_Low', 'sp500_Close', 'sp500_Volume', 'nasdaq_Close', 'nasdaq_Volume', 'ftse100_Close', 'ftse100_Volume', 'macro_gdp', 'macro_inflation', 'macro_unemployment', 'macro_vix', 'gdelt_sentiment']

Sample data with sentiment:
            sp500_Close  gdelt_sentiment
Date                                    
2025-07-21  6305.600098        -2.529748
2025-07-22  6309.620117        -2.529748
2025-07-23  6358.910156        -2.529748
2025-07-24  6363.350098        -2.529748
2025-07-25  6388.640137        -2.529748

Sentiment statistics:
Mean: -2.333
Std: 0.777
Min: -4.677
Max: 2.809
Mis

## Final Dataset Summary

In [3]:
# Final verification and summary of the consolidated dataset
# Read-only report over df_master: nothing here modifies the frame.
print("="*60)
print("FINAL DATASET SUMMARY")
print("="*60)

# Overall size and the calendar span between the first and last index entries.
print(f"üìä Dataset Shape: {df_master.shape[0]:,} rows √ó {df_master.shape[1]} columns")
print(f"üìÖ Date Range: {df_master.index.min().strftime('%Y-%m-%d')} to {df_master.index.max().strftime('%Y-%m-%d')}")
print(f"‚è±Ô∏è  Duration: {(df_master.index.max() - df_master.index.min()).days:,} days")

print("\nüìà Data Categories:")
# Columns are grouped by substring match on their names.
# Note: macro_cols and sentiment_cols rebind names assigned in earlier cells.
financial_cols = [col for col in df_master.columns if any(x in col for x in ['sp500', 'nasdaq', 'ftse'])]
macro_cols = [col for col in df_master.columns if 'macro_' in col]
sentiment_cols = [col for col in df_master.columns if 'gdelt' in col]

print(f"  ‚Ä¢ Financial Data: {len(financial_cols)} columns")
print(f"  ‚Ä¢ Macroeconomic Data: {len(macro_cols)} columns") 
print(f"  ‚Ä¢ Sentiment Data: {len(sentiment_cols)} columns")

print("\nüîç Data Quality Check:")
# Total count of null cells, then the number of rows containing at least one null.
print(f"  ‚Ä¢ Total missing values: {df_master.isnull().sum().sum()}")
print(f"  ‚Ä¢ Rows with any missing values: {df_master.isnull().any(axis=1).sum()}")

print("\nüìã Column Summary:")
# One line per column: 1-based position, name, null count and null percentage.
for i, col in enumerate(df_master.columns, 1):
    missing = df_master[col].isnull().sum()
    print(f"  {i:2}. {col:<20} - Missing: {missing:4} ({missing/len(df_master)*100:.1f}%)")

print("\n‚úÖ Dataset is ready for modeling and analysis!")
print("="*60)

FINAL DATASET SUMMARY
üìä Dataset Shape: 331 rows √ó 14 columns
üìÖ Date Range: 2024-04-01 to 2025-07-25
‚è±Ô∏è  Duration: 480 days

üìà Data Categories:
  ‚Ä¢ Financial Data: 9 columns
  ‚Ä¢ Macroeconomic Data: 4 columns
  ‚Ä¢ Sentiment Data: 1 columns

üîç Data Quality Check:
  ‚Ä¢ Total missing values: 0
  ‚Ä¢ Rows with any missing values: 0

üìã Column Summary:
   1. sp500_Open           - Missing:    0 (0.0%)
   2. sp500_High           - Missing:    0 (0.0%)
   3. sp500_Low            - Missing:    0 (0.0%)
   4. sp500_Close          - Missing:    0 (0.0%)
   5. sp500_Volume         - Missing:    0 (0.0%)
   6. nasdaq_Close         - Missing:    0 (0.0%)
   7. nasdaq_Volume        - Missing:    0 (0.0%)
   8. ftse100_Close        - Missing:    0 (0.0%)
   9. ftse100_Volume       - Missing:    0 (0.0%)
  10. macro_gdp            - Missing:    0 (0.0%)
  11. macro_inflation      - Missing:    0 (0.0%)
  12. macro_unemployment   - Missing:    0 (0.0%)
  13. macro_vix            

## Create Time-Series & Crisis Features

In [4]:
import pandas as pd

# This assumes your master dataframe is named df_master

# --- Create Time-Series Features ---
# Moving averages of the S&P 500 close. Windows count rows (one per date in
# the index), not calendar days.
df_master['sp500_ma_7'] = df_master['sp500_Close'].rolling(window=7).mean()
df_master['sp500_ma_30'] = df_master['sp500_Close'].rolling(window=30).mean()

# Lagged values of the S&P 500 close
for i in range(1, 4): # Create 3 lag features (t-1, t-2, t-3)
    df_master[f'sp500_lag_{i}'] = df_master['sp500_Close'].shift(i)

# --- Label Crisis Periods ---
# Crisis periods as name -> (start_date, end_date); both ends are inclusive.
crisis_periods = {
    '2015_China_Market_Crash': ('2015-06-15', '2016-02-11'),
    '2018_Volatility': ('2018-10-01', '2018-12-31'),
    'COVID_Crash': ('2020-02-19', '2020-03-23'),
    '2022_Inflation_Crash': ('2022-01-01', '2022-12-31')
}

# Create the 'is_crisis' column, initialized to 0
df_master['is_crisis'] = 0

# Label-slice the date index; a period outside the data selects no rows.
for crisis, (start, end) in crisis_periods.items():
    df_master.loc[start:end, 'is_crisis'] = 1

# Clean up by dropping initial rows with NaNs from lags/MA
df_master.dropna(inplace=True)

print("‚úÖ Time-series and crisis features added!")
print("Number of crisis days labeled:", df_master['is_crisis'].sum())

# Show comprehensive date range information
print(f"\nüìÖ FULL DATASET DATE RANGE:")
print(f"   Start: {df_master.index.min().strftime('%Y-%m-%d')}")
print(f"   End: {df_master.index.max().strftime('%Y-%m-%d')}")
print(f"   Total days: {len(df_master):,}")

# Show first and last few rows to confirm full range
print(f"\nüîç FIRST 3 ROWS (earliest dates):")
print(df_master[['sp500_Close', 'is_crisis']].head(3))

print(f"\nüîç LAST 3 ROWS (latest dates):")
print(df_master[['sp500_Close', 'is_crisis']].tail(3))

# Show crisis period breakdown
print(f"\nüìä CRISIS PERIOD BREAKDOWN:")
for crisis, (start, end) in crisis_periods.items():
    # Count with the same label slice used for labelling. Requiring both
    # boundary dates to be exact index members would report 0 days for any
    # period that starts or ends on a date absent from the index (weekend or
    # holiday), even though rows inside it were labelled.
    crisis_data = df_master.loc[start:end]
    crisis_count = len(crisis_data)
    print(f"   {crisis}: {crisis_count} days ({start} to {end})")

print(f"\nüìà TOTAL CRISIS vs NON-CRISIS DAYS:")
crisis_counts = df_master['is_crisis'].value_counts()
print(f"   Non-Crisis (0): {crisis_counts.get(0, 0):,} days")
print(f"   Crisis (1): {crisis_counts.get(1, 0):,} days")

‚úÖ Time-series and crisis features added!
Number of crisis days labeled: 0

üìÖ FULL DATASET DATE RANGE:
   Start: 2024-05-10
   End: 2025-07-25
   Total days: 302

üîç FIRST 3 ROWS (earliest dates):
            sp500_Close  is_crisis
Date                              
2024-05-10  5222.680176          0
2024-05-13  5221.419922          0
2024-05-14  5246.680176          0

üîç LAST 3 ROWS (latest dates):
            sp500_Close  is_crisis
Date                              
2025-07-23  6358.910156          0
2025-07-24  6363.350098          0
2025-07-25  6388.640137          0

üìä CRISIS PERIOD BREAKDOWN:
   2015_China_Market_Crash: 0 days (2015-06-15 to 2016-02-11)
   2018_Volatility: 0 days (2018-10-01 to 2018-12-31)
   COVID_Crash: 0 days (2020-02-19 to 2020-03-23)
   2022_Inflation_Crash: 0 days (2022-01-01 to 2022-12-31)

üìà TOTAL CRISIS vs NON-CRISIS DAYS:
   Non-Crisis (0): 302 days
   Crisis (1): 0 days


## Data Normalization & Final Prep


In [5]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# This assumes your fully-featured dataframe is named df_master

# The crisis flag is a label, not a feature: hold it aside so it is not rescaled.
is_crisis_col = df_master['is_crisis']
features_to_scale = df_master.drop(columns=['is_crisis'])

# Learn each column's min/max from this frame, then map every column to [0, 1].
# NOTE(review): the scaler is fitted on this dataset itself -- confirm that is
# intended if the output is compared against data scaled elsewhere.
scaler = MinMaxScaler().fit(features_to_scale)
scaled_features = scaler.transform(features_to_scale)

# Rebuild a labelled frame (the scaler returns a bare array) and re-attach the flag.
df_scaled = pd.DataFrame(
    data=scaled_features,
    index=df_master.index,
    columns=features_to_scale.columns,
).assign(is_crisis=is_crisis_col)

print("‚úÖ Data successfully scaled!")
print("Shape of scaled data:", df_scaled.shape)
print("\nSample of scaled data:")
print(df_scaled.head())

# Persist the scaled frame, index included.
df_scaled.to_csv('test/usa_forward_test_processed.csv')


‚úÖ Data successfully scaled!
Shape of scaled data: (302, 20)

Sample of scaled data:
            sp500_Open  sp500_High  sp500_Low  sp500_Close  sp500_Volume  \
Date                                                                       
2024-05-10    0.191849    0.005779   0.244306     0.170649      0.240586   
2024-05-13    0.197208    0.003715   0.245271     0.169752      0.323077   
2024-05-14    0.188749    0.014989   0.249718     0.187720      0.388762   
2024-05-15    0.218518    0.067780   0.279245     0.231444      0.336670   
2024-05-16    0.251571    0.079587   0.300719     0.223584      0.266397   

            nasdaq_Close  nasdaq_Volume  ftse100_Close  ftse100_Volume  \
Date                                                                     
2024-05-10      0.183713       0.007624       0.517033        0.312141   
2024-05-13      0.191824       0.008185       0.504147        0.435782   
2024-05-14      0.212874       0.039400       0.513126        0.518928   
2024-05-15 

In [10]:
# Import all necessary libraries
import pandas as pd
import numpy as np
import os

print("--- üöÄ Starting Local Preprocessing of Fresh US Data (2024-2025) ---")

# --- Configuration ---
# Name of the folder that contains the new raw data files.
RAW_DATA_FOLDER = "test"

# Daily index price files; each key doubles as the column prefix after merging.
FINANCIAL_FILES = {
    key: os.path.join(RAW_DATA_FOLDER, filename)
    for key, filename in (
        ('sp500', 's&p_500_daily.csv'),
        ('nasdaq', 'nasdaq_daily.csv'),
        ('ftse100', 'ftse_100_daily.csv'),
    )
}

# Macroeconomic series; each key becomes the suffix of a `macro_<key>` column.
MACRO_FILES = {
    key: os.path.join(RAW_DATA_FOLDER, filename)
    for key, filename in (
        ('gdp', 'macro_gdp.csv'),
        ('inflation', 'macro_inflation_cpi.csv'),
        ('unemployment', 'macro_unemploymentrate.csv'),
        ('vix', 'macro_vix.csv'),
    )
}

# The GDELT export must carry exactly this file name inside the raw data folder.
GDELT_FILE = os.path.join(RAW_DATA_FOLDER, 'gdelt_usa_10year_raw.csv')


# --- 1. Load and Combine Financial & Macro Data ---
print("\nStep 1: Loading and merging financial and macroeconomic data...")
try:
    # Load S&P 500 as the base; its dates define the master index.
    # skiprows=[1, 2] drops the ticker row and the "Date" placeholder row that
    # follow the header line; column 0 is parsed as the date index.
    df_master = pd.read_csv(FINANCIAL_FILES['sp500'], skiprows=[1, 2], index_col=0, parse_dates=True)
    df_master.index.name = 'Date'
    df_master = df_master[['Open', 'High', 'Low', 'Close', 'Volume']].add_prefix('sp500_')

    # Merge other financial indices (Close/Volume only), aligned on S&P 500 dates
    for name, path in FINANCIAL_FILES.items():
        if name != 'sp500':
            df_temp = pd.read_csv(path, skiprows=[1, 2], index_col=0, parse_dates=True)
            df_temp.index.name = 'Date'
            df_master = df_master.merge(
                df_temp[['Close', 'Volume']].add_prefix(f'{name}_'),
                left_index=True, right_index=True, how='left'
            )

    # Merge macroeconomic data (each series is optional: a missing file is skipped)
    for name, path in MACRO_FILES.items():
        try:
            df_macro = pd.read_csv(path, index_col='DATE', parse_dates=True)
        except FileNotFoundError:
            print(f"Warning: {path} not found, skipping {name} data")
            continue

        df_macro = df_macro.rename(columns={df_macro.columns[0]: f'macro_{name}'})

        # Align to trading days with a backward-looking fill BEFORE merging.
        # An exact-date left merge silently drops any observation stamped on a
        # non-trading day (e.g. a period starting on Jan 1 or on a weekend),
        # after which the forward-fill below would carry the *previous*
        # period's value. reindex(method='ffill') needs a unique, sorted index.
        usable_rows = df_macro.index.notna() & ~df_macro.index.duplicated(keep='last')
        df_macro = df_macro[usable_rows].sort_index()
        df_macro = df_macro.reindex(df_master.index, method='ffill')

        df_master = df_master.merge(df_macro, left_index=True, right_index=True, how='left')

    # Forward-fill macro data and interpolate any gaps
    macro_cols = [col for col in df_master.columns if 'macro_' in col]
    if macro_cols:
        df_master[macro_cols] = df_master[macro_cols].ffill()
    df_master.interpolate(method='time', inplace=True)
    df_master.dropna(inplace=True)
    print("‚úÖ Financial and Macro data successfully combined.")

except FileNotFoundError as e:
    print(f"‚ùå ERROR: A required file was not found. Please check your file paths. Missing file: {e.filename}")
    # Stop here: continuing would either hit a NameError or silently keep
    # processing a stale df_master left over from an earlier cell.
    raise


# --- 2. Process and Merge GDELT Sentiment Data ---
print("\nStep 2: Processing and merging GDELT sentiment data...")
try:
    df_gdelt = pd.read_csv(GDELT_FILE)

    def parse_v2tone_quick(v2tone_str):
        """Return the first comma-separated V2Tone field as a float, or NaN if unparseable."""
        try:
            return float(str(v2tone_str).split(',')[0])
        except (ValueError, IndexError):
            return np.nan

    # Per-record tone, then the mean tone per calendar day.
    df_gdelt['tone_avg'] = df_gdelt['V2Tone'].apply(parse_v2tone_quick)
    df_gdelt['date_only'] = pd.to_datetime(df_gdelt['DATE'], format='%Y%m%d%H%M%S').dt.date
    daily_sentiment = df_gdelt.groupby('date_only')['tone_avg'].mean().to_frame(name='gdelt_sentiment')
    daily_sentiment.index = pd.to_datetime(daily_sentiment.index)

    # Left merge keeps the master's dates; rows without a same-day value take
    # the last known sentiment, and any leading gap is set to 0.
    df_master = df_master.merge(daily_sentiment, left_index=True, right_index=True, how='left')
    df_master['gdelt_sentiment'] = df_master['gdelt_sentiment'].ffill().fillna(0)
    print("‚úÖ GDELT sentiment data successfully merged.")

except FileNotFoundError:
    print(f"‚ùå ERROR: GDELT file not found at '{GDELT_FILE}'. Please check the file path.")
    # Stop here: otherwise the remaining steps would save a "model-ready" file
    # that has no gdelt_sentiment column.
    raise


# --- 3. Create Time-Series Features ---
print("\nStep 3: Creating time-series features (moving averages and lags)...")

# Rolling means of the S&P 500 close; each window counts rows of the index.
for window in (7, 30):
    df_master[f'sp500_ma_{window}'] = df_master['sp500_Close'].rolling(window=window).mean()

# Previous-row closes: t-1, t-2 and t-3.
for i in (1, 2, 3):
    df_master[f'sp500_lag_{i}'] = df_master['sp500_Close'].shift(i)

# The rolling/shift warm-up leaves NaNs in the earliest rows; discard them.
df_master = df_master.dropna()
print("‚úÖ Time-series features successfully added.")


# --- 4. Save the Final Processed File ---
# NOTE(review): the scaling cell earlier in this notebook writes to this same
# path, so whichever cell runs last determines the file's contents -- confirm
# which version downstream code expects.
FINAL_OUTPUT_FILE = 'test/usa_forward_test_processed.csv'
df_master.to_csv(FINAL_OUTPUT_FILE)

print(f"\n--- üéâ Preprocessing Complete! ---")
print(f"‚úÖ Your model-ready data has been saved as: '{FINAL_OUTPUT_FILE}'")
print(f"Final data shape: {df_master.shape}")
print("\nSample of the final processed data:")
print(df_master.head())

--- üöÄ Starting Local Preprocessing of Fresh US Data (2024-2025) ---

Step 1: Loading and merging financial and macroeconomic data...
‚úÖ Financial and Macro data successfully combined.

Step 2: Processing and merging GDELT sentiment data...
‚úÖ GDELT sentiment data successfully merged.

Step 3: Creating time-series features (moving averages and lags)...
‚úÖ Time-series features successfully added.

--- üéâ Preprocessing Complete! ---
‚úÖ Your model-ready data has been saved as: 'test/usa_forward_test_processed.csv'
Final data shape: (302, 19)

Sample of the final processed data:
             sp500_Open   sp500_High    sp500_Low  sp500_Close  sp500_Volume  \
Date                                                                           
2024-05-10  5225.490234  5239.660156  5209.680176  5222.680176    3617900000   
2024-05-13  5233.080078  5237.259766  5211.160156  5221.419922    4255710000   
2024-05-14  5221.100098  5250.370117  5217.979980  5246.680176    4763580000   
2024-05-15

## Visualize Key Relationships
