## Financial Indices Data üìà

In [1]:
import yfinance as yf
import os
import pandas as pd

# --- Configuration ---
# List of tickers for the indices you need 
tickers = {
    "S&P 500": "^GSPC",
    "NASDAQ": "^IXIC",
    "FTSE 100": "^FTSE"
}

# Set the date range for the data 
# We use today's date for the end since 2025 is in the future.
start_date = "2024-01-01"
end_date = pd.Timestamp.now().strftime('%Y-%m-%d') # Gets today's date

# Path to save the test
output_folder = "test"

# --- Data Collection ---
# Create the data folder if it doesn't exist
if not os.path.exists(output_folder):
    os.makedirs(output_folder)
    print(f"Created folder: {output_folder}")

# Loop through each ticker, download data, and save it
for name, ticker in tickers.items():
    print(f"Downloading data for {name} ({ticker})...")
    
    # Download the data using yfinance
    data = yf.download(ticker, start=start_date, end=end_date)
    
    if not data.empty:
        # Define the output file path
        file_name = f"{name.replace(' ', '_').lower()}_daily.csv"
        output_path = os.path.join(output_folder, file_name)
        
        # Save the data to a CSV file
        data.to_csv(output_path)
        print(f"Successfully saved data to {output_path}\n")
    else:
        print(f"Could not download data for {name}. It might be delisted or the ticker is wrong.\n")

print("--- Financial data collection complete! ---")

Created folder: test
Downloading data for S&P 500 (^GSPC)...


  data = yf.download(ticker, start=start_date, end=end_date)
[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, start=start_date, end=end_date)


Successfully saved data to test\s&p_500_daily.csv

Downloading data for NASDAQ (^IXIC)...


[*********************100%***********************]  1 of 1 completed
  data = yf.download(ticker, start=start_date, end=end_date)


Successfully saved data to test\nasdaq_daily.csv

Downloading data for FTSE 100 (^FTSE)...


[*********************100%***********************]  1 of 1 completed

Successfully saved data to test\ftse_100_daily.csv

--- Financial data collection complete! ---





##  Macroeconomic Indicators üè¶

In [2]:
import pandas_datareader.data as web
import os
import pandas as pd

# --- Configuration ---
# Define the series IDs from FRED for the indicators you need 
# Note: These are official codes from the FRED website.
series_ids = {
    "VIX": "VIXCLS",                     # Volatility Index (Daily)
    "GDP": "GDP",                        # Gross Domestic Product (Quarterly)
    "UnemploymentRate": "UNRATE",        # Unemployment Rate (Monthly)
    "Inflation_CPI": "CPIAUCSL"          # Consumer Price Index / Inflation (Monthly)
}

# Set the date range
start_date = "2024-01-01"
end_date = pd.Timestamp.now().strftime('%Y-%m-%d')

# Path to save the data
output_folder = "test"

# --- Data Collection ---
print("Downloading macroeconomic data from FRED...")

for name, series_id in series_ids.items():
    try:
        # Download data from FRED
        data = web.DataReader(series_id, 'fred', start_date, end_date)
        
        # Define the output file path
        output_path = os.path.join(output_folder, f"macro_{name.lower()}.csv")
        
        # Save to CSV
        data.to_csv(output_path)
        print(f"Successfully saved {name} data to {output_path}")
        
    except Exception as e:
        print(f"Could not download data for {name}. Error: {e}")

print("\n--- Macroeconomic data collection complete! ---")

Downloading macroeconomic data from FRED...
Successfully saved VIX data to test\macro_vix.csv
Successfully saved GDP data to test\macro_gdp.csv
Successfully saved UnemploymentRate data to test\macro_unemploymentrate.csv
Successfully saved Inflation_CPI data to test\macro_inflation_cpi.csv

--- Macroeconomic data collection complete! ---


## Sentiment Data (Text) üì∞

In [None]:
import os
import pandas as pd
from newsapi import NewsApiClient
import time

# --- IMPORTANT NOTE ---
# This script will ONLY work if you have a paid NewsAPI plan that allows
# access to historical articles. The free plan is limited to the last 30 days.

# --- Configuration ---
# Replace with your actual API key from a paid plan
api_key = 'YOUR_PAID_API_KEY' 

# The keyword or phrase to search for
search_query = 'stock market OR finance OR economy'

# Define the full date range for your research
start_date = '2015-01-01'
end_date = pd.Timestamp.now().strftime('%Y-%m-%d') # Today's date

# Path to save the final CSV file
output_folder = "data"
output_filename = os.path.join(output_folder, "news_headlines_2015_to_2025.csv")

# --- Initialization ---
newsapi = NewsApiClient(api_key=api_key)
all_articles_list = []

# Generate a list of start/end dates for each month in your range
# This is how we get around the API's limitations for large requests.
date_ranges = pd.date_range(start=start_date, end=end_date, freq='MS')

print(f"Starting data collection from {start_date} to {end_date}...")

# --- Main Loop to Fetch Historical Data ---
for month_start_date in date_ranges:
    month_end_date = (month_start_date + pd.offsets.MonthEnd(1)).strftime('%Y-%m-%d')
    month_start_date_str = month_start_date.strftime('%Y-%m-%d')
    
    print(f"Fetching news for: {month_start_date_str} to {month_end_date}...")
    
    try:
        # Make the request for the current month
        response = newsapi.get_everything(
            q=search_query,
            language='en',
            from_param=month_start_date_str,
            to=month_end_date,
            sort_by='publishedAt',
            page_size=100 # Max results per page
        )
        
        if response['status'] == 'ok':
            # Add the fetched articles to our master list
            for article in response['articles']:
                all_articles_list.append({
                    'date': article['publishedAt'],
                    'title': article['title'],
                    'source': article['source']['name']
                })
            print(f"  > Found {response['totalResults']} articles for this month.")
        else:
            print(f"  > Error fetching data for this month: {response.get('message')}")

    except Exception as e:
        print(f"  > An error occurred: {e}")

    # Be respectful to the API server, wait 1 second between requests
    time.sleep(1)

# --- Save to CSV ---
if all_articles_list:
    # Convert the list of articles into a pandas DataFrame
    articles_df = pd.DataFrame(all_articles_list)
    
    # Clean up the date column
    articles_df['date'] = pd.to_datetime(articles_df['date']).dt.tz_localize(None)
    
    # Save the final DataFrame to a CSV file
    articles_df.to_csv(output_filename, index=False)
    print(f"\nSuccessfully collected and saved {len(articles_df)} articles to {output_filename}")
else:
    print("\nNo articles were collected. Please check your API key and plan.")

print("\n--- Historical news data collection complete! ---")

##  Geopolitical & Crisis Events Data üóìÔ∏è

In [1]:
import gdelt
import pandas as pd
import os
import time

# --- IMPORTANT NOTE ---
# This is a heavy data request and can take a long time to run (possibly hours).
# It will search for events month-by-month over a 10+ year period.

# --- Configuration ---
# Set up GDELT version 2
gd = gdelt.gdelt(version=2)

# Define a smaller date range for testing (GDELT has strict limitations)
# Let's start with just a few recent months to test the approach
start_date = "2024-01-01"
end_date = "2024-01-07"  # Just 3 months for testing

# Path to save the final CSV file
output_folder = "data"
output_filename = os.path.join(output_folder, "gdelt_crisis_events_test.csv")

# --- Initialization ---
all_results = []
# Generate weekly date ranges instead of monthly (GDELT prefers smaller ranges)
date_ranges = pd.date_range(start=start_date, end=end_date, freq='W')

print(f"Starting GDELT data collection from {start_date} to {end_date}...")
print(f"Searching {len(date_ranges)} weekly periods...")
print(f"Note: This is a test run with a smaller date range to avoid API limitations")

# --- Main Loop to Fetch Historical Data ---
for i, week_start_date in enumerate(date_ranges):
    # Calculate the end of the current week (7 days later)
    week_end_date = (week_start_date + pd.Timedelta(days=6)).strftime('%Y-%m-%d')
    week_start_date_str = week_start_date.strftime('%Y-%m-%d')
    
    print(f"Week {i+1}/{len(date_ranges)}: Searching events for {week_start_date_str} to {week_end_date}...")
    
    try:
        # Search GDELT for the current week using a simple date range
        # Using 'gkg' table for Global Knowledge Graph data
        results_df = gd.Search(
            date=[week_start_date_str, week_end_date], 
            table='gkg',
            output='df'
        )
        
        if not results_df.empty:
            # Filter for economic/financial themes if the Themes column exists
            if 'Themes' in results_df.columns:
                # Filter for rows containing economic/financial themes
                economic_filter = results_df['Themes'].str.contains(
                    'ECON_STOCKMARKET|FINANCIAL|CRISIS|UNREST|MARKET|ECONOMY', 
                    case=False, 
                    na=False
                )
                results_df = results_df[economic_filter]
            
            if not results_df.empty:
                print(f"  > Found {len(results_df)} relevant events.")
                all_results.append(results_df)
            else:
                print("  > No relevant economic events found for this week.")
        else:
            print("  > No events found for this week.")

    except Exception as e:
        print(f"  > An error occurred for this week: {e}")
        # Continue with next week even if this one fails
        continue
        
    # Wait 2 seconds to be respectful to the GDELT servers
    time.sleep(2)

# --- Combine, Clean, and Save Results ---
if all_results:
    # Combine all the weekly DataFrames into one large DataFrame
    final_df = pd.concat(all_results, ignore_index=True)
    
    # Let's select a few of the most useful columns to keep the file size manageable
    columns_to_keep = [
        'DATE',
        'SourceCommonName',
        'DocumentIdentifier',
        'V2Tone', # This is very useful - it includes sentiment scores
        'Themes',
        'Locations'
    ]
    
    # Check which columns actually exist in the data
    available_columns = [col for col in columns_to_keep if col in final_df.columns]
    print(f"Available columns: {available_columns}")
    
    # Filter for only the columns we need that actually exist
    final_df_cleaned = final_df[available_columns]
    
    # Save the final DataFrame to a CSV file
    final_df_cleaned.to_csv(output_filename, index=False)
    
    print(f"\nSuccessfully collected and saved {len(final_df_cleaned)} total events to {output_filename}")
    print(f"Sample of first few rows:")
    print(final_df_cleaned.head())
else:
    print("\nNo events were collected overall. Try adjusting your date range or check the API status.")

print("\n--- GDELT crisis event collection complete! ---")

here
Starting GDELT data collection from 2024-01-01 to 2024-01-07...
Searching 1 weekly periods...
Note: This is a test run with a smaller date range to avoid API limitations
Week 1/1: Searching events for 2024-01-07 to 2024-01-13...
Starting GDELT data collection from 2024-01-01 to 2024-01-07...
Searching 1 weekly periods...
Note: This is a test run with a smaller date range to avoid API limitations
Week 1/1: Searching events for 2024-01-07 to 2024-01-13...
  > Found 7300 relevant events.
  > Found 7300 relevant events.
Available columns: ['DATE', 'SourceCommonName', 'DocumentIdentifier', 'V2Tone', 'Themes', 'Locations']

Successfully collected and saved 7300 total events to data\gdelt_crisis_events_test.csv
Sample of first few rows:
             DATE            SourceCommonName  \
0  20240107234500                    wfmj.com   
1  20240107234500           politicalwire.com   
2  20240107234500  northerndailyleader.com.au   
3  20240107234500       idahostatejournal.com   
4  2024010

## Advanced GDELT Data Filtering for Financial Research üéØ

In [2]:
import pandas as pd
import numpy as np

# Read the collected GDELT data
df = pd.read_csv('data/gdelt_crisis_events_test.csv')

print(f"Original dataset size: {len(df)} events")
print(f"Date range: {df['DATE'].min()} to {df['DATE'].max()}")

# --- FINANCIAL-SPECIFIC THEME FILTERING ---
# Define highly relevant financial/economic themes for forecasting
high_priority_themes = [
    'ECON_STOCKMARKET',           # Direct stock market events
    'ECON_INFLATION',             # Inflation events
    'ECON_RECESSION',             # Recession indicators
    'ECON_GDP',                   # GDP-related events
    'ECON_UNEMPLOYMENT',          # Employment data
    'ECON_INTEREST_RATE',         # Interest rate changes
    'ECON_CURRENCY',              # Currency events
    'WB_.*FINANCIAL.*',           # World Bank financial themes
    'WB_.*ECONOMIC.*',            # World Bank economic themes
    'WB_.*MONETARY.*',            # Monetary policy
    'EPU_POLICY',                 # Economic Policy Uncertainty
    'EPU_ECONOMY',                # Economic uncertainty
    'CRISISLEX.*FINANCIAL',       # Financial crisis lexicon
]

medium_priority_themes = [
    'ECON_DEBT',                  # Debt-related events
    'ECON_TRADE',                 # Trade events
    'ECON_DEVELOPMENTORGS',       # Development organizations
    'WB_.*BUSINESS.*',            # Business environment
    'WB_.*INVESTMENT.*',          # Investment climate
    'WB_.*BANKING.*',             # Banking sector
    'MANMADE_DISASTER.*FINANCIAL', # Financial disasters
    'AFFECT',                     # Market sentiment
]

# Create pattern strings for filtering
high_priority_pattern = '|'.join(high_priority_themes)
medium_priority_pattern = '|'.join(medium_priority_themes)

# Apply filtering
high_priority_filter = df['Themes'].str.contains(high_priority_pattern, case=False, na=False, regex=True)
medium_priority_filter = df['Themes'].str.contains(medium_priority_pattern, case=False, na=False, regex=True)

# Combine filters
financial_filter = high_priority_filter | medium_priority_filter

# Apply the filter
financial_df = df[financial_filter].copy()

print(f"\nAfter financial theme filtering: {len(financial_df)} events ({len(financial_df)/len(df)*100:.1f}% of original)")

# --- SENTIMENT SCORING ---
# Parse the V2Tone column (format: "tone1,tone2,tone3,tone4,tone5,tone6,wordcount")
def parse_v2tone(v2tone_str):
    """Parse V2Tone string into individual sentiment components"""
    try:
        parts = str(v2tone_str).split(',')
        if len(parts) >= 7:
            return {
                'tone_avg': float(parts[0]),           # Average tone (-100 to +100)
                'tone_positive': float(parts[1]),      # Positive sentiment score
                'tone_negative': float(parts[2]),      # Negative sentiment score  
                'tone_polarity': float(parts[3]),      # Polarity (how extreme)
                'tone_activity': float(parts[4]),      # Activity reference density
                'tone_self_group': float(parts[5]),    # Self/group reference density
                'word_count': int(parts[6])            # Total words in document
            }
    except:
        pass
    return None

# Apply sentiment parsing
sentiment_data = financial_df['V2Tone'].apply(parse_v2tone)
sentiment_df = pd.json_normalize(sentiment_data.dropna())

# Add sentiment scores back to financial_df
valid_indices = sentiment_data.dropna().index
for col in sentiment_df.columns:
    financial_df.loc[valid_indices, col] = sentiment_df[col].values

print(f"Successfully parsed sentiment for {len(sentiment_df)} events")

# --- LOCATION FILTERING ---
# Focus on major financial centers and markets
major_financial_centers = [
    'New York', 'London', 'Tokyo', 'Hong Kong', 'Singapore', 'Frankfurt', 
    'Zurich', 'Toronto', 'Sydney', 'Paris', 'Milan', 'Amsterdam',
    'United States', 'United Kingdom', 'Germany', 'Japan', 'China',
    'Switzerland', 'Canada', 'Australia', 'France', 'Italy'
]

# Create location filter
location_pattern = '|'.join(major_financial_centers)
major_market_filter = financial_df['Locations'].str.contains(location_pattern, case=False, na=False)

# Split data into different priority levels
high_impact_events = financial_df[high_priority_filter & major_market_filter].copy()
medium_impact_events = financial_df[medium_priority_filter & major_market_filter].copy()
other_financial_events = financial_df[~major_market_filter].copy()

print(f"\nEvent categorization:")
print(f"High impact (key themes + major markets): {len(high_impact_events)} events")
print(f"Medium impact (other themes + major markets): {len(medium_impact_events)} events")
print(f"Other financial events: {len(other_financial_events)} events")

# --- FINAL CLEANED DATASET ---
# Focus on high and medium impact events for forecasting
forecasting_dataset = pd.concat([high_impact_events, medium_impact_events], ignore_index=True)

# Clean up columns and add derived features
forecasting_dataset['date_parsed'] = pd.to_datetime(forecasting_dataset['DATE'], format='%Y%m%d%H%M%S', errors='coerce')
forecasting_dataset['date_only'] = forecasting_dataset['date_parsed'].dt.date

# Save the filtered dataset
output_path = 'data/gdelt_financial_filtered.csv'
forecasting_dataset.to_csv(output_path, index=False)

print(f"\n--- FILTERING COMPLETE ---")
print(f"Final dataset for forecasting: {len(forecasting_dataset)} events")
print(f"Saved to: {output_path}")

# Show sample of the most relevant events
print(f"\nSample of high-impact events:")
if len(high_impact_events) > 0:
    sample_events = high_impact_events.head(3)
    for idx, row in sample_events.iterrows():
        print(f"\nDate: {row['DATE']}")
        print(f"Source: {row['SourceCommonName']}")
        print(f"Avg Tone: {row.get('tone_avg', 'N/A')}")
        print(f"Key Themes: {row['Themes'][:200]}...")
        print(f"Locations: {row['Locations'][:100]}...")
else:
    print("No high-impact events found in this sample")

Original dataset size: 7300 events
Date range: 20240107234500 to 20240113234500

After financial theme filtering: 4964 events (68.0% of original)
Successfully parsed sentiment for 4964 events

Event categorization:
High impact (key themes + major markets): 3323 events
Medium impact (other themes + major markets): 1463 events
Other financial events: 1246 events


  high_impact_events = financial_df[high_priority_filter & major_market_filter].copy()
  medium_impact_events = financial_df[medium_priority_filter & major_market_filter].copy()



--- FILTERING COMPLETE ---
Final dataset for forecasting: 4786 events
Saved to: data/gdelt_financial_filtered.csv

Sample of high-impact events:

Date: 20240107234500
Source: northerndailyleader.com.au
Avg Tone: 1.99335548172757
Key Themes: USPEC_POLICY1;EPU_POLICY;EPU_POLICY_SPENDING;UNGP_FORESTS_RIVERS_OCEANS;AFFECT;TAX_FNCACT;TAX_FNCACT_CHILD;USPEC_POLITICS_GENERAL1;IDEOLOGY;ECON_DEBT;WB_1104_MACROECONOMIC_VULNERABILITY_AND_DEBT;WB_45...
Locations: 4#Gleneagle, Queensland, Australia#AS#AS04#-27.9333#152.983#-1575354...

Date: 20240107234500
Source: idahostatejournal.com
Avg Tone: -1.31233595800525
Key Themes: TAX_FNCACT;TAX_FNCACT_OFFICIALS;TAX_FNCACT_FEDERAL_OFFICIALS;CRISISLEX_C04_LOGISTICS_TRANSPORT;MANMADE_DISASTER_IMPLIED;WB_1921_PRIVATE_SECTOR_DEVELOPMENT;WB_405_BUSINESS_CLIMATE;WB_2531_INSPECTIONS_L...
Locations: 2#New York, United States#US#USNY#42.1497#-74.9384#NY;1#South Korea#KS#KS#37#127.5#KS;3#Portland, Or...

Date: 20240107234500
Source: times-news.com
Avg Tone: -2.23

In [3]:
# --- EXPLORE AVAILABLE THEMES ---
# Let's see what financial/economic themes are actually in your data

print("=== THEME ANALYSIS ===")

# Get all unique themes from the dataset
all_themes = []
for themes_str in df['Themes'].dropna():
    themes_list = str(themes_str).split(';')
    all_themes.extend(themes_list)

# Count theme frequency
from collections import Counter
theme_counts = Counter(all_themes)

# Find financial/economic related themes
financial_keywords = ['ECON', 'FINANCIAL', 'BUSINESS', 'MARKET', 'MONEY', 'BANK', 'INVEST', 'TRADE', 'ECONOMY', 'GDP', 'INFLATION', 'DEBT', 'STOCK']

print("Financial/Economic themes found in your data:")
print("-" * 50)

financial_themes_found = {}
for theme, count in theme_counts.most_common():
    for keyword in financial_keywords:
        if keyword in theme.upper():
            financial_themes_found[theme] = count
            break

# Display financial themes sorted by frequency
for theme, count in sorted(financial_themes_found.items(), key=lambda x: x[1], reverse=True):
    print(f"{theme:<50} : {count:>4} occurrences")

print(f"\nTotal unique financial themes found: {len(financial_themes_found)}")
print(f"Total financial theme mentions: {sum(financial_themes_found.values())}")

# Show top 10 most common themes overall (to understand the data better)
print(f"\n=== TOP 10 MOST COMMON THEMES OVERALL ===")
for theme, count in theme_counts.most_common(10):
    print(f"{theme:<50} : {count:>4} occurrences")

# Show some sentiment statistics
print(f"\n=== SENTIMENT ANALYSIS ===")
if 'tone_avg' in forecasting_dataset.columns:
    sentiment_stats = forecasting_dataset['tone_avg'].describe()
    print("Sentiment score statistics (tone_avg):")
    print(sentiment_stats)
    
    negative_events = len(forecasting_dataset[forecasting_dataset['tone_avg'] < -2])
    positive_events = len(forecasting_dataset[forecasting_dataset['tone_avg'] > 2])
    neutral_events = len(forecasting_dataset[abs(forecasting_dataset['tone_avg']) <= 2])
    
    print(f"\nSentiment breakdown:")
    print(f"Negative events (tone < -2): {negative_events}")
    print(f"Neutral events (-2 <= tone <= 2): {neutral_events}")
    print(f"Positive events (tone > 2): {positive_events}")
else:
    print("Sentiment parsing failed - check V2Tone column format")

=== THEME ANALYSIS ===
Financial/Economic themes found in your data:
--------------------------------------------------
EPU_ECONOMY_HISTORIC                               : 2136 occurrences
TAX_ECON_PRICE                                     : 1295 occurrences
WB_2025_INVESTIGATION                              :  746 occurrences
EPU_ECONOMY                                        :  682 occurrences
WB_698_TRADE                                       :  622 occurrences
WB_1920_FINANCIAL_SECTOR_DEVELOPMENT               :  494 occurrences
ECON_STOCKMARKET                                   :  452 occurrences
WB_1104_MACROECONOMIC_VULNERABILITY_AND_DEBT       :  411 occurrences
WB_1484_EDUCATION_SKILLS_DEVELOPMENT_AND_LABOR_MARKET :  411 occurrences
WB_405_BUSINESS_CLIMATE                            :  358 occurrences
WB_2530_BUSINESS_ENVIRONMENT                       :  352 occurrences
ECON_TAXATION                                      :  291 occurrences
WB_855_LABOR_MARKETS                 

## Ultra-Selective Financial Event Filtering üéØ‚ö°

In [None]:
# --- ULTRA-SELECTIVE FILTERING FOR FINANCIAL FORECASTING ---
# Target: 100-300 events per year (about 2-6 events per week)
# Strategy: Only the most market-moving events

import pandas as pd
import numpy as np

# Load the existing filtered data
df = pd.read_csv('data/gdelt_financial_filtered.csv')
print(f"Starting with {len(df)} events from previous filtering")

# --- TIER 1: HIGHEST IMPACT THEMES ONLY ---
# These are the themes most likely to directly impact stock markets
tier1_themes = [
    'ECON_STOCKMARKET',                    # Direct stock market mentions
    'ECON_INFLATION',                      # Inflation announcements
    'ECON_RECESSION',                      # Recession indicators
    'ECON_GDP',                           # GDP announcements
    'ECON_UNEMPLOYMENT',                  # Employment data releases
    'ECON_INTEREST_RATE',                 # Interest rate decisions
    'ECON_CURRENCY',                      # Currency crises
    'EPU_ECONOMY',                        # Major economic uncertainty
    'WB_1920_FINANCIAL_SECTOR_DEVELOPMENT', # Financial sector events
    'WB_439_MACROECONOMIC_AND_STRUCTURAL_POLICIES', # Major policy changes
]

# --- TIER 2: MAJOR FINANCIAL INSTITUTIONS & CENTRAL BANKS ---
# Events from these sources are typically market-moving
tier2_themes = [
    'WB_.*MONETARY.*',                    # Monetary policy
    'WB_.*CENTRAL_BANK.*',               # Central bank actions
    'WB_.*FINANCIAL_CRISIS.*',           # Financial crisis events
    'CRISISLEX.*FINANCIAL',              # Financial crisis lexicon
    'EPU_POLICY',                        # Major policy announcements
]

# --- SENTIMENT-BASED FILTERING ---
# Only events with extreme sentiment (very positive or very negative)
def is_extreme_sentiment(tone_avg):
    """Check if sentiment is extreme enough to potentially move markets"""
    try:
        tone = float(tone_avg)
        return abs(tone) >= 5.0  # Very strong positive or negative sentiment
    except:
        return False

# --- LOCATION-BASED FILTERING ---
# Only events in major financial centers
tier1_locations = [
    'New York', 'Manhattan', 'Wall Street',           # US Financial Center
    'London', 'City of London',                       # UK Financial Center  
    'Frankfurt', 'European Central Bank',             # EU Financial Center
    'Tokyo', 'Nikkei',                               # Japan Financial Center
    'Washington', 'Federal Reserve',                  # US Policy Center
    'Brussels', 'European Union',                     # EU Policy Center
]

# --- MAJOR NEWS SOURCES ONLY ---
# Focus on major financial news sources that markets actually follow
major_financial_sources = [
    'reuters', 'bloomberg', 'wsj', 'ft.com', 'cnbc', 'marketwatch', 
    'yahoo.com', 'cnn.com', 'bbc', 'ap.org', 'economist.com',
    'financial-times', 'wall-street-journal', 'associated-press'
]

# Apply Tier 1 filtering (most restrictive)
print("\n=== APPLYING ULTRA-SELECTIVE FILTERS ===")

# Filter 1: Tier 1 themes only
tier1_pattern = '|'.join(tier1_themes)
tier1_filter = df['Themes'].str.contains(tier1_pattern, case=False, na=False, regex=True)
tier1_events = df[tier1_filter].copy()
print(f"After Tier 1 themes: {len(tier1_events)} events")

# Filter 2: Add Tier 2 themes for broader coverage
tier2_pattern = '|'.join(tier2_themes) 
tier2_filter = df['Themes'].str.contains(tier2_pattern, case=False, na=False, regex=True)
tier1_and_2_events = df[tier1_filter | tier2_filter].copy()
print(f"After adding Tier 2 themes: {len(tier1_and_2_events)} events")

# Filter 3: Extreme sentiment only
extreme_sentiment_filter = tier1_and_2_events['tone_avg'].apply(is_extreme_sentiment)
high_impact_events = tier1_and_2_events[extreme_sentiment_filter].copy()
print(f"After extreme sentiment filtering: {len(high_impact_events)} events")

# Filter 4: Major financial centers only
location_pattern = '|'.join(tier1_locations)
location_filter = high_impact_events['Locations'].str.contains(location_pattern, case=False, na=False)
geographic_filtered = high_impact_events[location_filter].copy()
print(f"After major financial centers: {len(geographic_filtered)} events")

# Filter 5: Major news sources only
source_pattern = '|'.join(major_financial_sources)
source_filter = geographic_filtered['SourceCommonName'].str.contains(source_pattern, case=False, na=False)
source_filtered = geographic_filtered[source_filter].copy()
print(f"After major financial sources: {len(source_filtered)} events")

# --- FINAL SELECTION APPROACH ---
# If still too many events, use additional criteria

if len(source_filtered) > 50:  # Still too many for one week
    print(f"\nStill {len(source_filtered)} events - applying final selection criteria:")
    
    # Criteria 1: Word count (longer articles are typically more significant)
    source_filtered['word_count_score'] = source_filtered['word_count'] / source_filtered['word_count'].max()
    
    # Criteria 2: Sentiment extremity (more extreme = more important)
    source_filtered['sentiment_extremity'] = abs(source_filtered['tone_avg'])
    source_filtered['sentiment_score'] = source_filtered['sentiment_extremity'] / source_filtered['sentiment_extremity'].max()
    
    # Criteria 3: Theme relevance (count of financial themes)
    def count_financial_themes(themes_str):
        if pd.isna(themes_str):
            return 0
        count = 0
        for theme in tier1_themes + tier2_themes:
            if theme.replace('.*', '') in str(themes_str):
                count += 1
        return count
    
    source_filtered['theme_count'] = source_filtered['Themes'].apply(count_financial_themes)
    source_filtered['theme_score'] = source_filtered['theme_count'] / max(source_filtered['theme_count'].max(), 1)
    
    # Combined importance score
    source_filtered['importance_score'] = (
        0.4 * source_filtered['sentiment_score'] +      # 40% sentiment
        0.3 * source_filtered['theme_score'] +          # 30% theme relevance  
        0.3 * source_filtered['word_count_score']       # 30% article length
    )
    
    # Select top 20 most important events per week
    final_events = source_filtered.nlargest(20, 'importance_score')
    
else:
    final_events = source_filtered.copy()
    final_events['importance_score'] = 1.0  # All events are important

print(f"\n=== FINAL SELECTION ===")
print(f"Ultra-filtered events: {len(final_events)} events")
print(f"Projected yearly events: {len(final_events) * 52} events/year")

# --- SAVE ULTRA-FILTERED DATASET ---
# Clean up and save
final_columns = [
    'DATE', 'date_only', 'SourceCommonName', 'DocumentIdentifier',
    'tone_avg', 'tone_positive', 'tone_negative', 'tone_polarity',
    'Themes', 'Locations', 'word_count', 'importance_score'
]

ultra_filtered_df = final_events[final_columns].copy()
ultra_filtered_df.to_csv('data/gdelt_ultra_filtered.csv', index=False)

print(f"Saved ultra-filtered dataset to: data/gdelt_ultra_filtered.csv")

# --- SHOW SAMPLE RESULTS ---
print(f"\n=== SAMPLE OF MOST IMPORTANT EVENTS ===")
if len(final_events) > 0:
    top_events = final_events.nlargest(5, 'importance_score')
    for idx, row in top_events.iterrows():
        print(f"\nüìà Importance Score: {row['importance_score']:.3f}")
        print(f"üìÖ Date: {row['date_only']}")
        print(f"üì∞ Source: {row['SourceCommonName']}")
        print(f"üòä Sentiment: {row['tone_avg']:.1f}")
        print(f"üìù Word Count: {row['word_count']}")
        print(f"üîó URL: {row['DocumentIdentifier'][:80]}...")
        print(f"üè∑Ô∏è Key Themes: {str(row['Themes'])[:100]}...")
else:
    print("No events found with current criteria - consider relaxing filters")

print(f"\n‚úÖ ULTRA-FILTERING COMPLETE!")
print(f"üìä Target achieved: ~{len(final_events) * 52} events per year")

## USA-Focused Financial Event Filtering üá∫üá∏üìà

In [4]:
# --- USA-FOCUSED ULTRA-SELECTIVE FILTERING ---
# Target: 50-150 events per year for USA financial markets
# Strategy: Only US financial events that could impact S&P 500, NASDAQ, etc.

import pandas as pd
import numpy as np

# Load the existing filtered data
df = pd.read_csv('data/gdelt_financial_filtered.csv')
print(f"Starting with {len(df)} events from previous filtering")

# --- USA-SPECIFIC FILTERING ---
# Focus only on events within the United States
usa_locations = [
    'United States', 'New York', 'Manhattan', 'Wall Street', 'NYSE', 'NASDAQ',
    'Washington', 'Federal Reserve', 'Fed', 'Treasury', 'SEC', 
    'California', 'Silicon Valley', 'San Francisco', 'Los Angeles',
    'Chicago', 'Illinois', 'Boston', 'Massachusetts', 'Texas', 'Florida',
    'White House', 'Congress', 'Senate', 'House of Representatives'
]

# Create USA location filter
usa_pattern = '|'.join(usa_locations)
usa_filter = df['Locations'].str.contains(usa_pattern, case=False, na=False)
usa_events = df[usa_filter].copy()

print(f"After USA location filtering: {len(usa_events)} events")

# --- USA FINANCIAL THEMES (MOST CRITICAL) ---
# Focus on themes that directly impact US markets
usa_financial_themes = [
    'ECON_STOCKMARKET',                    # Stock market events
    'ECON_INFLATION',                      # US inflation data
    'ECON_RECESSION',                      # US recession indicators
    'ECON_GDP',                           # US GDP announcements
    'ECON_UNEMPLOYMENT',                  # US employment data
    'ECON_INTEREST_RATE',                 # Fed interest rates
    'EPU_ECONOMY',                        # US economic uncertainty
    'EPU_POLICY',                         # US policy changes
    'WB_1920_FINANCIAL_SECTOR_DEVELOPMENT', # Financial sector
    'WB_439_MACROECONOMIC_AND_STRUCTURAL_POLICIES', # Major US policies
]

# Apply theme filtering
usa_theme_pattern = '|'.join(usa_financial_themes)
usa_theme_filter = usa_events['Themes'].str.contains(usa_theme_pattern, case=False, na=False, regex=True)
usa_themed_events = usa_events[usa_theme_filter].copy()

print(f"After USA financial themes: {len(usa_themed_events)} events")

# --- US FINANCIAL NEWS SOURCES ---
# Focus on major US financial news sources
us_financial_sources = [
    'reuters.com', 'bloomberg', 'wsj', 'cnbc', 'marketwatch', 'yahoo.com',
    'cnn.com', 'foxnews.com', 'abc', 'nbc', 'cbs', 'ap.org', 'usatoday',
    'washingtonpost', 'nytimes', 'fortune', 'forbes', 'business-insider',
    'thestreet', 'seeking-alpha', 'barrons'
]

# Apply source filtering
us_source_pattern = '|'.join(us_financial_sources)
source_filter = usa_themed_events['SourceCommonName'].str.contains(us_source_pattern, case=False, na=False)
usa_source_events = usa_themed_events[source_filter].copy()

print(f"After US financial sources: {len(usa_source_events)} events")

# --- EXTREME SENTIMENT FOR US MARKETS ---
# Only events with very strong sentiment (market-moving potential)
def is_usa_market_moving_sentiment(tone_avg):
    """Check if sentiment is extreme enough to move US markets"""
    try:
        tone = float(tone_avg)
        return abs(tone) >= 7.0  # Very extreme sentiment for US markets
    except:
        return False

extreme_usa_sentiment = usa_source_events['tone_avg'].apply(is_usa_market_moving_sentiment)
usa_extreme_events = usa_source_events[extreme_usa_sentiment].copy()

print(f"After extreme sentiment (¬±7.0): {len(usa_extreme_events)} events")

# --- US MARKET HOURS & BUSINESS DAYS ---
# Focus on events that happen during or close to US market hours
usa_extreme_events['hour'] = pd.to_datetime(usa_extreme_events['DATE'], format='%Y%m%d%H%M%S').dt.hour
usa_extreme_events['weekday'] = pd.to_datetime(usa_extreme_events['DATE'], format='%Y%m%d%H%M%S').dt.weekday

# US market hours: 9:30 AM - 4:00 PM ET (roughly 14:30 - 21:00 UTC)
# Business days: Monday-Friday (0-4 in Python weekday)
market_hours_filter = (
    (usa_extreme_events['hour'].between(6, 23)) &  # Extended hours for global impact
    (usa_extreme_events['weekday'] < 5)  # Monday to Friday
)

usa_market_relevant = usa_extreme_events[market_hours_filter].copy()
print(f"After market hours/business days: {len(usa_market_relevant)} events")

# --- IMPORTANCE SCORING FOR USA EVENTS ---
if len(usa_market_relevant) > 20:  # Still need further filtering
    print(f"\nApplying final USA-specific importance scoring...")
    
    # USA-specific scoring criteria
    usa_market_relevant['usa_sentiment_score'] = abs(usa_market_relevant['tone_avg']) / 15.0  # Normalize to max 15
    usa_market_relevant['usa_word_score'] = usa_market_relevant['word_count'] / usa_market_relevant['word_count'].max()
    
    # Count USA-specific financial themes
    def count_usa_themes(themes_str):
        if pd.isna(themes_str):
            return 0
        count = 0
        for theme in usa_financial_themes:
            if theme in str(themes_str):
                count += 1
        return count
    
    usa_market_relevant['usa_theme_count'] = usa_market_relevant['Themes'].apply(count_usa_themes)
    usa_market_relevant['usa_theme_score'] = usa_market_relevant['usa_theme_count'] / max(usa_market_relevant['usa_theme_count'].max(), 1)
    
    # USA market importance score
    usa_market_relevant['usa_importance_score'] = (
        0.5 * usa_market_relevant['usa_sentiment_score'] +      # 50% sentiment (most important for markets)
        0.3 * usa_market_relevant['usa_theme_score'] +          # 30% theme relevance
        0.2 * usa_market_relevant['usa_word_score']             # 20% article significance
    )
    
    # Select top 10 most important USA events per week (targeting ~500/year)
    final_usa_events = usa_market_relevant.nlargest(10, 'usa_importance_score')
    
else:
    final_usa_events = usa_market_relevant.copy()
    final_usa_events['usa_importance_score'] = 1.0

print(f"\n=== USA FINANCIAL EVENTS FINAL SELECTION ===")
print(f"Ultra-filtered USA events: {len(final_usa_events)} events")
print(f"Projected yearly USA events: {len(final_usa_events) * 52} events/year")

# --- SAVE USA-FOCUSED DATASET ---
usa_columns = [
    'DATE', 'date_only', 'SourceCommonName', 'DocumentIdentifier',
    'tone_avg', 'tone_positive', 'tone_negative', 'tone_polarity',
    'Themes', 'Locations', 'word_count', 'usa_importance_score'
]

available_usa_columns = [col for col in usa_columns if col in final_usa_events.columns]
usa_final_df = final_usa_events[available_usa_columns].copy()
usa_final_df.to_csv('data/gdelt_usa_financial.csv', index=False)

print(f"Saved USA-focused dataset to: data/gdelt_usa_financial.csv")

# --- SHOW USA SAMPLE RESULTS ---
print(f"\n=== TOP USA FINANCIAL EVENTS ===")
if len(final_usa_events) > 0:
    for idx, row in final_usa_events.head(3).iterrows():
        print(f"\nüá∫üá∏ USA Importance Score: {row.get('usa_importance_score', 'N/A'):.3f}")
        print(f"üìÖ Date: {row['date_only']}")
        print(f"üì∞ Source: {row['SourceCommonName']}")
        print(f"üòä Sentiment: {row['tone_avg']:.1f}")
        print(f"üìù Word Count: {row['word_count']}")
        print(f"üîó URL: {row['DocumentIdentifier'][:80]}...")
        print(f"üè∑Ô∏è Key Themes: {str(row['Themes'])[:100]}...")
        print(f"üìç USA Locations: {str(row['Locations'])[:100]}...")
else:
    print("No USA events found - consider relaxing filters")

print(f"\n‚úÖ USA-FOCUSED FILTERING COMPLETE!")
print(f"üéØ Perfect for US market forecasting: ~{len(final_usa_events) * 52} events per year")
print(f"üìà Focus: S&P 500, NASDAQ, US economic indicators")

Starting with 4786 events from previous filtering
After USA location filtering: 3614 events
After USA financial themes: 3265 events
After US financial sources: 351 events
After extreme sentiment (¬±7.0): 23 events
After market hours/business days: 19 events

=== USA FINANCIAL EVENTS FINAL SELECTION ===
Ultra-filtered USA events: 19 events
Projected yearly USA events: 988 events/year
Saved USA-focused dataset to: data/gdelt_usa_financial.csv

=== TOP USA FINANCIAL EVENTS ===

üá∫üá∏ USA Importance Score: 1.000
üìÖ Date: 2024-01-09
üì∞ Source: yahoo.com
üòä Sentiment: -12.0
üìù Word Count: 305.0
üîó URL: https://news.yahoo.com/suspect-rape-12-old-culver-230028573.html...
üè∑Ô∏è Key Themes: TRIAL;RAPE;SOC_POINTSOFINTEREST;SOC_POINTSOFINTEREST_PRISON;WB_2495_DETENTION_PRISON_AND_CORRECTIONS...
üìç USA Locations: 3#Kern County, California, United States#US#USCA#35.2961#-118.668#2054176;3#Los Angeles County, Cali...

üá∫üá∏ USA Importance Score: 1.000
üìÖ Date: 2024-01-09
üì∞ So

## 10-Year USA Financial Data Collection üìÖüá∫üá∏

In [None]:
# --- 10-YEAR USA FINANCIAL DATA COLLECTION ---
# Systematic collection of GDELT data from 2015-2025 for USA financial events
# Strategy: Collect weekly, apply USA filtering immediately to manage data volume

import gdelt
import pandas as pd
import os
import time
from datetime import datetime, timedelta

# --- Configuration ---
gd = gdelt.gdelt(version=2)

# 10-year date range for your research
start_date = "2015-03-01"
end_date = "2025-07-01"  # 10 years of data

# Output paths
output_folder = "data"
raw_output = os.path.join(output_folder, "gdelt_usa_10year_raw.csv")
final_output = os.path.join(output_folder, "gdelt_usa_10year_filtered.csv")

# --- USA FILTERING CRITERIA (Pre-defined for efficiency) ---
usa_locations = [
    'United States', 'New York', 'Manhattan', 'Wall Street', 'NYSE', 'NASDAQ',
    'Washington', 'Federal Reserve', 'Fed', 'Treasury', 'SEC', 
    'California', 'Silicon Valley', 'San Francisco', 'Los Angeles',
    'Chicago', 'Illinois', 'Boston', 'Massachusetts', 'Texas', 'Florida',
    'White House', 'Congress', 'Senate', 'House of Representatives'
]

usa_financial_themes = [
    'ECON_STOCKMARKET', 'ECON_INFLATION', 'ECON_RECESSION', 'ECON_GDP',
    'ECON_UNEMPLOYMENT', 'ECON_INTEREST_RATE', 'EPU_ECONOMY', 'EPU_POLICY',
    'WB_1920_FINANCIAL_SECTOR_DEVELOPMENT', 'WB_439_MACROECONOMIC_AND_STRUCTURAL_POLICIES'
]

us_financial_sources = [
    'reuters', 'bloomberg', 'wsj', 'cnbc', 'marketwatch', 'yahoo',
    'cnn', 'foxnews', 'abc', 'nbc', 'cbs', 'ap.org', 'usatoday',
    'washingtonpost', 'nytimes', 'fortune', 'forbes'
]

# Create filter patterns
usa_location_pattern = '|'.join(usa_locations)
usa_theme_pattern = '|'.join(usa_financial_themes)
usa_source_pattern = '|'.join(us_financial_sources)

print(f"üá∫üá∏ Starting 10-year USA financial data collection...")
print(f"üìÖ Period: {start_date} to {end_date}")
print(f"‚è±Ô∏è This will take several hours - progress will be saved periodically")

# --- Generate Monthly Date Ranges ---
# Using monthly periods to balance API limits with reasonable collection speed
monthly_ranges = pd.date_range(start=start_date, end=end_date, freq='MS')

print(f"üìä Total periods to collect: {len(monthly_ranges)} months")

# --- Collection with Real-time Filtering ---
all_usa_events = []
collection_stats = {
    'total_periods': len(monthly_ranges),
    'completed_periods': 0,
    'total_raw_events': 0,
    'total_usa_events': 0,
    'failed_periods': 0
}

# Resume capability - check if we have partial data
resume_from = 0
if os.path.exists(raw_output):
    print("üìÇ Found existing data file - checking for resume point...")
    try:
        existing_df = pd.read_csv(raw_output)
        if not existing_df.empty:
            last_date = existing_df['DATE'].max()
            last_date_parsed = pd.to_datetime(str(last_date), format='%Y%m%d%H%M%S')
            # Find where to resume
            for i, period_start in enumerate(monthly_ranges):
                if period_start > last_date_parsed:
                    resume_from = i
                    break
            print(f"üîÑ Resuming from period {resume_from + 1}/{len(monthly_ranges)}")
            all_usa_events.append(existing_df)
    except Exception as e:
        print(f"‚ö†Ô∏è Could not resume from existing file: {e}")
        resume_from = 0

# --- Main Collection Loop ---
for i, month_start in enumerate(monthly_ranges[resume_from:], start=resume_from):
    # Calculate period end (end of month)
    if month_start.month == 12:
        month_end = month_start.replace(year=month_start.year + 1, month=1, day=1) - timedelta(days=1)
    else:
        month_end = month_start.replace(month=month_start.month + 1, day=1) - timedelta(days=1)
    
    month_start_str = month_start.strftime('%Y-%m-%d')
    month_end_str = month_end.strftime('%Y-%m-%d')
    
    print(f"\nüìÖ Period {i+1}/{len(monthly_ranges)}: {month_start_str} to {month_end_str}")
    
    try:
        # Search GDELT for the current month
        results_df = gd.Search(
            date=[month_start_str, month_end_str],
            table='gkg',
            output='df'
        )
        
        if not results_df.empty:
            raw_count = len(results_df)
            collection_stats['total_raw_events'] += raw_count
            
            # Apply USA location filter first (most restrictive)
            usa_location_filter = results_df['Locations'].str.contains(
                usa_location_pattern, case=False, na=False
            )
            usa_located = results_df[usa_location_filter].copy()
            
            if not usa_located.empty:
                # Apply USA financial theme filter
                usa_theme_filter = usa_located['Themes'].str.contains(
                    usa_theme_pattern, case=False, na=False, regex=True
                )
                usa_themed = usa_located[usa_theme_filter].copy()
                
                if not usa_themed.empty:
                    # Apply USA source filter
                    usa_source_filter = usa_themed['SourceCommonName'].str.contains(
                        usa_source_pattern, case=False, na=False
                    )
                    usa_final = usa_themed[usa_source_filter].copy()
                    
                    if not usa_final.empty:
                        usa_count = len(usa_final)
                        collection_stats['total_usa_events'] += usa_count
                        all_usa_events.append(usa_final)
                        
                        print(f"  ‚úÖ Raw: {raw_count} ‚Üí USA Financial: {usa_count} events")
                    else:
                        print(f"  üì∞ No events after USA source filtering")
                else:
                    print(f"  üè∑Ô∏è No events after USA theme filtering")
            else:
                print(f"  üìç No USA located events")
        else:
            print(f"  ‚ùå No events found for this period")
            
    except Exception as e:
        print(f"  ‚ö†Ô∏è Error collecting data for this period: {e}")
        collection_stats['failed_periods'] += 1
        continue
    
    collection_stats['completed_periods'] += 1
    
    # Save progress every 12 months
    if (i + 1) % 12 == 0 and all_usa_events:
        print(f"\nüíæ Saving progress at {month_start.year}...")
        temp_df = pd.concat(all_usa_events, ignore_index=True)
        temp_df.to_csv(raw_output, index=False)
        print(f"   Saved {len(temp_df)} total USA events so far")
    
    # Be respectful to GDELT servers
    time.sleep(3)  # 3 seconds between requests for monthly data
    
    # Progress update every 6 months
    if (i + 1) % 6 == 0:
        print(f"\nüìä Progress Update:")
        print(f"   Completed: {collection_stats['completed_periods']}/{collection_stats['total_periods']} periods")
        print(f"   USA Events Collected: {collection_stats['total_usa_events']}")
        print(f"   Failed Periods: {collection_stats['failed_periods']}")

# --- Final Processing ---
print(f"\nüèÅ Collection Complete!")
print(f"üìà Final Statistics:")
for key, value in collection_stats.items():
    print(f"   {key}: {value}")

if all_usa_events:
    # Combine all collected data
    final_usa_df = pd.concat(all_usa_events, ignore_index=True)
    
    # Remove duplicates (in case of overlapping periods)
    final_usa_df = final_usa_df.drop_duplicates(subset=['DATE', 'DocumentIdentifier'])
    
    # Parse dates and add time features
    final_usa_df['date_parsed'] = pd.to_datetime(final_usa_df['DATE'], format='%Y%m%d%H%M%S', errors='coerce')
    final_usa_df['date_only'] = final_usa_df['date_parsed'].dt.date
    final_usa_df['year'] = final_usa_df['date_parsed'].dt.year
    final_usa_df['month'] = final_usa_df['date_parsed'].dt.month
    final_usa_df['weekday'] = final_usa_df['date_parsed'].dt.weekday
    
    # Parse sentiment scores
    def parse_v2tone_quick(v2tone_str):
        try:
            parts = str(v2tone_str).split(',')
            if len(parts) >= 7:
                return {
                    'tone_avg': float(parts[0]),
                    'tone_positive': float(parts[1]),
                    'tone_negative': float(parts[2]),
                    'word_count': int(parts[6])
                }
        except:
            pass
        return {'tone_avg': 0, 'tone_positive': 0, 'tone_negative': 0, 'word_count': 0}
    
    sentiment_data = final_usa_df['V2Tone'].apply(parse_v2tone_quick)
    sentiment_df = pd.json_normalize(sentiment_data)
    
    for col in sentiment_df.columns:
        final_usa_df[col] = sentiment_df[col].values
    
    # Save raw collected data
    final_usa_df.to_csv(raw_output, index=False)
    print(f"\nüíæ Saved raw 10-year data: {raw_output}")
    print(f"üìä Total events collected: {len(final_usa_df)}")
    
    # Show yearly breakdown
    yearly_counts = final_usa_df.groupby('year').size()
    print(f"\nüìÖ Yearly Distribution:")
    for year, count in yearly_counts.items():
        print(f"   {year}: {count} events")
    
    # Show average events per year
    avg_per_year = len(final_usa_df) / len(yearly_counts)
    print(f"\nüìà Average events per year: {avg_per_year:.0f}")
    
    print(f"\n‚úÖ 10-YEAR USA FINANCIAL DATA COLLECTION COMPLETE!")
    print(f"üéØ Ready for advanced filtering and forecasting model development")
    
else:
    print(f"\n‚ùå No data collected - check your filters and API connectivity")

üá∫üá∏ Starting 10-year USA financial data collection...
üìÖ Period: 2015-01-01 to 2025-07-01
‚è±Ô∏è This will take several hours - progress will be saved periodically
üìä Total periods to collect: 127 months

üìÖ Period 1/127: 2015-01-01 to 2015-01-31
  ‚ö†Ô∏è Error collecting data for this period: GDELT 2.0 only supports 'Feb 18 2015 - Present'queries currently. Try another date.

üìÖ Period 2/127: 2015-02-01 to 2015-02-28
  ‚ö†Ô∏è Error collecting data for this period: GDELT 2.0 only supports 'Feb 18 2015 - Present'queries currently. Try another date.

üìÖ Period 3/127: 2015-03-01 to 2015-03-31
