# In [None]:  (notebook cell marker left over from the Jupyter export)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import fredapi as fred
from gdelt import gdelt

def get_industrial_production(FRED_API_KEY):
    """
    Download the FRED series 'INDPRO' as a one-column DataFrame.

    Parameters:
        FRED_API_KEY: API key passed to ``fred.Fred(api_key=...)``.

    Returns:
        A DataFrame with a single column named 'Industrial_Production'
        whose index is named 'DATE', or None if anything goes wrong while
        creating the client or fetching/reshaping the series (the error
        is printed, not raised).
    """
    try:
        client = fred.Fred(api_key=FRED_API_KEY)
        # 'INDPRO' is the series identifier requested from FRED
        frame = client.get_series('INDPRO').to_frame(name='Industrial_Production')
        frame.index.name = 'DATE'
    except Exception as e:
        # Best-effort fetch: report the problem and let the caller handle None
        print(f"Error fetching industrial production data: {e}")
        return None
    else:
        return frame

def get_gdelt_events(start_date='2023-01-01', end_date='2023-01-31'):
    """
    Retrieve GDELT events for a date range and keep only US rows.

    Parameters:
        start_date: First day of the search window ('YYYY-MM-DD' string).
        end_date: Last day of the search window ('YYYY-MM-DD' string).
            The defaults cover a single month to keep the number of
            downloaded files small; widen the range with care.

    Returns:
        A DataFrame restricted to rows whose 'ActionGeo_CountryCode' is
        'US' and to whichever of the key columns are present in the
        search result. An empty DataFrame is returned when the search
        yields nothing or when any error occurs (the error is printed,
        not raised).
    """
    # Columns worth keeping for downstream aggregation
    key_columns = [
        'GLOBALEVENTID',
        'SQLDATE',
        'Actor1Name',
        'Actor2Name',
        'EventRootCode',
        'EventBaseCode',
        'ActionGeo_CountryCode',
        'AvgTone',
    ]

    try:
        # Create GDELT object
        gd = gdelt()

        # Honour the caller-supplied window. Previously the arguments were
        # overwritten with hard-coded dates, so they had no effect.
        events = gd.Search(
            date=[start_date, end_date],
            coverage=True,
            table='events'
        )

        if events.empty:
            print("No events found matching the criteria")
            return pd.DataFrame()

        # Filter for US events
        us_events = events[events['ActionGeo_CountryCode'] == 'US']

        # Keep only the key columns that actually exist in the result
        us_events = us_events[[col for col in key_columns if col in us_events.columns]]

        print(f"Retrieved {len(us_events)} US events")
        return us_events

    except Exception as e:
        # Deliberately best-effort: any download/parsing failure degrades
        # to an empty result that callers check with `.empty`.
        print(f"Error retrieving GDELT events: {e}")
        return pd.DataFrame()

def align_gdelt_with_industrial_production(events_df, ip_data):
    """
    Align monthly GDELT event counts with the Industrial Production Index.

    Events are counted per calendar month and per 'EventRootCode' (one
    output column per root code), then inner-joined with the industrial
    production rows of the same month. Both sides are keyed on the first
    day of the month so that month-start dated series (as FRED monthly
    series are usually dated -- confirm for other sources) match the
    event buckets; the previous month-end bucketing never matched them.

    Neither input is modified.

    Parameters:
        events_df: DataFrame with at least 'SQLDATE' (YYYYMMDD) and
            'EventRootCode' columns.
        ip_data: DataFrame with an 'Industrial_Production' column indexed
            by date (DatetimeIndex or anything ``pd.to_datetime`` accepts).
            Expected to hold at most one row per month; several rows in
            one month would each be joined to that month's event counts.

    Returns:
        The merged DataFrame with a 'DATE' column (month start), one count
        column per event root code and the industrial production columns;
        or None when there is no overlap or an error occurs (the error is
        printed, not raised).
    """
    try:
        # Month-start timestamp for every event, computed without touching
        # the caller's frame.
        event_dates = pd.to_datetime(events_df['SQLDATE'], format='%Y%m%d')
        event_months = event_dates.dt.to_period('M').dt.to_timestamp().rename('DATE')

        # Count events per (month, root code) and pivot the codes to columns
        monthly_events = (
            events_df.groupby([event_months, 'EventRootCode'])
            .size()
            .unstack(fill_value=0)
            .reset_index()
        )

        # Parse the production index into datetimes on a local copy only
        ip_index = ip_data.index
        if not isinstance(ip_index, pd.DatetimeIndex):
            ip_index = pd.to_datetime(ip_index)

        # Filter Industrial Production data to recent years (last 20 years)
        cutoff = pd.Timestamp.now() - pd.DateOffset(years=20)
        recent_mask = ip_index >= cutoff
        recent_ip_data = ip_data[recent_mask].copy()

        # Key the production rows on month start as well, so the join is
        # insensitive to month-start vs. month-end dating of the series.
        recent_ip_data.index = ip_index[recent_mask].to_period('M').to_timestamp()
        recent_ip_data.index.name = 'DATE'

        # Align with Industrial Production data
        aligned_data = pd.merge(
            monthly_events,
            recent_ip_data.reset_index(),
            on='DATE',
            how='inner'
        )

        # Debug print
        print("Aligned data shape:", aligned_data.shape)
        print("Columns in aligned data:", aligned_data.columns)

        # Ensure we have data
        if aligned_data.empty:
            print("No matching data found between events and industrial production")
            return None

        return aligned_data

    except Exception as e:
        # Callers treat None as "alignment failed"
        print(f"Error in data alignment: {e}")
        return None

def train_prediction_model(ip_data, events_data):
    """
    Fit a random-forest regressor that predicts industrial production
    from the aligned monthly event counts, and score it on a hold-out set.

    Parameters:
        ip_data: Industrial production frame, forwarded to
            align_gdelt_with_industrial_production.
        events_data: GDELT events frame, forwarded to
            align_gdelt_with_industrial_production.

    Returns:
        (metrics, feature_importances) where metrics is a dict with keys
        'MSE', 'R2' and 'Predictions' (computed on the 20% test split) and
        feature_importances is a DataFrame sorted by descending importance;
        or (None, None) when alignment fails or fewer than two rows remain.
    """
    merged = align_gdelt_with_industrial_production(events_data, ip_data)

    # Guard: nothing to model without aligned rows
    if merged is None or merged.empty:
        print("Could not align data for modeling")
        return None, None

    # Features are every numeric column except the date key and the target
    excluded = ('DATE', 'Industrial_Production')
    feature_names = [
        name
        for name in merged.select_dtypes(include=[np.number]).columns
        if name not in excluded
    ]
    features = merged[feature_names]
    target = merged['Industrial_Production']

    # Guard: a split needs at least two rows
    if len(features) < 2:
        print("Not enough samples for train_test_split")
        return None, None

    # Fixed seeds keep both the split and the forest reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    forest = RandomForestRegressor(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)
    predicted = forest.predict(X_test)

    metrics = {
        'MSE': mean_squared_error(y_test, predicted),
        'R2': r2_score(y_test, predicted),
        'Predictions': predicted,
    }

    importance_table = pd.DataFrame({
        'feature': features.columns,
        'importance': forest.feature_importances_,
    })
    ranked_importances = importance_table.sort_values('importance', ascending=False)

    return metrics, ranked_importances


# The FRED API key is assigned inside main(); avoid committing real credentials to source control.

def main():
    """
    Run the pipeline end to end: fetch industrial production from FRED,
    fetch GDELT events, train the model and print its metrics.

    Prints a failure message and returns early when either data source
    yields nothing, instead of dereferencing a missing result.
    """
    import os  # local import so this block does not depend on file-level changes

    # Prefer a key supplied through the FRED_API_KEY environment variable.
    # NOTE(review): the literal fallback is a hard-coded credential kept only
    # for backward compatibility -- rotate it and remove it from source.
    FRED_API_KEY = os.environ.get('FRED_API_KEY') or 'fe3052cbf69bad0030a2c6183efd12e0'

    print("Starting data retrieval...")

    # Fetch data
    ip_data = get_industrial_production(FRED_API_KEY)
    if ip_data is None:
        # get_industrial_production returns None on failure; bail out before
        # touching .head()/.shape and before starting the GDELT download.
        print("Failed to retrieve data. Please check your API keys and data sources.")
        return
    print("Industrial Production Data:")
    print(ip_data.head())
    print(f"Industrial Production Data Shape: {ip_data.shape}")

    # Retrieve events data
    print("Retrieving GDELT events...")
    events_data = get_gdelt_events()
    print("GDELT Events Data:")
    print(events_data.head())
    print(f"GDELT Events Data Shape: {events_data.shape}")

    # get_gdelt_events returns an empty frame on failure
    if events_data.empty:
        print("Failed to retrieve data. Please check your API keys and data sources.")
        return

    print("Attempting to train prediction model...")

    # Train models and get results
    results, feature_importances = train_prediction_model(ip_data, events_data)

    # Check if modeling was successful
    if results is None or feature_importances is None:
        print("Model training failed. Check data alignment.")
        return

    # Print results
    print("\n--- Model Performance ---")
    print(f"Mean Squared Error: {results['MSE']}")
    print(f"R-squared Score: {results['R2']}")

    print("\n--- Feature Importances ---")
    print(feature_importances)

# Run the pipeline only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()
