# 📈 Predicting US Industrial Production Index Using GDELT News Data
This notebook demonstrates how to retrieve and process economic news data from the GDELT database, and use it to predict the Industrial Production Index (IPI) from the Federal Reserve.


In [None]:
# Install necessary packages (if not already installed)
!pip install pandas pandas_datareader pandas-gbq google-cloud-bigquery openai scikit-learn matplotlib


In [None]:
import pandas as pd
from pandas_datareader import data as web
from datetime import datetime
import matplotlib.pyplot as plt

# Pull the Industrial Production Index (FRED series INDPRO) for the study
# window and collapse it to one mean value per calendar month.
start = datetime(2005, 1, 1)
end = datetime(2025, 1, 1)
ipi = (
    web.DataReader('INDPRO', 'fred', start=start, end=end)
    .resample('M')
    .mean()
)

# Quick visual sanity check of the target series.
ipi.plot(figsize=(10, 4), title='US Industrial Production Index (INDPRO)')
plt.show()

## 📰 Load GDELT News Data from BigQuery
You will need a Google Cloud project with the BigQuery API enabled, plus a service account key file (JSON) for an account that is allowed to run BigQuery jobs in that project.

In [None]:
from google.cloud import bigquery
from google.oauth2 import service_account

# Replace with your service account path and project ID
credentials = service_account.Credentials.from_service_account_file("your-service-account.json")
project_id = "your-gcp-project-id"
client = bigquery.Client(credentials=credentials, project=project_id)

# Monthly tone and volume of economy-themed articles from the GDELT Global
# Knowledge Graph (GKG).
#
# Why the query looks the way it does:
# - GKG has no AvgTone / GoldsteinScale / NumArticles columns (those belong to
#   the GDELT *events* tables). Article tone is the first comma-separated
#   field of V2Tone, and article volume is simply the number of GKG rows.
#   GoldsteinScale has no GKG equivalent, so it is not produced here.
# - GKG's DATE column is an INTEGER in YYYYMMDDHHMMSS form, so the bounds are
#   integer literals and DIV(DATE, 100000000) yields YYYYMM; "* 100 + 1" turns
#   that into the YYYYMMDD of the first day of the month.
# - Aggregating inside BigQuery returns one row per month instead of
#   downloading every matching article.
# NOTE(review): GDELT 2.0 GKG coverage is believed to start in Feb 2015, so
#   earlier months in the requested window will simply be absent - confirm.
# NOTE(review): V2Themes / V2Tone are large columns; check the bytes the query
#   will scan (e.g. with a dry run) before executing it against a billed project.
query = '''
SELECT
  DIV(DATE, 100000000) * 100 + 1 AS month_start,
  AVG(SAFE_CAST(SPLIT(V2Tone, ',')[SAFE_OFFSET(0)] AS FLOAT64)) AS AvgTone,
  COUNT(*) AS NumArticles
FROM `gdelt-bq.gdeltv2.gkg`
WHERE DATE >= 20050101000000 AND DATE < 20250101000000
  AND V2Themes LIKE '%ECON%'
GROUP BY month_start
ORDER BY month_start
'''
gkg_monthly = client.query(query).to_dataframe()

# month_start is an integer such as 20150201; parse it as a calendar date.
gkg_monthly['DATE'] = pd.to_datetime(gkg_monthly['month_start'].astype(str), format='%Y%m%d')

# resample('M') moves each row onto the month-end label so the index lines up
# with any other series resampled with the same 'M' rule. There is one row per
# month already, so mean() does not change the values.
df_gdelt = (
    gkg_monthly.drop(columns='month_start')
    .set_index('DATE')
    .astype('float64')  # plain floats rather than nullable integer dtypes
    .resample('M')
    .mean()
)
df_gdelt.head()

In [None]:
# Align the news features with the IPI target: keep only the dates present in
# both tables, then discard any row that still contains a missing value.
df = df_gdelt.join(ipi, how='inner').dropna()
df.head()

In [None]:
# Simple ML Model: Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Features are every column except the target; the target is the INDPRO level.
X = df.drop(columns=['INDPRO'])
y = df['INDPRO']

# shuffle=False keeps the rows in their existing order, so the final 20% of
# rows are held out for testing instead of a random sample.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# A fixed seed makes the forest (bootstrap samples and feature choices) and
# therefore the reported error reproducible across Restart & Run All.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Error on the held-out rows, in squared INDPRO index units.
mse = mean_squared_error(y_test, preds)
print(f"Mean Squared Error: {mse:.2f}")