In [7]:
import requests
import csv
from datetime import datetime, timedelta
import os
import time

# Set env variables
APCA_API_KEY_ID = os.getenv("APCA_API_KEY_ID")
APCA_API_SECRET_KEY = os.getenv("APCA_API_SECRET_KEY")

# Prepare headers for the HTTP request
headers = {
    'APCA-API-KEY-ID': APCA_API_KEY_ID,
    'APCA-API-SECRET-KEY': APCA_API_SECRET_KEY,
}

# Fetch and write news headlines to CSV
def fetch_and_write_news():
    end_date = datetime.now()
    start_date = end_date - timedelta(days=365 * 2)
    max_requests_per_minute = 200
    batch_size = 30  # Limit to 30 headlines per month
    retry_attempts = 3  # Maximum number of retry attempts for a failed request
    request_count = 0
    minute_start_time = time.time()

    with open('news_with_relevance.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Date", "Headline", "Content", "Relevant"])

        current_date = start_date
        while current_date < end_date:
            month_end_date = current_date + timedelta(days=30)
            next_page_token = None
            has_more = True
            monthly_headlines_count = 0

            while has_more and monthly_headlines_count < batch_size:
                # Check rate limit
                if request_count >= max_requests_per_minute:
                    elapsed_time = time.time() - minute_start_time
                    if elapsed_time < 60:
                        time.sleep(60 - elapsed_time)
                    request_count = 0
                    minute_start_time = time.time()

                params = {
                    'start': current_date.strftime('%Y-%m-%d'),
                    'end': month_end_date.strftime('%Y-%m-%d'),
                    'include_content': 'true',
                    'limit': batch_size,
                }

                if next_page_token:
                    params['page_token'] = next_page_token

                for attempt in range(retry_attempts):
                    response = requests.get('https://data.alpaca.markets/v1beta1/news', headers=headers, params=params)
                    request_count += 1

                    if response.status_code == 200:
                        news_batch = response.json()
                        for article in news_batch['news']:
                            if monthly_headlines_count >= batch_size:
                                break
                            date = datetime.fromisoformat(article['created_at']).strftime('%m/%d/%Y')
                            headline = article['headline']
                            content = article['summary']
                            writer.writerow([date, headline, content, ""])
                            print(date, headline, content)
                            monthly_headlines_count += 1

                        next_page_token = news_batch.get('next_page_token')
                        has_more = next_page_token is not None
                        if not has_more:  # Reset for the next month
                            break
                    else:
                        print(f"Failed to fetch news articles {response.status_code} on attempt {attempt + 1}")
                        if response.status_code == 400:
                            print("Bad request, skipping this batch.")
                            break
                        time.sleep(2 ** attempt)  # Exponential backoff
                else:
                    # No more retries
                    break

            current_date = month_end_date

fetch_and_write_news()

06/29/2022 First Probable Case of Monkeypox in Michigan Identified in Oakland County 
06/29/2022 Seachange International 10%+ Owner Karen Singer Reported Purchase of 250,000 Shares @ Avg Price: of $0.60/Share in Form 4 Filing on Wednesday 
06/29/2022 Top Wall Street Bets Mentions for Wednesday June 29, 2022 After-Hours: GME, BBBY, TSLA, AMD, CCL, AMC, AAPL, INTC, SOFI, TWTR 
06/29/2022 Better Tips For Uber Drivers, Consider Getting A Tesla Ride-share drivers are turning to renting or buying electric vehicles. The move, which was being done to help combat high prices for gas at the pump, is turning into higher tips for some drivers as well.
06/29/2022 Mesa Getting My Own Movie? Is Jar Jar Binks Getting An Origin Story From Taikia Waititi? Could one of the most hated and talked about Star Wars characters of all time be getting an origin story in the future?
06/29/2022 Exclusive: Golden Matrix Looks To Esports Wagering, Acquisitions To Continue Growth As Core Business Posts 15 Straight Qu

In [None]:
import requests
import csv
from datetime import datetime, timedelta
import os

# Set env variables
APCA_API_KEY_ID = os.getenv("APCA_API_KEY_ID")
APCA_API_SECRET_KEY = os.getenv("APCA_API_SECRET_KEY")

# Keywords related to macroeconomic indicators
keywords = [
    "Gross Domestic Product", "GDP", "Unemployment Rate", "Inflation Rate",
    "Consumer Price Index", "CPI", "Producer Price Index", "PPI", "Interest Rates",
    "Balance of Trade", "Government Debt", "Budget Deficit", "Budget Surplus", "Exchange Rates",
    "Money Supply", "Industrial Production", "Retail Sales", "Housing Starts",
    "Gross National Product", "GNP", "Net National Product", "NNP", "Disposable Income",
    "Personal Consumption Expenditures", "PCE", "Labor Force Participation Rate", "Productivity",
    "Current Account Balance", "Foreign Direct Investment", "FDI", "Capital Formation",
    "Government Spending", "Private Investment", "Nominal GDP", "Real GDP",
    "Trade Balance", "Import Prices", "Export Prices", "Purchasing Managers' Index", "PMI",
    "Capacity Utilization", "Job Openings and Labor Turnover Survey", "JOLTS",
    "Consumer Confidence Index", "CCI", "Producer Confidence Index", "PCI", "Core Inflation",
    "Real Income", "Wage Growth", "Household Debt", "Corporate Debt", "Fiscal Policy",
    "Monetary Policy", "Economic Growth Rate", "Business Cycle", "Recession", "Expansion",
    "Deflation", "Hyperinflation", "Stagflation", "Labor Market", "Capital Stock",
    "Public Sector Borrowing Requirement", "PSBR", "Yield Curve", "National Saving Rate",
    "Exchange Rate Mechanism", "ERM", "Purchasing Power Parity", "PPP", "Terms of Trade",
    "Human Development Index", "HDI", "Gini Coefficient", "Lorenz Curve", "Shadow Economy",
    "Informal Economy", "Real Estate Market", "Stock Market", "Bond Market", "Commodity Prices",
    "Financial Stability", "Economic Indicators", "Leading Indicators", "Lagging Indicators",
    "Coincident Indicators", "Demographic Trends", "Labor Productivity", "Unit Labor Costs",
    "Nominal Interest Rate", "Real Interest Rate", "Velocity of Money", "Broad Money",
    "Narrow Money", "Financial Intermediaries", "Exchange Reserves", "Remittances"
]

# Prepare headers for the HTTP request
headers = {
    'APCA-API-KEY-ID': APCA_API_KEY_ID,
    'APCA-API-SECRET-KEY': APCA_API_SECRET_KEY,
}

# Fetch and filter news and write to csv
def fetch_and_filter_news(start_date, end_date, batch_size=30):
    next_page_token = None
    has_more = True
    with open('macroeconomic_news.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Date", "Headline", "Content"])

        while start_date < end_date:
            batch_end_date = start_date + timedelta(days=30)
            while has_more:
                params = {
                    'start': start_date.strftime('%Y-%m-%d'),
                    'end': batch_end_date.strftime('%Y-%m-%d'),
                    'include_content': 'true',
                    'limit': batch_size,
                }

                if next_page_token:
                    params['page_token'] = next_page_token

                response = requests.get('https://data.alpaca.markets/v1beta1/news', headers=headers, params=params)

                if response.status_code == 200:
                    news_batch = response.json()
                    for article in news_batch['news']:
                        # Keyword check
                        if any(keyword in article['headline'] or keyword in article['summary'] for keyword in keywords):
                            date = datetime.fromisoformat(article['created_at']).strftime('%m/%d/%Y')
                            headline = article['headline']
                            content = article['summary']
                            writer.writerow([date, headline, content])
                            print(date, headline, content)

                    next_page_token = news_batch.get('next_page_token')
                    has_more = next_page_token is not None
                    if not has_more:  # Reset for the next batch
                        start_date += timedelta(days=30)  # Next 30 days
                        next_page_token = None  # Reset pagination token
                        has_more = True  # Reset has_more
                else:
                    print("Failed to fetch news articles", response.status_code)
                    break

start_date = datetime.now() - timedelta(days=365*3)
end_date = datetime.now()

fetch_and_filter_news(start_date, end_date, batch_size=30)

In [None]:
import requests
import csv
from datetime import datetime, timedelta
import os
import time

# Set env variables
APCA_API_KEY_ID = os.getenv("APCA_API_KEY_ID")
APCA_API_SECRET_KEY = os.getenv("APCA_API_SECRET_KEY")

# Keywords related to macroeconomic indicators
keywords = [
    "Gross Domestic Product", "GDP", "Unemployment Rate", "Inflation Rate",
    "Consumer Price Index", "CPI", "Producer Price Index", "PPI", "Interest Rates",
    "Balance of Trade", "Government Debt", "Budget Deficit", "Surplus", "Exchange Rates",
    "Money Supply", "Industrial Production", "Retail Sales", "Housing Starts"
]

# Prepare headers for the HTTP request
headers = {
    'APCA-API-KEY-ID': APCA_API_KEY_ID,
    'APCA-API-SECRET-KEY': APCA_API_SECRET_KEY,
}

# Fetch and filter news and write to csv
def fetch_and_filter_news(end_date, batch_size=30):
    next_page_token = None
    has_more = True
    request_count = 0
    max_requests_per_minute = 200
    minute_start_time = time.time()
    retry_attempts = 3  # Maximum number of retry attempts for a failed request

    with open('macroeconomic_news.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Date", "Headline", "Content"])

        while end_date > start_date:
            batch_start_date = end_date - timedelta(days=30)
            while has_more:
                # Check rate limit
                if request_count >= max_requests_per_minute:
                    elapsed_time = time.time() - minute_start_time
                    if elapsed_time < 60:
                        time.sleep(60 - elapsed_time)
                    request_count = 0
                    minute_start_time = time.time()

                params = {
                    'start': batch_start_date.strftime('%Y-%m-%d'),
                    'end': end_date.strftime('%Y-%m-%d'),
                    'include_content': 'true',
                    'limit': batch_size,
                }

                if next_page_token:
                    params['page_token'] = next_page_token

                for attempt in range(retry_attempts):
                    response = requests.get('https://data.alpaca.markets/v1beta1/news', headers=headers, params=params)
                    request_count += 1

                    if response.status_code == 200:
                        news_batch = response.json()
                        for article in news_batch['news']:
                            # Keyword check
                            if any(keyword in article['headline'] or keyword in article['summary'] for keyword in keywords):
                                date = datetime.fromisoformat(article['created_at']).strftime('%m/%d/%Y')
                                headline = article['headline']
                                content = article['summary']
                                writer.writerow([date, headline, content])
                                print(date, headline, content)

                        next_page_token = news_batch.get('next_page_token')
                        has_more = next_page_token is not None
                        if not has_more:  # Reset for the next batch
                            end_date -= timedelta(days=30)  # Previous 30 days
                            next_page_token = None  # Reset pagination token
                            has_more = True  # Reset has_more
                        break  # Break out of retry loop on success
                    else:
                        print(f"Failed to fetch news articles {response.status_code} on attempt {attempt + 1}")
                        if response.status_code == 400:
                            print("Bad request, skipping this batch.")
                            break
                        time.sleep(2 ** attempt)  # Exponential backoff
                else:
                    # No more retries
                    end_date -= timedelta(days=30)
                    has_more = False
                    next_page_token = None

start_date = datetime.now() - timedelta(days=365*5)
end_date = datetime.now()

fetch_and_filter_news(end_date, batch_size=30)

In [6]:
import openai
import csv
import os

class OpenAIChatClient:
    def __init__(self, api_key=None):
        self.client = openai.OpenAI(api_key=api_key or os.getenv("OPENAI_API_KEY"))

    def chat_gpt(self, prompt):
        response = self.client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content.strip()

def score_article_with_llm(client, headline, content):
    if not content.strip():
        prompt = f"Based on the headline, score the article as relevant (1) or not relevant (0) to US macroeconomic indicators. Examples of macroeconomic indicators: Gross Domestic Product, GDP, Unemployment Rate, Inflation Rate, Consumer Price Index, CPI, Producer Price Index, PPI, Interest Rates, Balance of Trade, Government Debt, Budget Deficit, Surplus, Exchange Rates, Money Supply, Industrial Production, Retail Sales, Housing Starts. Make sure not to include any stock news or earnings. \n\nHeadline: {headline}"
    else:
        prompt = f"Based on the headline and content, score the article as relevant (1) or not relevant (0) to US domestic macroeconomic indicators. Examples of macroeconomic indicators: Gross Domestic Product, GDP, Unemployment Rate, Inflation Rate, Consumer Price Index, CPI, Producer Price Index, PPI, Interest Rates, Balance of Trade, Government Debt, Budget Deficit, Surplus, Exchange Rates, Money Supply, Industrial Production, Retail Sales, Housing Starts. Make sure not to include any stock news or earnings. \n\nHeadline: {headline}\n\nContent: {content}"
    
    try:
        score = client.chat_gpt(prompt)
        if score not in ["0", "1"]:
            return f"Failed to score the article, received an invalid score: {score}"
        return score
    except Exception as e:
        return f"Error scoring article with API: {e}"

def score_articles_with_llm(client, csv_file):
    articles = []

    with open(csv_file, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        header = next(reader)  # Read the header
        for row in reader:
            articles.append(row)

    if 'Relevant' not in header:
        header.append('Relevant')

    # Iterate over each article and get the relevance score from the LLM
    for i, article in enumerate(articles):
        print(f"\nScoring Article {i+1}/{len(articles)}")
        print(f"Date: {article[0]}")
        print(f"Headline: {article[1]}")
        print(f"Content: {article[2]}")
        if len(article) < 4 or article[3] == "":
            score = score_article_with_llm(client, article[1], article[2])
            if score in ["0", "1"]:
                if len(article) < 4:
                    article.append(score)
                else:
                    article[3] = score
            else:
                print(score)

    with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(header)
        writer.writerows(articles)

    print("All articles have been scored and saved.")

# Initialize the OpenAI chat client
client = OpenAIChatClient()  # API key is taken from environment variables

# Specify the CSV file to score
csv_file = 'news_with_relevance.csv'
score_articles_with_llm(client, csv_file)


Scoring Article 1/750
Date: 06/29/2022
Headline: First Probable Case of Monkeypox in Michigan Identified in Oakland County
Content: 

Scoring Article 2/750
Date: 06/29/2022
Headline: Seachange International 10%+ Owner Karen Singer Reported Purchase of 250,000 Shares @ Avg Price: of $0.60/Share in Form 4 Filing on Wednesday
Content: 
Failed to score the article, received an invalid score: 0 - This article is not relevant to US macroeconomic indicators.

Scoring Article 3/750
Date: 06/29/2022
Headline: Top Wall Street Bets Mentions for Wednesday June 29, 2022 After-Hours: GME, BBBY, TSLA, AMD, CCL, AMC, AAPL, INTC, SOFI, TWTR
Content: 

Scoring Article 4/750
Date: 06/29/2022
Headline: Better Tips For Uber Drivers, Consider Getting A Tesla
Content: Ride-share drivers are turning to renting or buying electric vehicles. The move, which was being done to help combat high prices for gas at the pump, is turning into higher tips for some drivers as well.

Scoring Article 5/750
Date: 06/29/2022