In [1]:
from benzinga import news_data
import pandas as pd
import warnings
from datetime import datetime
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
import concurrent.futures
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning, module='bs4')
import os
import zipfile

api_key = 'YOUR_API_KEY'
print(api_key)
def remove_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

main_df = pd.DataFrame()
today_date = datetime.today().strftime('%Y-%m-%d')
date_from = "2007-01-01"
paper = news_data.News(api_key, log=False)
def get_news(ticker, page, date_from, date_to, display_output="full"):
    news = paper.news(company_tickers=ticker, display_output=display_output, date_from=date_from, date_to=date_to, page=page, pagesize=100)
    if (len(news) == 0):
        return []
    df = pd.DataFrame(news)
    df['teaser'] = df['teaser'].apply(remove_html_tags)
    df['body'] = df['body'].apply(remove_html_tags)
    return df

tickers = [
    'MSFT',
    'AMZN',
    'NVDA',
    'AAPL',
    'GOOGL',
    'META',
    'GOOG',
    'BRK.B',
    'TSLA',
    'UNH',
    'LLY',
    'XOM',
    'JPM',
    'V',
    'JNJ',
    'PG',
    'AVGO',
    'MA',
    'HD',
    'CVX',
    'MRK',
    'ABBV',
    'COST',
    'ADBE',
    'WMT',
    'PEP',
    'KO',
    'CSCO',
    'CRM',
    'MCD',
    'ACN',
    'LIN',
    'BAC',
    'NFLX',
    'PFE',
    'TMO',
    'ABT',
    'CMCSA',
    'ORCL',
    'AMD',
    'DIS',
    'WFC',
    'VZ',
    'AMGN',
    'COP',
    'PM',
    'INTC',
    'INTU',
    'IBM',
    'TXN',
    'DHR',
    'CAT',
    'UNP',
    'NKE',
    'GE',
    'QCOM',
    'HON',
    'NEE',
    'RTX',
    'SPGI',
    'NOW',
    'BMY',
    'AMAT',
    'LOW',
    'T',
    'SBUX',
    'ELV',
    'BA',
    'TJX',
    'DE',
    'UPS',
    'LMT',
    'GS',
    'BKNG',
    'GILD',
    'MDT',
    'VRTX',
    'MMC',
    'PLD',
    'MS',
    'ISRG',
    'ADP',
    'PGR',
    'CI',
    'MDLZ',
    'CB',
    'SYK',
    'CVS',
    'BLK',
    'REGN',
    'AXP',
    'AMT',
    'ADI',
    'SLB',
    'ETN',
    'LRCX',
    'CME',
    'SCHW',
    'C',
    'EOG'
]

def create_datasets_folder():
    if not os.path.exists('datasets'):
        os.makedirs('datasets')

def get_news_by_ticker(ticker):
    try:
        page = 0
        main_df = pd.DataFrame()
        date_from = "2007-01-01"
        total = 0
        while True:
            if page > 100:
                date_from  = datetime.strptime(main_df['updated'].iloc[-1], "%a, %d %b %Y %H:%M:%S %z").strftime('%Y-%m-%d')
                page = 0
            news_df = get_news(ticker, page, date_from, today_date, 'full')
            if (len(news_df) == 0):
                break
            main_df = pd.concat([main_df, news_df], ignore_index=True)
            main_df = main_df.drop_duplicates(subset=['id'])
            page += 1
            total += 1
            print(f"{ticker} - {total} page. Added rows: {len(news_df)} total: {len(main_df)}")
        if ticker == 'BRK.B' or ticker == 'BRK.A':
            ticker = 'BRK-B'
        main_df.to_csv(f"datasets/news_sp_500_{ticker}.csv")
        return ticker
    except Exception as e:
        print(e)

def merge_all_in_one_file():
    main_df = pd.DataFrame()
    for ticker in tickers:
        if ticker == 'BRK.B':
            ticker = 'BRK-B'
        df = pd.read_csv(f"datasets/news_sp_500_{ticker}.csv")
        main_df = pd.concat([main_df, df], ignore_index=True)
        main_df = main_df.drop_duplicates(subset=['id'])
    print(f"Rows in total {len(main_df)}")
    main_df.to_csv(f"datasets/news_sp_500.csv")

def run_concurent(max_workers=10):
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(get_news_by_ticker, ticker) for ticker in tickers]
        for future in concurrent.futures.as_completed(futures):
            ticker = future.result()
            print(f"Ticker {ticker} done")

def check_all_files():
    for ticker in tickers:
        if ticker == 'BRK.B':
            ticker = 'BRK-B'
        df = pd.read_csv(f"datasets/news_sp_500_{ticker}.csv")
        if len(df) == 0:
            print(f"{ticker} - {len(df)}")

def zip_all_datasets():
    zf = zipfile.ZipFile('news_datasets.zip', mode='w')
    for ticker in tickers:
        if ticker == 'BRK.B':
            ticker = 'BRK-B'
        zf.write(f"datasets/news_sp_500_{ticker}.csv")
    zf.write(f"datasets/news_sp_500.csv")
    zf.close()

KeyError: 'BENZINGA_API_KEY'

In [None]:
create_datasets_folder()

# run all tickers in 10 threads
run_concurent()

In [None]:
# check all files in case some of them are empty
check_all_files()

In [None]:
merge_all_in_one_file()

In [3]:
zip_all_datasets()