# more data gathering

In [1]:
import json
import os
import time
from datetime import datetime, timedelta

import pandas as pd
import pytz
import requests

In [2]:
# Ticker universe used when testing the other news APIs below.
tickers = [
    'META', 'AAPL', 'AMZN', 'NFLX', 'GOOG',
    'MSFT', 'IBM', 'ORCL', 'NVDA', 'INTC',
]

# API quotas noted while testing:
#   - Marketaux:     100 calls per day, at most 3 articles per call
#   - Alpha Vantage:  25 calls per day, at most 50 articles per call



In [3]:
# Marketaux
mt_base_url = "https://api.marketaux.com/v1/news/all"
# SECURITY: supply the token through the MARKETAUX_API_KEY environment variable.
# The literal below is kept only as a fallback so existing runs keep working;
# since it is stored in the notebook it should be treated as exposed
# (rotate it and then remove the fallback).
mt_api_key = os.environ.get("MARKETAUX_API_KEY") or "vGc2ngli75vh0B0TLLZBYmLoP9j6JdlIO0xnYU1Z"

def request_news_mt(ticker, start, end):
    """Request Marketaux news for one ticker within a publication window.

    Args:
        ticker: Symbol sent as the ``symbols`` query parameter.
        start: Value sent as ``published_after``.
        end: Value sent as ``published_before``.

    Returns:
        The decoded JSON body when the response status is 200; otherwise
        ``None`` (the status code is printed).
    """
    # Let requests assemble the query string so every value is URL-encoded
    # instead of being interpolated verbatim into the URL.
    query = {
        "symbols": ticker,
        "published_after": start,
        "published_before": end,
        "api_token": mt_api_key,
    }
    # Without a timeout a stalled connection would block the download loop forever.
    resp = requests.get(mt_base_url, params=query, timeout=30)
    if resp.status_code != 200:
        print(f"Request Stock Error {resp.status_code}")
        return None
    return resp.json()

In [4]:


def filter_returns_mt(returned_json):
    """Split a Marketaux news response into an article table and an entity table.

    Args:
        returned_json: Decoded response dict whose ``'data'`` key holds the
            article list. ``None`` (what ``request_news_mt`` returns on a
            failed request) or a payload without ``'data'`` yields two empty
            frames instead of raising.

    Returns:
        ``(main_df, relation_df)``: one row per article, and one row per
        (article, entity) pair. Both frames always carry their full column
        set, even when empty.
    """
    utc = pytz.utc
    eastern = pytz.timezone('America/New_York')

    main_cols = ['id', 'publish_time', 'title', 'article_url',
                 'ticker', 'publisher', 'description', 'keywords']
    relation_cols = ['news_id', 'source_ticker', 'sentiment_score', 'time', 'ticker',
                     'relevance_score', 'ticker_sentiment_score', 'ticker_sentiment_label']

    main_list = []
    relation_list = []

    # A failed request hands us None; an error payload may lack 'data'.
    articles = returned_json.get('data') if isinstance(returned_json, dict) else None

    for data in articles or []:
        # Convert `published_at` (treated as UTC) to Eastern Time if present.
        publish_time = data.get('published_at')
        if publish_time:
            try:
                dt_utc = datetime.strptime(publish_time, "%Y-%m-%dT%H:%M:%S.%fZ")
            except ValueError:
                # Tolerate timestamps that omit the fractional seconds.
                dt_utc = datetime.strptime(publish_time, "%Y-%m-%dT%H:%M:%SZ")
            publish_time_eastern = dt_utc.replace(tzinfo=utc).astimezone(eastern)
            publish_time_str = publish_time_eastern.strftime('%Y-%m-%d %H:%M:%S')
        else:
            publish_time_str = None

        # `entities` may be missing or null; normalise to a list once.
        entities = data.get('entities') or []

        # Main article fields; the first entity's symbol is used as the ticker.
        entry = {
            'id': data.get('uuid'),
            'publish_time': publish_time_str,
            'title': data.get('title'),
            'article_url': data.get('url'),
            'ticker': entities[0].get('symbol') if entities else None,
            'publisher': data.get('source'),
            'description': data.get('description'),
            'keywords': data.get('keywords')
        }
        main_list.append(entry)

        # One relation row per entity attached to the article.
        for entity in entities:
            relation_list.append({
                'news_id': data.get('uuid'),
                'source_ticker': entry['ticker'],
                'sentiment_score': entity.get('sentiment_score'),
                'time': entry['publish_time'],
                'ticker': entity.get('symbol'),
                'relevance_score': entity.get('match_score', ""),
                'ticker_sentiment_score': entity.get('ticker_sentiment_score', ""),
                'ticker_sentiment_label': entity.get('ticker_sentiment_label', "")
            })

    # Passing the column lists keeps the headers stable even for empty results.
    main_df = pd.DataFrame(main_list, columns=main_cols)
    relation_df = pd.DataFrame(relation_list, columns=relation_cols)
    return main_df, relation_df

        
    

In [5]:
# Date ranges already acquired from Marketaux (start, end):
# ('2024-09-01', '2024-09-03')
# ('2024-09-03', '2024-09-05')
# ('2024-09-05', '2024-09-07'),
# ('2024-09-06', '2024-09-08'),
# ('2024-09-07', '2024-09-09'),
# ('2024-09-08', '2024-09-10')
# ('2022-01-01', '2022-01-03'),
# ('2022-01-02', '2022-01-04'),
# ('2022-01-03', '2022-01-05'),
# ('2022-01-04', '2022-01-06'),
# ('2022-01-05', '2022-01-07'),
# ('2022-01-06', '2022-01-08'),
# ('2022-01-07', '2022-01-09'),
# ('2022-01-08', '2022-01-10'),
# ('2022-01-09', '2022-01-11')
# ('2022-01-10', '2022-01-12'),
# ('2022-01-11', '2022-01-13'),
# ('2022-01-12', '2022-01-14'),
# ('2022-01-13', '2022-01-15'),
# ('2022-01-14', '2022-01-16'),
# ('2022-01-15', '2022-01-17'),
# ('2022-01-16', '2022-01-18'),
# ('2022-01-17', '2022-01-19'),
# ('2022-01-18', '2022-01-20')
# ('2022-01-19', '2022-01-21'),
# ('2022-01-20', '2022-01-22'),
# ('2022-01-21', '2022-01-23'),
# ('2022-01-22', '2022-01-24'),
# ('2022-01-23', '2022-01-25'),
# ('2022-01-24', '2022-01-26'),
# ('2022-01-25', '2022-01-27'),
# ('2022-01-26', '2022-01-28'),
# ('2022-01-27', '2022-01-29'),
# ('2022-01-28', '2022-01-30')
# ('2023-01-31', '2023-02-02'),
# ('2023-02-01', '2023-02-03'),
# ('2023-02-02', '2023-02-04'),
# ('2023-02-03', '2023-02-05'),
# ('2023-02-04', '2023-02-06'),
# ('2023-02-05', '2023-02-07'),
# ('2023-02-06', '2023-02-08'),
# ('2023-02-07', '2023-02-09'),
# ('2023-02-08', '2023-02-10'),
# ('2023-02-09', '2023-02-11')


In [6]:
# Marketaux date windows (published_after, published_before) for this run.
dates = [
    ('2023-01-31', '2023-02-02'),
    ('2023-02-01', '2023-02-03'),
    ('2023-02-02', '2023-02-04'),
    ('2023-02-03', '2023-02-05'),
    ('2023-02-04', '2023-02-06'),
    ('2023-02-05', '2023-02-07'),
    ('2023-02-06', '2023-02-08'),
    ('2023-02-07', '2023-02-09'),
    ('2023-02-08', '2023-02-10'),
    ('2023-02-09', '2023-02-11')
]

# to_csv does not create missing folders, so make sure the target exists.
out_dir_mt = "dataset/news_maux"
os.makedirs(out_dir_mt, exist_ok=True)

for date in dates:
    start, end = date
    for ticker in tickers:
        raw_mt = request_news_mt(ticker, start, end)
        # request_news_mt returns None on a non-200 response; skip that
        # (ticker, window) pair rather than aborting the whole batch.
        if not isinstance(raw_mt, dict) or 'data' not in raw_mt:
            print(f"Skipping {ticker} {start}..{end}: no usable Marketaux response")
            continue
        main_df_mt, relation_df_mt = filter_returns_mt(raw_mt)
        main_df_mt.to_csv(f"{out_dir_mt}/{ticker}_main_{start}_{end}.csv", index=False)
        relation_df_mt.to_csv(f"{out_dir_mt}/{ticker}_relation_{start}_{end}.csv", index=False)

In [9]:
# main_df_mt, relation_df_mt = filter_returns_mt(request_news_mt('AAPL', '2024-10-01', '2024-10-03'))

# main_df_mt.to_csv("dataset/news_maux/AAPL_main_1.csv", index=False)
# relation_df_mt.to_csv("dataset/news_maux/AAPL_relation_df_1.csv", index=False)

In [10]:
# data_marketaux = filter_returns_mt(request_news_mt('AAPL', '2024-10-01', '2024-10-05'))

In [11]:
# print(len(data_marketaux))

In [12]:
# print(data_marketaux)

In [7]:
# Fetch one raw Marketaux payload and print it to inspect its structure.
data_raw_m = request_news_mt(ticker='AMZN', start='2022-01-01', end='2022-01-05')
print(data_raw_m)

{'meta': {'found': 92, 'returned': 3, 'limit': 3, 'page': 1}, 'data': [{'uuid': '7b589651-b32a-4ea5-838b-ebd1ffab0e77', 'title': 'RBC Chooses Amazon, M&T, UnitedHealth As Top 2022 Stocks', 'description': "'Our U.S. equity strategy team expects 2022 to be a year of solid but more moderate returns relative to 2021,' RBC said.", 'keywords': 'Banking, Energy, Stock, Health, Technology, Utilities, INVESTING, Utilities, Health Care, STOCKS, Investing Stocks, Financial Services, Technology, Investing, Amazon, Energy, Financial Services, Stocks', 'snippet': "'Our U.S. equity strategy team expects 2022 to be a year of solid but more moderate returns relative to 2021,' RBC said.\n\nRBC Capital Markets Tuesday named its ...", 'url': 'https://www.thestreet.com/investing/rbc-chooses-amazon-mt-unitedhealth-top-stocks', 'image_url': 'https://www.thestreet.com/.image/t_share/MTY4NjQwNDY5NTEzMDg2ODcx/mt-bank-mtb-stock-higher-following-q1-earnings-results.jpg', 'language': 'en', 'published_at': '2022-01

In [14]:
# temp_d_1 = request_news_mt('AAPL', '2024-10-01', '2024-10-05')

# marketaux_path = "marketaux.json"
# with open(marketaux_path, "w") as f:
#     json.dump(temp_d_1, f)

In [7]:
# Alpha Vantage
alpha_base_url = "https://www.alphavantage.co/query"
# SECURITY: supply the key through the ALPHAVANTAGE_API_KEY environment variable.
# The literal below is kept only as a fallback so existing runs keep working;
# since it is stored in the notebook it should be treated as exposed
# (rotate it and then remove the fallback).
alpha_api_key = os.environ.get("ALPHAVANTAGE_API_KEY") or "SMASMZ5KH2MHZ462"

def convert_time(time_str):
    """Reformat a date or date-time string as compact ``YYYYMMDDTHHMM``.

    Args:
        time_str: Either ``'YYYY-MM-DDTHH:MM'`` or ``'YYYY-MM-DD'``; a bare
            date is taken to mean midnight (00:00).

    Returns:
        The timestamp rendered with ``'%Y%m%dT%H%M'``.

    Raises:
        ValueError: If ``time_str`` matches neither accepted layout.
    """
    # Most specific layout first; a date-only parse already lands on 00:00.
    accepted_layouts = ('%Y-%m-%dT%H:%M', '%Y-%m-%d')

    parsed = None
    for layout in accepted_layouts:
        try:
            parsed = datetime.strptime(time_str, layout)
        except ValueError:
            continue
        break

    if parsed is None:
        raise ValueError("Invalid date format. Expected 'YYYY-MM-DD' or 'YYYY-MM-DDTHH:MM'")

    return parsed.strftime('%Y%m%dT%H%M')

def request_news_alpha(ticker, start, end):
    """Request Alpha Vantage NEWS_SENTIMENT data for one ticker and time window.

    Args:
        ticker: Symbol sent as the ``tickers`` query parameter.
        start: Window start, ``'YYYY-MM-DD'`` or ``'YYYY-MM-DDTHH:MM'``.
        end: Window end, same formats as ``start``.

    Returns:
        The decoded JSON body when the response status is 200; otherwise
        ``None`` (the status code is printed).

    Raises:
        ValueError: Propagated from ``convert_time`` for a malformed date.
    """
    # Let requests assemble the query string so every value is URL-encoded
    # instead of being interpolated verbatim into the URL.
    query = {
        "function": "NEWS_SENTIMENT",
        "tickers": ticker,
        "time_from": convert_time(start),
        "time_to": convert_time(end),
        "apikey": alpha_api_key,
    }
    # Without a timeout a stalled connection would block the download loop forever.
    resp = requests.get(alpha_base_url, params=query, timeout=30)
    if resp.status_code != 200:
        print(f"Request Stock Error {resp.status_code}")
        return None
    return resp.json()

In [8]:



def process_news_feed_alpha(returned_json, ticker): # json return for this one does not have ticker
    """Split an Alpha Vantage news response into article and ticker-sentiment tables.

    Args:
        returned_json: Decoded response dict whose ``'feed'`` key holds the
            article list. ``None`` (what ``request_news_alpha`` returns on a
            failed request) or a payload without ``'feed'`` yields two empty
            frames instead of raising.
        ticker: The symbol that was queried; recorded on every row because the
            feed items themselves do not carry it.

    Returns:
        ``(main_df, relation_df)``: one row per article, and one row per
        (article, ticker_sentiment entry) pair. Both frames always carry their
        full column set, even when empty.
    """
    main_cols = ['id', 'publish_time', 'title', 'article_url',
                 'ticker', 'publisher', 'description', 'keywords']
    relation_cols = ['news_id', 'source_ticker', 'time', 'ticker',
                     'relevance_score', 'ticker_sentiment_score', 'ticker_sentiment_label']

    main_list = []
    relation_list = []

    # Define UTC and Eastern Time zones
    utc = pytz.utc
    eastern = pytz.timezone('America/New_York')

    # A failed request hands us None; an error/notice payload may lack 'feed'.
    feed = returned_json.get('feed') if isinstance(returned_json, dict) else None

    for item in feed or []:
        # Convert `time_published` (treated as UTC) to Eastern Time if present.
        publish_time = item.get('time_published')
        if publish_time:
            # Parse the "YYYYMMDDTHHMMSS" format
            dt_utc = datetime.strptime(publish_time, "%Y%m%dT%H%M%S").replace(tzinfo=utc)
            publish_time_eastern = dt_utc.astimezone(eastern)
            publish_time_str = publish_time_eastern.strftime('%Y-%m-%d %H:%M:%S')
        else:
            publish_time_str = None

        # The sample feed items in this notebook carry no 'id' key, which left
        # 'id'/'news_id' empty and made relation rows impossible to join back
        # to their article. Fall back to the article URL as the identifier.
        news_id = item.get('id') or item.get('url', "")

        # Extract main data fields
        entry = {
            'id': news_id,
            'publish_time': publish_time_str,
            'title': item.get('title', ""),
            'article_url': item.get('url', ""),
            'ticker': ticker,
            'publisher': item.get('source', ""),
            'description': item.get('summary', ""),
            # `topics` may be missing or null; treat both as "no topics".
            'keywords': ",".join(topic['topic'] for topic in (item.get('topics') or []))
        }
        main_list.append(entry)

        # One relation row per ticker-sentiment entry (missing/null -> none).
        for relation in item.get('ticker_sentiment') or []:
            relation_list.append({
                'news_id': news_id,
                'source_ticker': ticker,
                'time': publish_time_str,
                'ticker': relation.get('ticker'),
                'relevance_score': relation.get('relevance_score', ""),
                'ticker_sentiment_score': relation.get('ticker_sentiment_score', ""),
                'ticker_sentiment_label': relation.get('ticker_sentiment_label', "")
            })

    # Passing the column lists keeps the headers stable even for empty results.
    main_df = pd.DataFrame(main_list, columns=main_cols)
    relation_df = pd.DataFrame(relation_list, columns=relation_cols)

    return main_df, relation_df

In [9]:
# date = ('2024-10-01', '2024-10-03')

# Alpha Vantage date windows (start, end) for this run.
dates = [
    ('2023-02-23', '2023-02-26'),
    ('2023-02-27', '2023-03-02')
]

# to_csv does not create missing folders, so make sure the target exists.
out_dir_alpha = "dataset/news_alpha"
os.makedirs(out_dir_alpha, exist_ok=True)

for date in dates:
    start, end = date
    for ticker in tickers:
        raw_alpha = request_news_alpha(ticker, start, end)
        # request_news_alpha returns None on a non-200 response, and a payload
        # without 'feed' cannot be processed; skip that (ticker, window) pair
        # rather than aborting the whole batch.
        if not isinstance(raw_alpha, dict) or 'feed' not in raw_alpha:
            print(f"Skipping {ticker} {start}..{end}: no usable Alpha Vantage response")
            continue
        main_df_alpha, relation_df_alpha = process_news_feed_alpha(raw_alpha, ticker)
        main_df_alpha.to_csv(f"{out_dir_alpha}/{ticker}_main_{start}_{end}.csv", index=False)
        relation_df_alpha.to_csv(f"{out_dir_alpha}/{ticker}_relation_{start}_{end}.csv", index=False)


In [None]:
# Date ranges already acquired from Alpha Vantage (start, end):
# ('2024-10-01', '2024-10-03')
# ('2023-01-01', '2023-01-03'),
# ('2023-01-04', '2023-01-06')
# ('2023-02-01', '2023-02-03'),
# ('2023-02-04', '2023-02-06')
# ('2023-02-07', '2023-02-10'),
# ('2023-02-11', '2023-02-14')
# ('2023-02-15', '2023-02-18'),
# ('2023-02-19', '2023-02-22')
# ('2023-02-23', '2023-02-26'),
# ('2023-02-27', '2023-03-02')

In [30]:
# main_df_alpha, relation_df_alpha = process_news_feed_alpha(request_news_alpha('AAPL', '2024-10-01', '2024-10-03'), 'AAPL')



In [51]:
# main_df_alpha.to_csv("dataset/news_alpha/AAPL_main_1.csv", index=False)
# relation_df_alpha.to_csv("dataset/news_alpha/AAPL_relation_df_1.csv", index=False)

In [52]:
# data_alpha = filter_returns_alpha(request_news_alpha('AAPL', '2024-10-01', '2024-10-03'))

In [53]:
# print(len(data_alpha))

In [54]:
# print(data_alpha)

In [19]:
# temp_d_2 = request_news_alpha('AAPL', '2022-01-01', '2024-01-03')

# print(temp_d_2)

# alpha_path = "alpha.json"
# with open(alpha_path, "w") as f:
#     json.dump(temp_d_2, f)

{'items': '50', 'sentiment_score_definition': 'x <= -0.35: Bearish; -0.35 < x <= -0.15: Somewhat-Bearish; -0.15 < x < 0.15: Neutral; 0.15 <= x < 0.35: Somewhat_Bullish; x >= 0.35: Bullish', 'relevance_score_definition': '0 < x <= 1, with a higher score indicating higher relevance.', 'feed': [{'title': 'Snowflake Stock Fell Today -- Is It a Buy for 2024?', 'url': 'https://www.fool.com/investing/2024/01/02/snowflake-stock-fell-today-is-it-a-buy-for-2024/', 'time_published': '20240102T233000', 'authors': ['Keith Noonan'], 'summary': 'Does the data-services specialist have what it takes to be a big winner in 2024 and beyond?', 'banner_image': 'https://g.foolcdn.com/editorial/images/759956/gettyimages-1499780516.jpg', 'source': 'Motley Fool', 'category_within_source': 'n/a', 'source_domain': 'www.fool.com', 'topics': [{'topic': 'Financial Markets', 'relevance_score': '0.998356'}, {'topic': 'Earnings', 'relevance_score': '0.310843'}, {'topic': 'Technology', 'relevance_score': '0.5'}, {'topic

In [56]:
# time_l = [
#     ('2023-10-01', '2023-10-05'),
#     ('2023-10-01', '2023-10-10'),
#     ('2023-10-01', '2023-11-01'),
#     ('2023-10-01', '2023-12-30'),
#     ('2023-10-01', '2024-06-01'),
# ]

# for t in time_l:
#     dt = filter_returns_alpha(request_news_alpha('AAPL', t[0], t[1]))
#     print(len(dt))

In [57]:
# print(dt)

In [58]:
# print(len(filter_returns_alpha(request_news_alpha('AAPL', '2023-10-01', '2023-10-04'))))

In [19]:
# Fetch one raw Alpha Vantage payload over a wide window and print it for inspection.
r = request_news_alpha(ticker='GOOG', start='2022-01-01', end='2024-10-01')
print(r)

{'items': '50', 'sentiment_score_definition': 'x <= -0.35: Bearish; -0.35 < x <= -0.15: Somewhat-Bearish; -0.15 < x < 0.15: Neutral; 0.15 <= x < 0.35: Somewhat_Bullish; x >= 0.35: Bullish', 'relevance_score_definition': '0 < x <= 1, with a higher score indicating higher relevance.', 'feed': [{'title': 'Before you continue', 'url': 'https://consent.google.com/m', 'time_published': '20240930T235306', 'authors': [], 'summary': 'Page 51 - N Sundaresha Subramanian/workspace/business Standard Web/video Business Standard ...', 'banner_image': None, 'source': 'Business Standard', 'category_within_source': 'GoogleRSS', 'source_domain': 'consent.google.com', 'topics': [{'topic': 'Technology', 'relevance_score': '1.0'}], 'overall_sentiment_score': 0.086343, 'overall_sentiment_label': 'Neutral', 'ticker_sentiment': [{'ticker': 'GOOG', 'relevance_score': '0.213048', 'ticker_sentiment_score': '0.142784', 'ticker_sentiment_label': 'Neutral'}]}, {'title': 'Before you continue', 'url': 'https://consent