# more data gathering

In [65]:
import json
import os
import time
from datetime import datetime, timedelta

import pandas as pd
import pytz
import requests

In [66]:
# Ticker universe used while testing the other news APIs.
tickers = [
    'META', 'AAPL', 'AMZN', 'NFLX', 'GOOGL',
    'MSFT', 'IBM', 'ORCL', 'NVDA', 'INTC',
]

# Quota notes:
#   - Marketaux: 100 calls per day, at most 3 articles per call
#   - Alpha Vantage: 25 calls per day, at most 50 articles per call



In [67]:
# Marketaux
mt_base_url = "https://api.marketaux.com/v1/news/all"
# The API token is a credential, so it is read from the environment rather
# than stored in the notebook. Export MARKETAUX_API_KEY before running.
# NOTE(review): the token that used to be committed on this line should be
# treated as leaked and rotated.
mt_api_key = os.environ.get("MARKETAUX_API_KEY", "")

def request_news_mt(ticker, start, end, timeout=30):
    """Fetch Marketaux news for one ticker within a publication window.

    Args:
        ticker: Symbol sent as the ``symbols`` query parameter.
        start: Lower bound sent as ``published_after``.
        end: Upper bound sent as ``published_before``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body, or ``None`` when the request fails or the
        server answers with a non-200 status (an error line is printed).
    """
    # Let requests build and URL-encode the query string instead of
    # concatenating raw values into the URL.
    params = {
        "symbols": ticker,
        "published_after": start,
        "published_before": end,
        "api_token": mt_api_key,
    }
    try:
        # Without a timeout a stalled connection would hang the cell forever.
        resp = requests.get(mt_base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request Stock Error {exc}")
        return None
    if resp.status_code != 200:
        print(f"Request Stock Error {resp.status_code}")
        return None
    return resp.json()

In [68]:


def filter_returns_mt(returned_json):
    """Split a Marketaux news response into an article table and an entity table.

    Args:
        returned_json: Decoded response as returned by ``request_news_mt``.
            May be ``None`` (failed request) or lack a usable ``data`` list.

    Returns:
        ``(main_df, relation_df)``: ``main_df`` has one row per article and
        ``relation_df`` one row per (article, entity) pair. Both frames always
        carry their full column set, even when there is nothing to report.

    Raises:
        ValueError: If ``published_at`` is present but is not in the
            ``YYYY-MM-DDTHH:MM:SS.ffffffZ`` layout.
    """
    main_cols = ['id', 'publish_time', 'title', 'article_url',
                 'ticker', 'publisher', 'description', 'keywords']
    relation_cols = ['news_id', 'source_ticker', 'sentiment_score', 'time',
                     'ticker', 'relevance_score', 'ticker_sentiment_score',
                     'ticker_sentiment_label']

    # A failed request yields None and an error payload may have no 'data';
    # both are treated as "no articles" instead of crashing.
    articles = (returned_json or {}).get('data') or []
    if not articles:
        return pd.DataFrame(columns=main_cols), pd.DataFrame(columns=relation_cols)

    utc = pytz.utc
    eastern = pytz.timezone('America/New_York')

    main_list = []
    relation_list = []

    for data in articles:
        # Convert `published_at` to Eastern Time if it exists
        publish_time = data.get('published_at')
        if publish_time:
            # Parse the "YYYY-MM-DDTHH:MM:SS.SSSSSSZ" format and assume it's in UTC
            dt_utc = datetime.strptime(publish_time, "%Y-%m-%dT%H:%M:%S.%fZ").replace(tzinfo=utc)
            publish_time_str = dt_utc.astimezone(eastern).strftime('%Y-%m-%d %H:%M:%S')
        else:
            publish_time_str = None

        # 'entities' may be missing or null; normalise to a list once.
        entities = data.get('entities') or []

        # Extract main fields; the first entity's symbol is the article ticker
        entry = {
            'id': data.get('uuid'),
            'publish_time': publish_time_str,
            'title': data.get('title'),
            'article_url': data.get('url'),
            'ticker': entities[0].get('symbol') if entities else None,
            'publisher': data.get('source'),
            'description': data.get('description'),
            'keywords': data.get('keywords')
        }
        main_list.append(entry)

        # One relation row per entity attached to the article.
        # NOTE(review): entities are keyed by 'symbol' above; confirm that
        # 'ticker', 'ticker_sentiment_score' and 'ticker_sentiment_label'
        # really exist in the payload, otherwise those columns stay empty.
        for entity in entities:
            relation_list.append({
                'news_id': data.get('uuid'),
                'source_ticker': entity.get('symbol'),
                'sentiment_score': entity.get('sentiment_score'),
                'time': publish_time_str,
                'ticker': entity.get('ticker'),
                'relevance_score': entity.get('match_score', ""),
                'ticker_sentiment_score': entity.get('ticker_sentiment_score', ""),
                'ticker_sentiment_label': entity.get('ticker_sentiment_label', "")
            })

    # Explicit columns keep the schema stable even if an article has no entities
    main_df = pd.DataFrame(main_list, columns=main_cols)
    relation_df = pd.DataFrame(relation_list, columns=relation_cols)
    return main_df, relation_df

        
    

In [69]:
# Download Marketaux news for every ticker over one window and store it as CSV.
date = ('2024-09-01', '2024-09-03')

maux_dir = "dataset/news_maux"
os.makedirs(maux_dir, exist_ok=True)  # to_csv does not create missing folders

for ticker in tickers:
    response_mt = request_news_mt(ticker, date[0], date[1])
    if not response_mt or 'data' not in response_mt:
        # A failed or quota-limited call must not abort the remaining tickers.
        print(f"Skipping {ticker}: no Marketaux data returned")
        continue
    # Named *_mt (not *_alpha): these frames hold Marketaux data.
    main_df_mt, relation_df_mt = filter_returns_mt(response_mt)
    main_df_mt.to_csv(f"{maux_dir}/{ticker}_main_{date[0]}_{date[1]}.csv", index=False)
    relation_df_mt.to_csv(f"{maux_dir}/{ticker}_relation_{date[0]}_{date[1]}.csv", index=False)

In [70]:
# main_df_mt, relation_df_mt = filter_returns_mt(request_news_mt('AAPL', '2024-10-01', '2024-10-03'))

# main_df_mt.to_csv("dataset/news_maux/AAPL_main_1.csv", index=False)
# relation_df_mt.to_csv("dataset/news_maux/AAPL_relation_df_1.csv", index=False)

In [23]:
# data_marketaux = filter_returns_mt(request_news_mt('AAPL', '2024-10-01', '2024-10-05'))

In [24]:
# print(len(data_marketaux))

In [25]:
# print(data_marketaux)

In [26]:
# data_raw_m = request_news_mt('AAPL', '2024-10-01', '2024-10-05')

In [27]:
# temp_d_1 = request_news_mt('AAPL', '2024-10-01', '2024-10-05')

# marketaux_path = "marketaux.json"
# with open(marketaux_path, "w") as f:
#     json.dump(temp_d_1, f)

In [28]:
# Alpha Vantage
alpha_base_url = "https://www.alphavantage.co/query"
# The API key is a credential, so it is read from the environment rather
# than stored in the notebook. Export ALPHAVANTAGE_API_KEY before running.
# NOTE(review): the key that used to be committed on this line should be
# treated as leaked and rotated.
alpha_api_key = os.environ.get("ALPHAVANTAGE_API_KEY", "")

def convert_time(time_str):
    """Normalise a date or date-time string to the ``YYYYMMDDTHHMM`` layout.

    Accepts ``'YYYY-MM-DDTHH:MM'`` or a bare ``'YYYY-MM-DD'``; a bare date is
    treated as midnight (hour 0, minute 0).

    Raises:
        ValueError: If the input matches neither accepted layout.
    """
    # Most specific layout first; a date-only match parses to 00:00.
    accepted_layouts = ('%Y-%m-%dT%H:%M', '%Y-%m-%d')
    for layout in accepted_layouts:
        try:
            parsed = datetime.strptime(time_str, layout)
        except ValueError:
            continue
        return parsed.strftime('%Y%m%dT%H%M')
    raise ValueError("Invalid date format. Expected 'YYYY-MM-DD' or 'YYYY-MM-DDTHH:MM'")

def request_news_alpha(ticker, start, end, timeout=30):
    """Fetch Alpha Vantage NEWS_SENTIMENT data for one ticker and time window.

    Args:
        ticker: Symbol sent as the ``tickers`` query parameter.
        start: Window start, ``'YYYY-MM-DD'`` or ``'YYYY-MM-DDTHH:MM'``.
        end: Window end, same formats as ``start``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body, or ``None`` when the request fails or the
        server answers with a non-200 status (an error line is printed).

    Raises:
        ValueError: If ``start`` or ``end`` is not in an accepted format
            (raised by ``convert_time``).
    """
    # Let requests build and URL-encode the query string instead of
    # concatenating raw values into the URL.
    params = {
        "function": "NEWS_SENTIMENT",
        "tickers": ticker,
        "time_from": convert_time(start),
        "time_to": convert_time(end),
        "apikey": alpha_api_key,
    }
    try:
        # Without a timeout a stalled connection would hang the cell forever.
        resp = requests.get(alpha_base_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Request Stock Error {exc}")
        return None
    if resp.status_code != 200:
        print(f"Request Stock Error {resp.status_code}")
        return None
    return resp.json()

In [49]:



def process_news_feed_alpha(returned_json, ticker): # json return for this one does not have ticker
    """Split an Alpha Vantage news response into article and ticker-sentiment tables.

    Args:
        returned_json: Decoded response as returned by ``request_news_alpha``.
            May be ``None`` (failed request) or lack a ``feed`` list.
        ticker: Symbol the request was made for; stored on every row because
            the feed items do not carry it themselves.

    Returns:
        ``(main_df, relation_df)``: ``main_df`` has one row per article and
        ``relation_df`` one row per (article, ticker_sentiment) pair. Both
        frames always carry their full column set, even when empty.

    Raises:
        ValueError: If ``time_published`` is present but is not in the
            ``YYYYMMDDTHHMMSS`` layout.
    """
    main_cols = ['id', 'publish_time', 'title', 'article_url',
                 'ticker', 'publisher', 'description', 'keywords']
    relation_cols = ['news_id', 'source_ticker', 'time', 'ticker',
                     'relevance_score', 'ticker_sentiment_score',
                     'ticker_sentiment_label']

    # A failed request yields None, and a response without 'feed' has no
    # articles; both are treated as "no articles" instead of raising.
    feed = (returned_json or {}).get('feed') or []
    if not feed:
        return pd.DataFrame(columns=main_cols), pd.DataFrame(columns=relation_cols)

    # Define UTC and Eastern Time zones
    utc = pytz.utc
    eastern = pytz.timezone('America/New_York')

    main_list = []
    relation_list = []

    for item in feed:
        # Parse and convert `time_published` to Eastern Time if it exists
        publish_time = item.get('time_published')
        if publish_time:
            # Parse the "YYYYMMDDTHHMMSS" format and assume it's in UTC
            dt_utc = datetime.strptime(publish_time, "%Y%m%dT%H%M%S").replace(tzinfo=utc)
            publish_time_str = dt_utc.astimezone(eastern).strftime('%Y-%m-%d %H:%M:%S')
        else:
            publish_time_str = None

        # Fall back to the article URL when the item has no 'id', so relation
        # rows can still be joined back to their article.
        # NOTE(review): presumably the feed never carries 'id' — confirm.
        news_id = item.get('id') or item.get('url', "")

        # Extract main data fields
        main_list.append({
            'id': news_id,
            'publish_time': publish_time_str,
            'title': item.get('title', ""),
            'article_url': item.get('url', ""),
            'ticker': ticker,
            'publisher': item.get('source', ""),
            'description': item.get('summary', ""),
            'keywords': ",".join([topic['topic'] for topic in item.get('topics', [])])
        })

        # Extract ticker sentiment details for relation data
        for relation in item.get('ticker_sentiment') or []:
            relation_list.append({
                'news_id': news_id,
                'source_ticker': ticker,
                'time': publish_time_str,
                'ticker': relation.get('ticker'),
                'relevance_score': relation.get('relevance_score', ""),
                'ticker_sentiment_score': relation.get('ticker_sentiment_score', ""),
                'ticker_sentiment_label': relation.get('ticker_sentiment_label', "")
            })

    # Explicit columns keep the schema stable even if no sentiment rows exist
    main_df = pd.DataFrame(main_list, columns=main_cols)
    relation_df = pd.DataFrame(relation_list, columns=relation_cols)

    return main_df, relation_df

In [50]:
# Download Alpha Vantage news for every ticker over one window and store it as CSV.
date = ('2024-10-01', '2024-10-03')

alpha_dir = "dataset/news_alpha"
os.makedirs(alpha_dir, exist_ok=True)  # to_csv does not create missing folders

for ticker in tickers:
    response_alpha = request_news_alpha(ticker, date[0], date[1])
    if not response_alpha or 'feed' not in response_alpha:
        # A response without 'feed' has no articles; skip this ticker instead
        # of aborting the whole run.
        print(f"Skipping {ticker}: no Alpha Vantage feed returned")
        continue
    main_df_alpha, relation_df_alpha = process_news_feed_alpha(response_alpha, ticker)
    main_df_alpha.to_csv(f"{alpha_dir}/{ticker}_main_{date[0]}_{date[1]}.csv", index=False)
    relation_df_alpha.to_csv(f"{alpha_dir}/{ticker}_relation_{date[0]}_{date[1]}.csv", index=False)


In [None]:
# date acquired
# ('2024-10-01', '2024-10-03')

In [30]:
# main_df_alpha, relation_df_alpha = process_news_feed_alpha(request_news_alpha('AAPL', '2024-10-01', '2024-10-03'), 'AAPL')



In [51]:
# main_df_alpha.to_csv("dataset/news_alpha/AAPL_main_1.csv", index=False)
# relation_df_alpha.to_csv("dataset/news_alpha/AAPL_relation_df_1.csv", index=False)

In [52]:
# data_alpha = filter_returns_alpha(request_news_alpha('AAPL', '2024-10-01', '2024-10-03'))

In [53]:
# print(len(data_alpha))

In [54]:
# print(data_alpha)

In [55]:
# temp_d_2 = request_news_alpha('AAPL', '2024-10-01', '2024-10-05')

# alpha_path = "alpha.json"
# with open(alpha_path, "w") as f:
#     json.dump(temp_d_2, f)

In [56]:
# time_l = [
#     ('2023-10-01', '2023-10-05'),
#     ('2023-10-01', '2023-10-10'),
#     ('2023-10-01', '2023-11-01'),
#     ('2023-10-01', '2023-12-30'),
#     ('2023-10-01', '2024-06-01'),
# ]

# for t in time_l:
#     dt = filter_returns_alpha(request_news_alpha('AAPL', t[0], t[1]))
#     print(len(dt))

In [57]:
# print(dt)

In [58]:
# print(len(filter_returns_alpha(request_news_alpha('AAPL', '2023-10-01', '2023-10-04'))))