# Scrape news sentiment data for Microsoft (MSFT) only, going back up to one year.

In [5]:
import requests
from pprint import pprint
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime, timedelta
import json
import time
from tqdm import tqdm
import calendar
import os
import logging
# Set up a module-level logger that writes INFO-and-above records to a file.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# logging.getLogger returns the same logger object every time this code runs
# in one interpreter session (e.g. when the notebook cell is re-executed), so
# attach the file handler only once. Without this guard each re-run adds
# another handler and every record is written to the log file multiple times.
if not logger.handlers:
    # create a file handler
    handler = logging.FileHandler('msft_news_scrape.log')
    handler.setLevel(logging.INFO)
    # create a logging format: timestamp - logger name - level - message
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    handler.setFormatter(formatter)
    # add the handler to the logger
    logger.addHandler(handler)
# Apply seaborn's default plot styling.
sns.set()
# Let pandas show up to 1000 characters per cell before truncating, so long
# text columns stay readable when a frame is displayed.
pd.options.display.max_colwidth = 1000

In [6]:
import string
import random

def get_data(parameters, *, timeout=30):
    """Query the Alpha Vantage endpoint, retrying with a new random API key.

    Up to 200 attempts are made. Each attempt sends the given query
    parameters together with a freshly generated 15-character ``apikey``
    made of uppercase letters and digits.

    Args:
        parameters: Mapping of query parameters for the request. It is
            copied, so the caller's mapping is left unmodified.
        timeout: Seconds to wait for the server before ``requests`` gives
            up on a single attempt.

    Returns:
        The decoded JSON body of the first HTTP 200 response that does not
        contain a ``'Note'`` key, or ``None`` if no attempt succeeded.

    Raises:
        requests.RequestException: If a request fails at the transport
            level (connection error, timeout, ...).
    """
    endpoint = "https://www.alphavantage.co/query"
    # Work on a copy so injecting the API key does not leak into the caller's dict.
    query = dict(parameters)
    for _ in range(200):
        query['apikey'] = ''.join(random.choices(string.ascii_uppercase + string.digits, k=15))
        # Send a GET request to the API endpoint
        response = requests.get(endpoint, params=query, timeout=timeout)
        if response.status_code != 200:
            # An error body is not guaranteed to be JSON; fall back to the
            # raw text so that logging the failure cannot itself raise.
            try:
                detail = response.json()
            except ValueError:
                detail = response.text
            logger.error(f'API key {query["apikey"]} has returned an error. response note: {detail}')
            continue
        data = response.json()
        if 'Note' not in data:
            return data
        # A 'Note' entry is treated as a usage-limit notice: log it, pause
        # briefly and try again with another key.
        logger.warning(f'API key {query["apikey"]} has been used too many times. response note: {data["Note"]}')
        time.sleep(1)
    # Every attempt failed; report that explicitly instead of relying on
    # whatever value the last iteration happened to leave behind.
    return None

In [7]:
def sweep_news_in_range(start_time: datetime, day_range: int = 100):
    """Fetch MSFT news sentiment one day at a time, walking backwards.

    For each of ``day_range`` consecutive one-day windows ending at
    ``start_time`` (most recent window first), the API is queried once per
    sort order ('LATEST', 'EARLIEST', 'RELEVANCE') with a limit of 200.
    Every successful response is written to
    ``sentiments/sentiments_<sort>_<time_from>.json`` and collected.

    Args:
        start_time: End of the most recent one-day window.
        day_range: Number of one-day windows to sweep.

    Returns:
        List of the responses that were fetched successfully, in request
        order. Requests for which ``get_data`` returned ``None`` are logged
        and skipped.
    """
    stamp_format = '%Y%m%dT%H%M'
    out_dir = 'sentiments'
    # make sure the output folder exists; a pre-existing folder is fine
    os.makedirs(out_dir, exist_ok=True)

    collected = []
    for offset in tqdm(range(day_range)):
        # the one-day window is the same for all three sort orders
        time_to = (start_time - timedelta(days=offset)).strftime(stamp_format)
        time_from = (start_time - timedelta(days=offset + 1)).strftime(stamp_format)
        for sort_by in ('LATEST', 'EARLIEST', 'RELEVANCE'):
            # fetch the news sentiment for this one-day window
            payload = get_data({
                "function": "NEWS_SENTIMENT",
                'tickers': 'MSFT',
                "time_from": time_from,
                "time_to": time_to,
                "sort": sort_by,
                "limit": "200",
            })
            if payload is None:
                logger.error(f'Unnable to fetch sentiment data for {sort_by} from {time_from} to {time_to}')
                continue
            # persist the raw response, then keep it for the caller
            target = os.path.join(out_dir, f'sentiments_{sort_by}_{time_from}.json')
            with open(target, 'w') as f:
                json.dump(payload, f, indent=4)
            collected.append(payload)
    return collected

In [8]:
# Start sweeping from right now. To back-fill an older window instead,
# override start_time (e.g. datetime.strptime('2.3.2022', '%d.%m.%Y') or
# datetime.now() - timedelta(days=num_days_past)) and set day_range
# explicitly (e.g. 365) before the sweep below.
start_time = datetime.now()

# Load previously scraped sentiments, if any, so only the missing days are fetched.
try:
    sent_df = pd.read_csv('datasets/msft_sentiments.csv').drop(columns=['Unnamed: 0'], errors='ignore')
except FileNotFoundError:
    sent_df = pd.DataFrame()

# Without usable history, sweep a full year.
day_range = 365
if not sent_df.empty:
    # Use the most recent timestamp in the file rather than the first row,
    # so the result does not depend on how the CSV happens to be sorted.
    latest_seen = pd.to_datetime(sent_df['time']).max()
    if not pd.isna(latest_seen):
        # number of days between the newest stored article and now
        day_range = (start_time - latest_seen).days + 1

all_news = sweep_news_in_range(start_time, day_range)

  0%|          | 0/4 [00:07<?, ?it/s]


KeyboardInterrupt: 