# **Import modules**

In [25]:
import pandas as pd
import numpy as np
from polygon import RESTClient
import yfinance as yf
from newsapi import NewsApiClient
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import pytz
import tkinter as tk
from tkinter import messagebox, ttk
from tkcalendar import Calendar
import time
from requests.exceptions import HTTPError

# Enable the pandas 'future.no_silent_downcasting' option for this session.
pd.set_option('future.no_silent_downcasting', True)

# NOTE(review): this NewsAPI key is committed in source; consider loading it
# from an environment variable or a local config file instead.
news_api_key = '6b423b01d98e47859e2ecbc296aa9b2b'

# Module-level placeholder. The functions visible in this file bind their own
# local `ticker` and do not read this global.
ticker = None

In [26]:
def validate_date(date_obj):
    """
    Return `date_obj` as a timezone-aware datetime in US/Eastern.

    The wall-clock fields (year, month, day, hour, ...) are kept as they are
    and only the timezone is attached; any tzinfo already present on
    `date_obj` is discarded first.

    Args:
        date_obj: A `datetime.datetime` instance (naive or aware).

    Returns:
        A timezone-aware datetime whose tzinfo is the US/Eastern zone with
        the EST/EDT offset that applies on that date.
    """
    eastern = pytz.timezone('US/Eastern')
    # pytz zones must be attached with localize(): passing a pytz zone to
    # datetime.replace(tzinfo=...) silently picks the zone's historical LMT
    # offset (about -4:56 for US/Eastern) instead of EST/EDT.
    return eastern.localize(date_obj.replace(tzinfo=None))

def validate_interval(interval):
    """
    Check an interval code against the supported set.

    Returns the interval unchanged when it is one of
    '1m', '5m', '30m', '1d', '1wk', '1mo'; otherwise returns None.
    """
    supported = ('1m', '5m', '30m', '1d', '1wk', '1mo')
    if interval in supported:
        return interval
    return None

### **Fetch and Clean historical stock data**

In [27]:
def fetch_historical_stock_data(tickers, start_date, end_date, interval='5m', timeout=30):
    """
    Download aggregate bars for each ticker from Polygon.io and return them
    as one combined DataFrame.

    Args:
        tickers: Iterable of ticker symbols to request.
        start_date: Datetime marking the start of the window; converted to UTC.
        end_date: Datetime marking the end of the window; converted to UTC and
            clamped to the current time when it lies in the future.
        interval: One of '1m', '5m', '30m', '1d', '1wk', '1mo'.
        timeout: Accepted for interface compatibility; not used by this
            function body.

    Returns:
        DataFrame with the columns Date, Open, High, Low, Close, Volume,
        Adj Close, Number of Trades, VWAP, Is OTC and Ticker. An empty frame
        with those columns is returned when nothing could be fetched.

    Raises:
        ValueError: If `interval` is not a supported code.
    """
    # Reject unsupported interval codes before doing any work.
    if interval not in ('1m', '5m', '30m', '1d', '1wk', '1mo'):
        raise ValueError("Invalid interval. Choose from '1m', '5m', '30m', '1d', '1wk', '1mo'")

    # Interval code -> (timespan, multiplier) pair passed to get_aggs.
    interval_mapping = {
        '1m': ('minute', 1),
        '5m': ('minute', 5),
        '30m': ('minute', 30),
        '1d': ('day', 1),
        '1wk': ('week', 1),
        '1mo': ('month', 1),
    }
    timespan, multiplier = interval_mapping[interval]

    # Express the requested window in UTC.
    window_start = start_date.astimezone(pytz.UTC)
    window_end = end_date.astimezone(pytz.UTC)

    # A window end in the future is pulled back to the present moment.
    current_time = datetime.now(pytz.UTC)
    if window_end > current_time:
        window_end = current_time
        print("Adjusted end date to current date as future dates are not valid.")

    # The request uses date-only strings (YYYY-MM-DD).
    from_ = window_start.strftime('%Y-%m-%d')
    to_ = window_end.strftime('%Y-%m-%d')

    # NOTE(review): the Polygon.io API key is hardcoded here.
    client = RESTClient(api_key="ryZeJ9HSNMP_wMiLfWCm07mlAeloVzDc")

    required_columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close', 'Number of Trades', 'VWAP', 'Is OTC', 'Ticker']
    frames = []

    for ticker in tickers:
        try:
            aggs = client.get_aggs(
                ticker=ticker,
                multiplier=multiplier,
                timespan=timespan,
                from_=from_,
                to=to_,
                limit=50000,
                adjusted=True,
            )

            # Debug output: dumps the raw response for this ticker.
            print(f"Response for {ticker}: {aggs}")

            if not aggs:
                print(f"No data returned for {ticker}")
                continue

            # One flat record per aggregate bar.
            rows = []
            for agg in aggs:
                rows.append({
                    'Date': pd.to_datetime(agg.timestamp, unit='ms'),
                    'Open': agg.open,
                    'High': agg.high,
                    'Low': agg.low,
                    'Close': agg.close,
                    'Volume': agg.volume,
                    # Mirrors Close because bars are requested with adjusted=True.
                    'Adj Close': agg.close,
                    'Number of Trades': getattr(agg, 'transactions', None),
                    'VWAP': getattr(agg, 'vwap', None),
                    'Is OTC': getattr(agg, 'otc', False),
                    'Ticker': ticker,
                })
            frame = pd.DataFrame(rows)

            print(f"Fetched {len(frame)} data points for {ticker} at {interval} interval")
            frames.append(frame)

        except Exception as e:
            # Best effort: a failure for one ticker does not stop the others.
            print(f"Error fetching data for {ticker}: {e}")

    if not frames:
        print("No data collected for any tickers.")
        return pd.DataFrame(columns=required_columns)

    combined = pd.concat(frames, ignore_index=True)
    combined['Date'] = pd.to_datetime(combined['Date'])
    return combined[required_columns]

In [28]:
def clean_stock_data(data, max_pct_change=0.05):
    """
    Clean stock data by handling missing values and ensuring data quality.

    Processing steps, in order:
      1. Forward-fill price columns within each ticker.
      2. Fill missing Volume / Number of Trades with 0 and Is OTC with False.
      3. Drop rows still missing any essential field.
      4. Drop duplicate (Ticker, Date) rows, keeping the last occurrence.
      5. Clip prices, volume and trade counts at a minimum of 0.
      6. Drop rows whose close-to-close change versus the previous row of the
         same ticker is `max_pct_change` or more in absolute value.

    The input frame is never modified; a cleaned copy is returned.

    Args:
        data: DataFrame with the columns Date, Open, High, Low, Close, Volume,
            Adj Close, Number of Trades, VWAP, Is OTC and Ticker.
        max_pct_change: Threshold for step 6, as a fraction (0.05 means 5%).

    Returns:
        The cleaned DataFrame. When `data` is empty or lacks a required
        column, `data` itself is returned unchanged after printing a message.
    """
    if data.empty:
        print("No data to clean.")
        return data

    required_columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close', 'Number of Trades', 'VWAP', 'Is OTC', 'Ticker']
    if not all(col in data.columns for col in required_columns):
        print("Missing required columns.")
        return data

    price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'VWAP']

    # Work on a copy so the caller's DataFrame is left untouched.
    cleaned = data.copy()

    # Forward-fill per ticker: a plain ffill() over the combined frame would
    # carry the last prices of one ticker into the leading gaps of the next.
    cleaned[price_columns] = cleaned.groupby('Ticker')[price_columns].ffill()
    cleaned['Volume'] = cleaned['Volume'].fillna(0)
    cleaned['Number of Trades'] = cleaned['Number of Trades'].fillna(0)
    cleaned['Is OTC'] = cleaned['Is OTC'].fillna(False)

    cleaned = cleaned.dropna(subset=['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close', 'Ticker'])
    # .copy() so the column assignments below act on an independent frame
    # rather than on a filtered slice.
    cleaned = cleaned.drop_duplicates(subset=['Ticker', 'Date'], keep='last').copy()

    for col in price_columns + ['Volume', 'Number of Trades']:
        cleaned[col] = cleaned[col].clip(lower=0)

    # The first row of each ticker has no previous close, so its change is
    # NaN. NaN compares False against the threshold, so it is kept explicitly;
    # otherwise every ticker would silently lose its first row.
    pct_change = cleaned.groupby('Ticker')['Close'].pct_change()
    keep = pct_change.isna() | (pct_change.abs() < max_pct_change)
    return cleaned[keep]

# **Fetch news headlines**

In [29]:
def _iter_article_rows(articles, ticker, category):
    """Yield one flat row dict per NewsAPI article, tagged with ticker and category."""
    for article in articles:
        yield {
            'Ticker': ticker,
            'Date': pd.to_datetime(article['publishedAt']),
            'Title': article['title'],
            # The description may be missing (None); store an empty string.
            'Description': article['description'] or '',
            'Source': article['source']['name'],
            'URL': article['url'],
            'Category': category,
        }


def fetch_news_data(tickers, api_key, lookback_days=30):
    """
    Fetch news articles including company-specific, global, and political news.

    For every ticker two NewsAPI "everything" searches are run over the
    window [today - lookback_days, today]: one for company news and one for
    industry / global / political news.

    Args:
        tickers: Iterable of ticker symbols.
        api_key: NewsAPI key passed to `NewsApiClient`.
        lookback_days: How many days back from today to search. The default
            of 30 is meant to stay inside the roughly one-month history limit
            of the NewsAPI free plan (NOTE(review): confirm against the plan
            actually in use).

    Returns:
        DataFrame with the columns Ticker, Date, Title, Description, Source,
        URL and Category ('Company' or 'Global/Political'); empty when no
        article was retrieved.
    """
    newsapi = NewsApiClient(api_key=api_key)
    news_data = []

    # `from_param` must be the OLDEST date and `to` the NEWEST one; passing
    # them the other way round asks for an inverted (empty) window.
    now = datetime.now(pytz.UTC)
    oldest = (now - timedelta(days=lookback_days)).strftime('%Y-%m-%d')
    newest = now.strftime('%Y-%m-%d')

    for ticker in tickers:
        queries = (
            (f"{ticker} stock", 'Company'),
            (f"{ticker} industry OR global economy OR political {ticker}", 'Global/Political'),
        )
        try:
            for query, category in queries:
                response = newsapi.get_everything(
                    q=query,
                    from_param=oldest,
                    to=newest,
                    language='en',
                    sort_by='publishedAt'
                )
                news_data.extend(_iter_article_rows(response['articles'], ticker, category))
        except Exception as e:
            # Best effort: a failure for one ticker does not stop the others.
            print(f"Error fetching news for {ticker}: {e}")

    news_df = pd.DataFrame(news_data)
    if not news_df.empty:
        print(f"Fetched {len(news_df)} news articles for {tickers}")
        print(f"News columns: {news_df.columns.tolist()}")
    else:
        print(f"No news articles found for {tickers} in the specified date range.")
    return news_df

In [30]:
def fetch_financial_reports(tickers):
    """
    Download the income statement, balance sheet and cash-flow statement of
    every ticker via yfinance.

    Each statement is written to its own CSV file
    (income_statement_<T>.csv, balance_sheet_<T>.csv, cash_flow_<T>.csv) and
    all statements are also concatenated into financial_reports.csv.

    Returns:
        The concatenated DataFrame, or an empty DataFrame when nothing could
        be retrieved.
    """
    collected = []

    for ticker in tickers:
        try:
            stock = yf.Ticker(ticker)
            # (yfinance attribute, output file) in the order they are processed.
            statements = (
                ('financials', f"income_statement_{ticker}.csv"),
                ('balance_sheet', f"balance_sheet_{ticker}.csv"),
                ('cashflow', f"cash_flow_{ticker}.csv"),
            )
            for attribute, csv_name in statements:
                statement = getattr(stock, attribute).reset_index()
                statement['Ticker'] = ticker
                collected.append(statement)
                statement.to_csv(csv_name, index=False)

        except Exception as e:
            # Statements gathered before the failure stay in `collected`.
            print(f"Error fetching financial reports for {ticker}: {e}")

    if not collected:
        print("No financial data retrieved.")
        return pd.DataFrame()

    combined_financials = pd.concat(collected, ignore_index=True)
    combined_financials.to_csv(f"financial_reports.csv", index=False)
    return combined_financials

In [31]:
def create_gui():
    """
    Create a stylish tkinter GUI to fetch stock, financial, and news data.

    The window collects a ticker symbol, a start date, an end date and an
    interval. On Submit the inputs are validated, the window is destroyed and
    the fetch pipeline runs synchronously, writing cleaned_stock_data.csv,
    financial_data.csv and news_data.csv. On Cancel the window is closed
    without fetching. The function blocks in the Tk main loop until the
    window is closed.
    """
    def open_calendar(is_start_date):
        # Pop-up date picker; `is_start_date` selects which field receives
        # the chosen date (True -> start date, False -> end date).
        top = tk.Toplevel(root)
        top.title("Select Date")
        top.configure(bg='#f0f4f8')
        top.geometry("300x300")
        
        cal = Calendar(top, selectmode='day', date_pattern='yyyy-mm-dd', background='#ffffff', foreground='#333333', bordercolor='#cccccc', headersbackground='#e0e8f0', normalbackground='#f0f4f8')
        cal.pack(pady=10, padx=10)
        
        def select_date():
            # Copy the picked date into the matching field, clear any error
            # message and close the pop-up.
            selected_date = cal.get_date()
            if is_start_date:
                start_date_var.set(selected_date)
            else:
                end_date_var.set(selected_date)
            error_label.config(text="", fg="#d32f2f")
            top.destroy()
        
        tk.Button(top, text="Select", command=select_date, bg='#4CAF50', fg='white', font=('Helvetica', 10, 'bold'), padx=10, pady=5).pack(pady=10)

    def submit():
        # Read the current form values; the ticker is trimmed and upper-cased.
        start_date_str = start_date_var.get()
        end_date_str = end_date_var.get()
        interval = interval_var.get()
        ticker = ticker_var.get().strip().upper()
        
        # Reset the error message before validating again.
        error_label.config(text="", fg="#d32f2f")
        
        if not ticker:
            error_label.config(text="Please enter a valid stock ticker (e.g., AAPL).", fg="#d32f2f")
            return
        
        try:
            # Parse the YYYY-MM-DD strings, then attach the US/Eastern zone.
            start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
            end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
            start_date = validate_date(start_date)
            end_date = validate_date(end_date)
        except ValueError:
            error_label.config(text="Invalid date format. Use YYYY-MM-DD.", fg="#d32f2f")
            return
        
        # NOTE(review): validate_date as defined in this file always returns
        # a datetime, so this guard looks unreachable -- confirm.
        if start_date is None or end_date is None:
            error_label.config(text="Invalid date selection. Please select valid dates.", fg="#d32f2f")
            return
        if not validate_interval(interval):
            error_label.config(text="Invalid interval. Choose from dropdown.", fg="#d32f2f")
            return
        if end_date < start_date:
            error_label.config(text="End date must be on or after start date.", fg="#d32f2f")
            return
        
        # Maximum allowed span, in days, for each interval.
        range_limits = {
            '1m': 7, '5m': 60, '30m': 60, '1d': 730, '1wk': 1825, '1mo': 7300}
        
        max_days = range_limits.get(interval, 60)
        days_diff = (end_date - start_date).days
        if days_diff > max_days:
            error_label.config(text=f"{interval} interval limited to {max_days} days.", fg="#d32f2f")
            return
        
        # All checks passed: close the window before the fetching starts.
        # NOTE(review): quit() is called on a root that was just destroyed --
        # confirm this order is intended.
        root.destroy()
        root.quit()
        
        tickers = [ticker]
        
        print(f"\nFetching stock data for {tickers} from {start_date.date()} to {end_date.date()} at {interval} interval...")
        
        def fetch_data():
            # Run the three fetch steps in sequence, saving each result to a
            # CSV file; any failure is printed and shown in an error dialog.
            try:
                historical_data = fetch_historical_stock_data(tickers, start_date, end_date, interval)
                if historical_data is None or historical_data.empty:
                    raise ValueError("No historical data retrieved")

                cleaned_data = clean_stock_data(historical_data)
                cleaned_data.to_csv("cleaned_stock_data.csv", index=False)
                
                print(f"\nFetching financial reports for {tickers}...")
                financial_data = fetch_financial_reports(tickers)
                financial_data.to_csv("financial_data.csv", index=False)
                
                print(f"\nFetching news data...")
                news_data = fetch_news_data(tickers, news_api_key)
                news_data.to_csv("news_data.csv", index=False)
                
            except Exception as e:
                print(f"Error processing data: {e}")
                # The main window is already destroyed, so a hidden temporary
                # root is created to act as parent of the error dialog.
                temp_root = tk.Tk()
                temp_root.withdraw()
                messagebox.showerror("Error", f"Error processing data: {e}", parent=temp_root)
                temp_root.destroy()
                
        # Called immediately, in the same thread as the button callback.
        fetch_data()

    def cancel():
        # Close the window without fetching anything.
        root.destroy()
        root.quit()

    def clear_error_on_interaction(*args):
        # Variable-trace callback: any edit of a form field clears the error text.
        error_label.config(text="", fg="#d32f2f")

    # --- Main window ---
    root = tk.Tk()
    root.title("Stock Data Analyzer")
    root.geometry("400x600")
    root.configure(bg='#f0f4f8')

    # Title banner across the top.
    title_frame = tk.Frame(root, bg='#4a90e2', height=50)
    title_frame.pack(fill='x')
    tk.Label(title_frame, text="Stock Data Analyzer", font=('Helvetica', 16, 'bold'), fg='white', bg='#4a90e2', padx=20, pady=10).pack()

    # Container for the form fields.
    content_frame = tk.Frame(root, bg='#f0f4f8', padx=20, pady=20)
    content_frame.pack(expand=True, fill='both')

    # Ticker entry, pre-filled with 'AAPL'.
    tk.Label(content_frame, text="STOCK TICKER (e.g., AAPL):", font=('Helvetica', 10), bg='#f0f4f8', fg='#333333').pack(pady=(10, 5))
    ticker_var = tk.StringVar(value='AAPL')
    ticker_entry = tk.Entry(content_frame, textvariable=ticker_var, font=('Helvetica', 10), width=30, borderwidth=2, relief='groove')
    ticker_entry.pack(pady=5)
    ticker_var.trace('w', clear_error_on_interaction)

    # Start date: read-only entry, defaulting to today, set via the calendar pop-up.
    tk.Label(content_frame, text="START DATE", font=('Helvetica', 10), bg='#f0f4f8', fg='#333333').pack(pady=(10, 5))
    start_date_var = tk.StringVar(value=datetime.now().strftime('%Y-%m-%d'))
    start_date_entry = tk.Entry(content_frame, textvariable=start_date_var, font=('Helvetica', 10), width=30, state='readonly', borderwidth=2, relief='groove')
    start_date_entry.pack(pady=5)
    tk.Button(content_frame, text="Select Start Date", command=lambda: open_calendar(True), font=('Helvetica', 10), bg='#4CAF50', fg='white', padx=10, pady=5).pack(pady=5)
    start_date_var.trace('w', clear_error_on_interaction)

    # End date: same layout as the start date.
    tk.Label(content_frame, text="END DATE", font=('Helvetica', 10), bg='#f0f4f8', fg='#333333').pack(pady=(10, 5))
    end_date_var = tk.StringVar(value=datetime.now().strftime('%Y-%m-%d'))
    end_date_entry = tk.Entry(content_frame, textvariable=end_date_var, font=('Helvetica', 10), width=30, state='readonly', borderwidth=2, relief='groove')
    end_date_entry.pack(pady=5)
    tk.Button(content_frame, text="Select End Date", command=lambda: open_calendar(False), font=('Helvetica', 10), bg='#4CAF50', fg='white', padx=10, pady=5).pack(pady=5)
    end_date_var.trace('w', clear_error_on_interaction)

    # Interval drop-down, defaulting to '5m'.
    tk.Label(content_frame, text="INTERVAL", font=('Helvetica', 10), bg='#f0f4f8', fg='#333333').pack(pady=(10, 5))
    interval_var = tk.StringVar(value='5m')
    interval_menu = ttk.OptionMenu(content_frame, interval_var, '5m', '1m', '5m', '30m', '1d', '1wk', '1mo', style='Custom.TMenubutton')
    interval_menu.pack(pady=5)
    interval_var.trace('w', clear_error_on_interaction)

    # Label used to show validation errors.
    error_label = tk.Label(content_frame, text="", font=('Helvetica', 10), fg="#d32f2f", bg='#f0f4f8', wraplength=400)
    error_label.pack(pady=10)

    # Submit / Cancel buttons.
    button_frame = tk.Frame(content_frame, bg='#f0f4f8')
    button_frame.pack(pady=15, fill='x')
    tk.Button(button_frame, text="Submit", command=submit, font=('Helvetica', 10, 'bold'), bg='#2196F3', fg='white', padx=10, pady=5).pack(side=tk.LEFT, padx=40)
    tk.Button(button_frame, text="Cancel", command=cancel, font=('Helvetica', 10, 'bold'), bg='#f44336', fg='white', padx=10, pady=5).pack(side=tk.RIGHT, padx=40)

    # ttk theme plus the 'Custom.TMenubutton' style named by the interval menu above.
    style = ttk.Style()
    style.theme_use('clam')
    style.configure('Custom.TMenubutton', background='#e0e8f0', foreground='#333333', font=('Helvetica', 10), width=27)

    root.mainloop()

In [32]:
# Launch the GUI; create_gui() ends in root.mainloop(), so this call blocks
# until the Tk main loop exits.
create_gui()


Fetching stock data for ['AAPL'] from 2024-05-03 to 2025-05-03 at 1d interval...
Response for AAPL: [Agg(open=186.645, high=187, low=182.66, close=183.38, volume=163224109.0, vwap=184.3674, timestamp=1714708800000, transactions=1468505, otc=None), Agg(open=182.354, high=184.2, low=180.42, close=181.71, volume=78319667.0, vwap=181.8711, timestamp=1714968000000, transactions=898763, otc=None), Agg(open=183.45, high=184.9, low=181.32, close=182.4, volume=77298571.0, vwap=182.7741, timestamp=1715054400000, transactions=747518, otc=None), Agg(open=182.85, high=183.07, low=181.45, close=182.74, volume=45057087.0, vwap=182.4822, timestamp=1715140800000, transactions=518609, otc=None), Agg(open=182.56, high=184.66, low=182.11, close=184.57, volume=48948972.0, vwap=184.005, timestamp=1715227200000, transactions=551004, otc=None), Agg(open=184.9, high=185.09, low=182.13, close=183.05, volume=50745096.0, vwap=183.097, timestamp=1715313600000, transactions=558778, otc=None), Agg(open=185.435, hig