# **Import modules**

In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
from newsapi import NewsApiClient
from fredapi import Fred
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime, timedelta
import pytz
import tkinter as tk
from tkinter import messagebox, ttk
from tkcalendar import Calendar

In [None]:
def validate_date(date_obj):
    """
    Interpret a datetime's wall-clock value as US/Eastern time.

    Args:
        date_obj (datetime): Datetime to tag with the US/Eastern zone. Any
            tzinfo already attached is discarded and the wall-clock fields
            are kept, matching the previous ``replace(tzinfo=...)`` intent.

    Returns:
        datetime | None: Timezone-aware datetime in US/Eastern, or None when
        ``date_obj`` is not a datetime-like object.
    """
    eastern = pytz.timezone('US/Eastern')
    try:
        # pytz zones must be attached with localize(). Passing a pytz zone to
        # datetime.replace(tzinfo=...) silently selects the zone's first
        # historical offset (LMT, -4:56) instead of EST/EDT, shifting every
        # timestamp by several minutes.
        return eastern.localize(date_obj.replace(tzinfo=None))
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Not a datetime (None, str, date, ...) or out of representable range.
        return None

def validate_interval(interval):
    """
    Check an interval string against the choices this tool supports.

    Args:
        interval: Candidate interval code (e.g. '5m', '1d').

    Returns:
        The same ``interval`` value when it is supported, otherwise None.
    """
    supported = ('1m', '5m', '30m', '1d', '1wk', '1mo')
    if interval in supported:
        return interval
    return None

### **Fetch and Clean historical stock data**

In [None]:
def fetch_historical_stock_data(tickers, start_date, end_date, interval='5m'):
    """
    Fetch historical stock data for one or more tickers at the given interval.

    Args:
        tickers (list | str): Ticker symbols (e.g., ['AAPL']). A single
            symbol passed as a plain string is treated as a one-item list.
        start_date: Start of the range; datetime values are converted to
            US/Eastern before the request, other values are passed through.
        end_date: End of the range, handled like ``start_date``.
        interval (str): Bar interval forwarded to ``yf.download``.

    Returns:
        pd.DataFrame: Long-format frame with columns Date, Open, High, Low,
        Close, Volume, Adj Close and Ticker. An empty DataFrame is returned
        when nothing could be collected or the download raised.
    """
    # A bare string would otherwise be measured and iterated character by
    # character ('AAPL' -> 4 "tickers"), so normalise the input first.
    if isinstance(tickers, str):
        tickers = [tickers]
    elif tickers is None:
        tickers = []
    else:
        tickers = list(tickers)

    if not tickers:
        print("No tickers provided.")
        return pd.DataFrame()

    historical_data = []
    required_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

    eastern = pytz.timezone('US/Eastern')
    if isinstance(start_date, datetime):
        start_date = start_date.astimezone(eastern)
    if isinstance(end_date, datetime):
        end_date = end_date.astimezone(eastern)

    try:
        df = yf.download(tickers, start=start_date, end=end_date, interval=interval)

        if df.empty:
            print("No data returned. Check if the date range includes trading hours or if tickers are valid.")
            return pd.DataFrame()

        if len(tickers) == 1:
            ticker = tickers[0]
            # Single-ticker downloads may still come back with two column
            # levels; keep only the first level (the field name).
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = [col[0] for col in df.columns]

            missing_columns = [col for col in required_columns if col not in df.columns]
            if missing_columns:
                print(f"Missing required columns for {ticker}: {missing_columns}")
                return pd.DataFrame()

            # The timestamp index becomes the first column; its name varies,
            # so normalise it to 'Date'.
            df = df.reset_index()
            if 'Date' not in df.columns:
                if df.columns[0] in ['Datetime', 'index', 'DateTime']:
                    df = df.rename(columns={df.columns[0]: 'Date'})
                else:
                    print(f"Unexpected index column: {df.columns[0]}")
                    return pd.DataFrame()

            df['Ticker'] = ticker
            # Fall back to Close when the download has no adjusted close.
            df['Adj Close'] = df.get('Adj Close', df['Close'])
            df = df[['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close', 'Ticker']]
            print(f"Fetched {len(df)} data points for {ticker} at {interval} interval")
            historical_data.append(df)
        else:
            for ticker in tickers:
                # Multi-ticker downloads carry the symbol in column level 1.
                if ticker not in df.columns.get_level_values(1):
                    print(f"No data returned for {ticker}")
                    continue
                ticker_df = df.xs(ticker, level=1, axis=1).copy()
                if ticker_df.empty:
                    print(f"No data returned for {ticker}")
                    continue

                missing_columns = [col for col in required_columns if col not in ticker_df.columns]
                if missing_columns:
                    print(f"Missing required columns for {ticker}: {missing_columns}")
                    continue

                ticker_df = ticker_df.reset_index()
                if 'Date' not in ticker_df.columns:
                    if ticker_df.columns[0] in ['Datetime', 'index', 'DateTime']:
                        ticker_df = ticker_df.rename(columns={ticker_df.columns[0]: 'Date'})
                    else:
                        print(f"Unexpected index column for {ticker}: {ticker_df.columns[0]}")
                        continue

                ticker_df['Ticker'] = ticker
                ticker_df['Adj Close'] = ticker_df.get('Adj Close', ticker_df['Close'])
                ticker_df = ticker_df[['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close', 'Ticker']]
                print(f"Fetched {len(ticker_df)} data points for {ticker} at {interval} interval")
                historical_data.append(ticker_df)

        if not historical_data:
            print("No data collected for any tickers. Possible non-trading day or invalid tickers.")
            return pd.DataFrame()

        combined_df = pd.concat(historical_data, ignore_index=True)
        combined_df['Date'] = pd.to_datetime(combined_df['Date'])
        return combined_df

    except Exception as e:
        # Boundary handler: download/reshape problems are reported and turned
        # into an empty result so callers only need to check ``.empty``.
        print(f"Error fetching data: {e}")
        return pd.DataFrame()

In [None]:
def clean_stock_data(data, max_abs_return=0.05):
    """
    Clean stock data by handling missing values and removing outlier bars.

    The input frame is not modified; a cleaned copy is returned.

    Args:
        data (pd.DataFrame): Long-format price data with columns Date, Open,
            High, Low, Close, Volume, Adj Close and Ticker.
        max_abs_return (float): Rows whose bar-to-bar change in Close (within
            the same ticker) is at or above this absolute fraction are
            dropped. Defaults to 0.05 (5%).

    Returns:
        pd.DataFrame: Cleaned data. ``data`` itself is returned unchanged when
        it is empty or lacks a required column.
    """
    if data.empty:
        print("No data to clean.")
        return data

    required_columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close', 'Ticker']
    if not all(col in data.columns for col in required_columns):
        print("Missing required columns.")
        return data

    price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']
    numeric_columns = price_columns + ['Volume']

    # Work on a copy so the caller's frame is left untouched.
    cleaned = data.copy()

    # Forward-fill within each ticker only; an ungrouped ffill would carry the
    # last price of one ticker into the first rows of the next.
    cleaned[price_columns] = cleaned.groupby('Ticker')[price_columns].ffill()
    cleaned['Volume'] = cleaned['Volume'].fillna(0)
    # Negative prices/volumes are invalid; clamp them to zero (NaN is kept
    # and removed by dropna below).
    cleaned[numeric_columns] = cleaned[numeric_columns].clip(lower=0)

    cleaned = cleaned.dropna()
    cleaned = cleaned.drop_duplicates(subset=['Ticker', 'Date'], keep='last')

    grouped = cleaned.groupby('Ticker')
    period_returns = grouped['Close'].pct_change()
    # The first bar of each ticker has no previous bar, so its return is NaN;
    # keep it explicitly instead of letting the comparison discard it.
    is_first_bar = grouped.cumcount() == 0
    keep = is_first_bar | (period_returns.abs() < max_abs_return)

    return cleaned[keep]

# **Fetch news headlines**

In [None]:
def fetch_news_data(tickers, api_key, start_date, end_date):
    """
    Collect NewsAPI articles for each ticker from two searches.

    For every ticker a company search ("<ticker> stock") is run first, then a
    broader industry/economy/political search; each article is tagged with
    the matching Category ('Company' or 'Global/Political').

    Args:
        tickers (list): Ticker symbols to search for.
        api_key (str): Key passed to ``NewsApiClient``.
        start_date: Range start; datetime values are converted to UTC.
        end_date: Range end; datetime values are converted to UTC.

    Returns:
        pd.DataFrame: One row per article with Ticker, Date, Title,
        Description, Source, URL and Category; empty when nothing was found.
    """
    client = NewsApiClient(api_key=api_key)

    if isinstance(start_date, datetime):
        start_date = start_date.astimezone(pytz.UTC)
    if isinstance(end_date, datetime):
        end_date = end_date.astimezone(pytz.UTC)

    records = []
    for ticker in tickers:
        try:
            # (query text, category label), run in this order.
            searches = (
                (f"{ticker} stock", 'Company'),
                (f"{ticker} industry OR global economy OR political {ticker}", 'Global/Political'),
            )
            for query, category in searches:
                response = client.get_everything(
                    q=query,
                    from_param=start_date.strftime('%Y-%m-%d'),
                    to=end_date.strftime('%Y-%m-%d'),
                    language='en',
                    sort_by='publishedAt'
                )
                for article in response['articles']:
                    records.append({
                        'Ticker': ticker,
                        'Date': pd.to_datetime(article['publishedAt']),
                        'Title': article['title'],
                        # A missing description is stored as an empty string.
                        'Description': article['description'] or '',
                        'Source': article['source']['name'],
                        'URL': article['url'],
                        'Category': category
                    })
        except Exception as e:
            # A failure abandons the remaining searches for this ticker only.
            print(f"Error fetching news for {ticker}: {e}")

    news_df = pd.DataFrame(records)
    if news_df.empty:
        print(f"No news articles found for {tickers} in the specified date range.")
    else:
        print(f"Fetched {len(news_df)} news articles for {tickers}")
        print(f"News columns: {news_df.columns.tolist()}")
    return news_df

In [None]:
def fetch_income_statement(tickers):
    """
    Download the income statement of each ticker and stack the results.

    Args:
        tickers (list): List of stock ticker symbols (e.g., ['AAPL']).

    Returns:
        pd.DataFrame: Income statement rows for all tickers that could be
        fetched, with a Ticker column; empty when none succeeded.
    """
    frames = []

    for symbol in tickers:
        try:
            statement = yf.Ticker(symbol).financials.reset_index()
            statement['Ticker'] = symbol
            frames.append(statement)
            print(f"Fetched income statement for {symbol}")
        except Exception as e:
            # A failing ticker is reported and skipped.
            print(f"Error fetching income statement for {symbol}: {e}")

    if not frames:
        print("No income statement data retrieved.")
        return pd.DataFrame()

    stacked = pd.concat(frames, ignore_index=True)
    print(f"Income statement columns: {stacked.columns.tolist()}")
    return stacked

In [None]:
def fetch_balance_sheet(tickers):
    """
    Download the balance sheet of each ticker and stack the results.

    Args:
        tickers (list): List of stock ticker symbols (e.g., ['AAPL']).

    Returns:
        pd.DataFrame: Balance sheet rows for all tickers that could be
        fetched, with a Ticker column; empty when none succeeded.
    """
    frames = []

    for symbol in tickers:
        try:
            sheet = yf.Ticker(symbol).balance_sheet.reset_index()
            sheet['Ticker'] = symbol
            frames.append(sheet)
            print(f"Fetched balance sheet for {symbol}")
        except Exception as e:
            # A failing ticker is reported and skipped.
            print(f"Error fetching balance sheet for {symbol}: {e}")

    if not frames:
        print("No balance sheet data retrieved.")
        return pd.DataFrame()

    stacked = pd.concat(frames, ignore_index=True)
    print(f"Balance sheet columns: {stacked.columns.tolist()}")
    return stacked

In [None]:
def fetch_cash_flow(tickers):
    """
    Download the cash flow statement of each ticker and stack the results.

    Args:
        tickers (list): List of stock ticker symbols (e.g., ['AAPL']).

    Returns:
        pd.DataFrame: Cash flow rows for all tickers that could be fetched,
        with a Ticker column; empty when none succeeded.
    """
    frames = []

    for symbol in tickers:
        try:
            flow = yf.Ticker(symbol).cashflow.reset_index()
            flow['Ticker'] = symbol
            frames.append(flow)
            print(f"Fetched cash flow statement for {symbol}")
        except Exception as e:
            # A failing ticker is reported and skipped.
            print(f"Error fetching cash flow statement for {symbol}: {e}")

    if not frames:
        print("No cash flow data retrieved.")
        return pd.DataFrame()

    stacked = pd.concat(frames, ignore_index=True)
    print(f"Cash flow columns: {stacked.columns.tolist()}")
    return stacked

In [None]:
def fetch_financial_reports(tickers):
    """
    Fetch financial statements (income statement, balance sheet, cash flow) for tickers
    and save each to a separate CSV file named with the ticker.

    The per-statement CSV files contain the data exactly as fetched. The
    combined frame returned to the caller additionally carries a Statement
    column so rows from the three statement types can be told apart.

    Args:
        tickers (list | str): Stock ticker symbols (e.g., ['AAPL']). A single
            symbol passed as a plain string is treated as a one-item list.

    Returns:
        pd.DataFrame: Combined DataFrame with all financial statements,
        including Ticker and Statement columns; empty when nothing was
        retrieved.
    """
    # Iterating a bare string would yield one "ticker" per character.
    if isinstance(tickers, str):
        tickers = [tickers]

    # (Statement label, wording used in the log line, CSV file prefix, fetcher)
    statement_sources = (
        ("Income Statement", "income statement", "income_statement", fetch_income_statement),
        ("Balance Sheet", "balance sheet", "balance_sheet", fetch_balance_sheet),
        ("Cash Flow", "cash flow statement", "cash_flow", fetch_cash_flow),
    )

    financial_data = []

    for ticker in tickers:
        for statement, description, file_prefix, fetch in statement_sources:
            statement_df = fetch([ticker])
            if statement_df.empty:
                continue

            csv_path = f"{file_prefix}_{ticker}.csv"
            statement_df.to_csv(csv_path, index=False)
            print(f"Saved {description} for {ticker} to {csv_path}")

            # Label a copy so the CSV written above keeps its original layout.
            labeled = statement_df.copy()
            labeled['Statement'] = statement
            financial_data.append(labeled)

    if not financial_data:
        print("No financial data retrieved.")
        return pd.DataFrame()

    combined_financials = pd.concat(financial_data, ignore_index=True)
    print(f"Financial data columns: {combined_financials.columns.tolist()}")
    return combined_financials

# **Exploratory data analysis**

In [None]:
def perform_eda(df, financial_df, news_df, ticker='AAPL', interval='5m'):
    """
    Print summary statistics for one ticker and plot its closing price.

    Args:
        df (pd.DataFrame): Stock data with at least Ticker, Date and Close.
        financial_df (pd.DataFrame): Accepted for interface compatibility;
            not read by this function.
        news_df (pd.DataFrame): News articles; when non-empty, article counts
            per Category are printed for ``ticker``.
        ticker (str): Symbol to analyse.
        interval (str): Bar interval, used for the plot title and x-axis label.

    Returns:
        None. Output is printed and the figure is shown with ``plt.show()``.
    """
    selected = df[df['Ticker'] == ticker].copy()

    if selected.empty:
        print(f"No stock data available for ticker {ticker}")
        return

    # Descriptive statistics of the price data
    print(f"\nSummary Statistics for {ticker} Stock Data:")
    print(selected.describe())

    # Article counts per news category
    if not news_df.empty:
        ticker_news = news_df[news_df['Ticker'] == ticker]
        print(f"\nNews Article Counts by Category for {ticker}:")
        print(ticker_news['Category'].value_counts())

    # Closing price line chart; intraday intervals label the x-axis as time
    if interval in ['1m', '5m', '30m']:
        x_label = 'Time'
    else:
        x_label = 'Date'

    plt.figure(figsize=(12, 6))
    plt.plot(selected['Date'], selected['Close'], label='Close Price', color='blue')
    plt.title(f'{ticker} Closing Price ({interval} Intervals)')
    plt.xlabel(x_label)
    plt.ylabel('Price (USD)')
    plt.grid(True)
    plt.legend()
    plt.show()

In [None]:
def create_gui():
    """
    Show a tkinter form for a date range and interval, then run the pipeline.

    On Submit the inputs are validated; problems are shown in the red error
    label and the form stays open. With valid input the window is closed and
    stock data, financial reports and news are fetched for AAPL, saved to CSV
    files, and passed to ``perform_eda``.

    The NewsAPI key is read from the ``NEWS_API_KEY`` environment variable;
    when it is not set the news download is skipped.

    Returns:
        None. The call blocks in ``root.mainloop()`` until the window closes.
    """
    def open_calendar(is_start_date):
        # Pop-up calendar that writes the picked day into the start or end field.
        top = tk.Toplevel(root)
        top.title("Select Date")
        top.geometry("300x300")

        cal = Calendar(top, selectmode='day', date_pattern='yyyy-mm-dd')
        cal.pack(pady=10)

        def select_date():
            selected_date = cal.get_date()
            if is_start_date:
                start_date_var.set(selected_date)
            else:
                end_date_var.set(selected_date)
            error_label.config(text="", fg="red")
            top.destroy()

        tk.Button(top, text="Select", command=select_date).pack(pady=10)

    def submit():
        # Validate the form, close it, then run fetch -> clean -> save -> EDA.
        start_date_str = start_date_var.get()
        end_date_str = end_date_var.get()
        interval = interval_var.get()

        error_label.config(text="", fg="red")

        try:
            start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
            end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
            start_date = validate_date(start_date)
            end_date = validate_date(end_date)
        except ValueError:
            error_label.config(text="Invalid date format. Use YYYY-MM-DD.", fg="red")
            return

        if start_date is None or end_date is None:
            error_label.config(text="Invalid date selection. Please select valid dates.", fg="red")
            return
        if not validate_interval(interval):
            # The message must be passed as ``text=``; a positional string is
            # treated as a config dict and raises.
            error_label.config(text="Invalid interval. Choose from dropdown.", fg="red")
            return
        if end_date < start_date:
            error_label.config(text="End date must be on or after start date.", fg="red")
            return

        # Maximum span (in days) accepted per interval.
        range_limits = {
            '1m': 7, '5m': 60, '30m': 60, '1d': 730, '1wk': 1825, '1mo': 7300}

        max_days = range_limits.get(interval, 60)
        days_diff = (end_date - start_date).days
        if days_diff > max_days:
            error_label.config(text=f"{interval} interval limited to {max_days} days.", fg="red")
            return

        root.destroy()

        tickers = ['AAPL']
        # Credentials are taken from the environment rather than embedded in
        # the source.
        news_api_key = os.environ.get('NEWS_API_KEY')

        print(f"\nFetching stock data for {tickers} from {start_date.date()} to {end_date.date()} at {interval} interval...")
        try:
            historical_data = fetch_historical_stock_data(tickers, start_date, end_date, interval)
            if historical_data.empty:
                raise ValueError("No historical data retrieved")

            cleaned_data = clean_stock_data(historical_data)
            cleaned_data.to_csv("cleaned_stock_data.csv", index=False)

            print(f"\nFetching financial reports for {tickers}...")
            financial_data = fetch_financial_reports(tickers)
            if not financial_data.empty:
                financial_data.to_csv("financial_reports.csv", index=False)
                print("Financial reports saved to financial_reports.csv")

            if news_api_key:
                print(f"\nFetching news data for {tickers} from {start_date.date()} to {end_date.date()}...")
                news_data = fetch_news_data(tickers, news_api_key, start_date, end_date)
                if not news_data.empty:
                    news_data.to_csv("news_data.csv", index=False)
                    print("News data saved to news_data.csv")
            else:
                print("\nNEWS_API_KEY is not set; skipping news download.")
                news_data = pd.DataFrame()

            print("\nPerforming exploratory data analysis...")
            perform_eda(cleaned_data, financial_data, news_data, ticker=tickers[0], interval=interval)

        except Exception as e:
            print(f"Error processing data: {e}")
            # The main window is already gone, so the dialog needs its own
            # hidden root; destroy it afterwards so no stray Tk instance is
            # left behind.
            error_root = tk.Tk()
            error_root.withdraw()
            try:
                messagebox.showerror("Error", f"Error processing data: {e}")
            finally:
                error_root.destroy()

    def cancel():
        root.destroy()

    def clear_error_on_interaction(*args):
        # Variable-trace callback: any change to an input clears the old error.
        error_label.config(text="", fg="red")

    root = tk.Tk()
    root.title("Stock Data Input")
    root.geometry("400x400")

    tk.Label(root, text="Start Date:").pack(pady=10)
    start_date_var = tk.StringVar(value=datetime.now().strftime('%Y-%m-%d'))
    start_date_entry = tk.Entry(root, textvariable=start_date_var, state='readonly')
    start_date_entry.pack()
    tk.Button(root, text="Select Start Date", command=lambda: open_calendar(True)).pack(pady=5)
    # trace_add('write', ...) is the supported replacement for the deprecated
    # trace('w', ...) form.
    start_date_var.trace_add('write', clear_error_on_interaction)

    tk.Label(root, text="End Date:").pack(pady=10)
    end_date_var = tk.StringVar(value=datetime.now().strftime('%Y-%m-%d'))
    end_date_entry = tk.Entry(root, textvariable=end_date_var, state='readonly')
    end_date_entry.pack()
    tk.Button(root, text="Select End Date", command=lambda: open_calendar(False)).pack(pady=5)
    end_date_var.trace_add('write', clear_error_on_interaction)

    tk.Label(root, text="Interval:").pack(pady=10)
    interval_var = tk.StringVar(value='5m')
    interval_menu = ttk.OptionMenu(root, interval_var, '5m', '1m', '5m', '30m', '1d', '1wk', '1mo')
    interval_menu.pack()
    interval_var.trace_add('write', clear_error_on_interaction)

    error_label = tk.Label(root, text="", fg="red", wraplength=350)
    error_label.pack(pady=10)

    button_frame = tk.Frame(root)
    button_frame.pack(pady=20)
    tk.Button(button_frame, text="Submit", command=submit).pack(side=tk.LEFT, padx=10)
    tk.Button(button_frame, text="Cancel", command=cancel).pack(side=tk.LEFT, padx=10)

    root.mainloop()

In [None]:
# Render matplotlib figures inline in the notebook output. This is an IPython
# magic, so the cell only runs under IPython/Jupyter, not plain Python.
%matplotlib inline

# Launch the input form; submitting it runs the fetch, clean and EDA steps.
create_gui()


Fetching stock data for ['AAPL'] from 2025-04-30 to 2025-05-02 at 30m interval...
