# **Import modules**

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf
from newsapi import NewsApiClient
from fredapi import Fred
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime, timedelta
import pytz
import tkinter as tk
from tkinter import messagebox, ttk
from tkcalendar import Calendar
import threading
from tqdm import tqdm

In [2]:
def validate_date(date_obj):
    """
    Attach the US/Eastern timezone to a datetime.

    The wall-clock fields of ``date_obj`` are kept and interpreted as
    US/Eastern time; any timezone already attached is discarded.

    Args:
        date_obj: A ``datetime`` instance.

    Returns:
        A timezone-aware ``datetime`` in US/Eastern, or ``None`` when
        ``date_obj`` is not a ``datetime`` or cannot be localized.
    """
    if not isinstance(date_obj, datetime):
        return None
    try:
        eastern = pytz.timezone('US/Eastern')
        # pytz zones must be attached with localize(); replace(tzinfo=zone)
        # picks the zone's first historical offset (LMT) instead of EST/EDT.
        return eastern.localize(date_obj.replace(tzinfo=None))
    except Exception:
        return None

def validate_interval(interval):
    """
    Check an interval against the yfinance intervals this tool accepts.

    Returns the interval unchanged when it is accepted, otherwise None.
    """
    accepted = ('1m', '5m', '30m', '1d', '1wk', '1mo')
    if interval in accepted:
        return interval
    return None

### **Fetch and Clean historical stock data**

In [3]:
def _normalize_price_frame(frame, ticker, required_columns, index_label=""):
    """
    Reshape one ticker's price table into the common output layout.

    Args:
        frame: DataFrame with single-level columns holding one ticker's
            prices; its index is turned into the 'Date' column.
        ticker: Symbol written to the 'Ticker' column.
        required_columns: Column names that must be present in ``frame``.
        index_label: Text inserted into the "Unexpected index column"
            message (for example " for AAPL").

    Returns:
        ``(normalized_frame, None)`` on success, or ``(None, reason)`` where
        ``reason`` is "Missing columns" or "Invalid column".
    """
    missing_columns = [col for col in required_columns if col not in frame.columns]
    if missing_columns:
        print(f"Missing required columns for {ticker}: {missing_columns}")
        return None, "Missing columns"

    frame = frame.reset_index()
    if 'Date' not in frame.columns:
        index_column = frame.columns[0]
        if index_column not in ('Datetime', 'index', 'DateTime'):
            print(f"Unexpected index column{index_label}: {index_column}")
            return None, "Invalid column"
        frame = frame.rename(columns={index_column: 'Date'})

    frame['Ticker'] = ticker
    # The download uses auto_adjust=True, so fall back to 'Close' when no
    # separate 'Adj Close' column is present.
    frame['Adj Close'] = frame.get('Adj Close', frame['Close'])
    frame = frame[['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close', 'Ticker']]
    return frame, None


def fetch_historical_stock_data(tickers, start_date, end_date, interval='5m', timeout=30):
    """
    Fetch historical stock data for a list of tickers at the given interval.

    Args:
        tickers: List of ticker symbols; a single string is treated as a
            one-element list.
        start_date: Start of the range. ``datetime`` values are converted to
            US/Eastern; other values are passed to ``yf.download`` unchanged.
        end_date: End of the range, handled like ``start_date``. A
            ``datetime`` in the future is replaced by the current time.
        interval: Interval string passed to ``yf.download``.
        timeout: Timeout passed to ``yf.download``.

    Returns:
        A DataFrame with columns Date, Open, High, Low, Close, Volume,
        Adj Close and Ticker (one row per ticker and timestamp), or an empty
        DataFrame when nothing usable was returned or an error occurred.
    """
    required_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

    # A bare string would otherwise be iterated character by character.
    if isinstance(tickers, str):
        tickers = [tickers]
    if not tickers:
        print("No tickers provided.")
        return pd.DataFrame()

    eastern = pytz.timezone('US/Eastern')
    if isinstance(start_date, datetime):
        start_date = start_date.astimezone(eastern)
    if isinstance(end_date, datetime):
        end_date = end_date.astimezone(eastern)

    try:
        # Only datetime values can be compared against "now".
        now_utc = datetime.now(pytz.UTC)
        if isinstance(end_date, datetime) and end_date > now_utc:
            end_date = now_utc
            print("Adjusted end date to current date as future dates are not valid.")

        with tqdm(total=3, desc=f"Fetching stock data for {tickers}") as pbar:
            pbar.set_description("Initializing")
            pbar.update(1)

            df = yf.download(tickers, start=start_date, end=end_date, interval=interval, auto_adjust=True, timeout=timeout)
            pbar.set_description("Retrieving data")
            pbar.update(1)

            if df.empty:
                pbar.set_description("Completed (No data)")
                print("No data returned. Check if the date range includes trading hours or if tickers are valid.")
                return pd.DataFrame()

            historical_data = []
            if len(tickers) == 1:
                ticker = tickers[0]
                # Flatten two-level columns down to their first level.
                if isinstance(df.columns, pd.MultiIndex):
                    df.columns = [col[0] for col in df.columns]

                frame, failure = _normalize_price_frame(df, ticker, required_columns)
                if frame is None:
                    pbar.set_description(f"Completed ({failure})")
                    return pd.DataFrame()

                print(f"Fetched {len(frame)} data points for {ticker} at {interval} interval")
                historical_data.append(frame)
            else:
                available = set(df.columns.get_level_values(1))
                for ticker in tickers:
                    if ticker not in available:
                        print(f"No data returned for {ticker}")
                        continue
                    ticker_df = df.xs(ticker, level=1, axis=1).copy()
                    if ticker_df.empty:
                        print(f"No data returned for {ticker}")
                        continue

                    frame, _ = _normalize_price_frame(
                        ticker_df, ticker, required_columns, index_label=f" for {ticker}"
                    )
                    if frame is None:
                        continue

                    print(f"Fetched {len(frame)} data points for {ticker} at {interval} interval")
                    historical_data.append(frame)

            pbar.set_description("Processing data")
            pbar.update(1)

            # pd.concat raises on an empty list; every ticker may have been skipped.
            if not historical_data:
                pbar.set_description("Completed (No data)")
                return pd.DataFrame()

            combined_df = pd.concat(historical_data, ignore_index=True)
            combined_df['Date'] = pd.to_datetime(combined_df['Date'])
            pbar.set_description("Completed")
            return combined_df

    except Exception as e:
        # Best-effort boundary: report the failure and return an empty result.
        print(f"Error fetching data: {e}")
        return pd.DataFrame()

In [4]:
def clean_stock_data(data, max_change=0.05):
    """
    Clean stock data by handling missing values and removing outliers.

    The input frame is not modified. Rows are assumed to be in chronological
    order within each ticker, because forward-filling and the bar-to-bar
    change are computed in row order per ticker.

    Steps:
        1. Forward-fill price columns within each ticker.
        2. Replace missing volume with 0.
        3. Clip prices and volume at a lower bound of 0.
        4. Drop rows that still contain missing values.
        5. Drop duplicate (Ticker, Date) rows, keeping the last one.
        6. Drop rows whose absolute 'Close' change from the previous row of
           the same ticker is ``max_change`` or more. The first row of each
           ticker has no previous row and is kept.

    Args:
        data: DataFrame with columns Date, Open, High, Low, Close, Volume,
            Adj Close and Ticker.
        max_change: Exclusive upper bound on the absolute fractional change
            in 'Close' between consecutive rows of a ticker.

    Returns:
        The cleaned DataFrame. ``data`` itself is returned unchanged when it
        is empty or lacks a required column.
    """
    if data.empty:
        print("No data to clean.")
        return data

    required_columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close', 'Ticker']
    if not all(col in data.columns for col in required_columns):
        print("Missing required columns.")
        return data

    price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']
    numeric_columns = price_columns + ['Volume']

    # Work on a copy so the caller's frame is left untouched.
    cleaned = data.copy()

    # Fill per ticker so one symbol's last price never leaks into the next.
    cleaned[price_columns] = cleaned.groupby('Ticker')[price_columns].ffill()
    cleaned['Volume'] = cleaned['Volume'].fillna(0)
    cleaned[numeric_columns] = cleaned[numeric_columns].clip(lower=0)

    cleaned = cleaned.dropna()
    cleaned = cleaned.drop_duplicates(subset=['Ticker', 'Date'], keep='last')

    change = cleaned.groupby('Ticker')['Close'].pct_change()
    # NaN marks a ticker's first row; it is not an outlier and must be kept.
    keep = change.isna() | (change.abs() < max_change)
    return cleaned[keep]

# **Fetch news headlines**

In [5]:
def fetch_news_data(tickers, api_key, start_date, end_date):
    """
    Collect news articles for each ticker from NewsAPI.

    Two searches are run per ticker: a company-specific one (tagged
    'Company') and a broader industry/economy/political one (tagged
    'Global/Political'). A failure for one ticker is printed and the
    remaining tickers are still processed.

    Returns a DataFrame with columns Ticker, Date, Title, Description,
    Source, URL and Category; it is empty when no article was found.
    """
    client = NewsApiClient(api_key=api_key)
    collected = []

    # Datetime bounds are converted to UTC before being formatted as dates.
    if isinstance(start_date, datetime):
        start_date = start_date.astimezone(pytz.UTC)
    if isinstance(end_date, datetime):
        end_date = end_date.astimezone(pytz.UTC)

    with tqdm(total=len(tickers) * 2, desc="Fetching news data") as pbar:
        for ticker in tickers:
            try:
                # (progress label, search query, category tag)
                searches = (
                    ("company", f"{ticker} stock", 'Company'),
                    ("global", f"{ticker} industry OR global economy OR political {ticker}", 'Global/Political'),
                )
                for label, query, category in searches:
                    pbar.set_description(f"Fetching {label} news for {ticker}")
                    response = client.get_everything(
                        q=query,
                        from_param=start_date.strftime('%Y-%m-%d'),
                        to=end_date.strftime('%Y-%m-%d'),
                        language='en',
                        sort_by='publishedAt'
                    )
                    for item in response['articles']:
                        collected.append({
                            'Ticker': ticker,
                            'Date': pd.to_datetime(item['publishedAt']),
                            'Title': item['title'],
                            'Description': item['description'] or '',
                            'Source': item['source']['name'],
                            'URL': item['url'],
                            'Category': category
                        })
                    pbar.update(1)

            except Exception as e:
                print(f"Error fetching news for {ticker}: {e}")

    news_df = pd.DataFrame(collected)
    if news_df.empty:
        print(f"No news articles found for {tickers} in the specified date range.")
    else:
        print(f"Fetched {len(news_df)} news articles for {tickers}")
        print(f"News columns: {news_df.columns.tolist()}")
    return news_df

In [6]:
def fetch_income_statement(tickers):
    """
    Download the income statement of every ticker and stack the results.

    Each statement gets a 'Ticker' column. Tickers that fail are reported
    and skipped. Returns an empty DataFrame when nothing was retrieved.
    """
    frames = []

    for symbol in tickers:
        try:
            statement = yf.Ticker(symbol).financials.reset_index()
            statement['Ticker'] = symbol
            frames.append(statement)
            print(f"Fetched income statement for {symbol}")
        except Exception as e:
            print(f"Error fetching income statement for {symbol}: {e}")

    if not frames:
        print("No income statement data retrieved.")
        return pd.DataFrame()

    combined_income = pd.concat(frames, ignore_index=True)
    print(f"Income statement columns: {combined_income.columns.tolist()}")
    return combined_income

In [7]:
def fetch_balance_sheet(tickers):
    """
    Download the balance sheet of every ticker and stack the results.

    Each sheet gets a 'Ticker' column. Tickers that fail are reported and
    skipped. Returns an empty DataFrame when nothing was retrieved.
    """
    frames = []

    for symbol in tickers:
        try:
            sheet = yf.Ticker(symbol).balance_sheet.reset_index()
            sheet['Ticker'] = symbol
            frames.append(sheet)
            print(f"Fetched balance sheet for {symbol}")
        except Exception as e:
            print(f"Error fetching balance sheet for {symbol}: {e}")

    if not frames:
        print("No balance sheet data retrieved.")
        return pd.DataFrame()

    combined_balance = pd.concat(frames, ignore_index=True)
    print(f"Balance sheet columns: {combined_balance.columns.tolist()}")
    return combined_balance

In [8]:
def fetch_cash_flow(tickers):
    """
    Download the cash flow statement of every ticker and stack the results.

    Each statement gets a 'Ticker' column. Tickers that fail are reported
    and skipped. Returns an empty DataFrame when nothing was retrieved.
    """
    frames = []

    for symbol in tickers:
        try:
            statement = yf.Ticker(symbol).cashflow.reset_index()
            statement['Ticker'] = symbol
            frames.append(statement)
            print(f"Fetched cash flow statement for {symbol}")
        except Exception as e:
            print(f"Error fetching cash flow statement for {symbol}: {e}")

    if not frames:
        print("No cash flow data retrieved.")
        return pd.DataFrame()

    combined_cash_flow = pd.concat(frames, ignore_index=True)
    print(f"Cash flow columns: {combined_cash_flow.columns.tolist()}")
    return combined_cash_flow

In [9]:
def fetch_financial_reports(tickers):
    """
    Fetch income statement, balance sheet and cash flow for each ticker.

    Every statement is tagged with 'Ticker' and 'Statement' columns and
    written to its own CSV file; all statements together are written to
    financial_reports.csv. A failure for one ticker is printed and ends the
    remaining statements of that ticker only.

    Returns the combined DataFrame, or an empty DataFrame when nothing was
    retrieved.
    """
    # (progress label, yf.Ticker method, 'Statement' value, message label, file stem)
    plan = (
        ("income", "get_income_stmt", 'Income Statement', "income statement", "income_statement"),
        ("balance", "get_balance_sheet", 'Balance Sheet', "balance sheet", "balance_sheet"),
        ("cash flow", "get_cashflow", 'Cash Flow', "cash flow", "cash_flow"),
    )
    collected = []

    with tqdm(total=len(tickers) * 3, desc="Fetching financial reports") as pbar:
        for ticker in tickers:
            try:
                stock = None
                for progress_label, getter, statement_name, message_label, stem in plan:
                    pbar.set_description(f"Fetching {progress_label} for {ticker}")
                    if stock is None:
                        stock = yf.Ticker(ticker)
                    report = getattr(stock, getter)().reset_index()
                    report['Ticker'] = ticker
                    report['Statement'] = statement_name
                    collected.append(report)
                    report.to_csv(f"{stem}_{ticker}.csv", index=False)
                    print(f"Saved {message_label} for {ticker} to {stem}_{ticker}.csv")
                    pbar.update(1)

            except Exception as e:
                print(f"Error fetching financial reports for {ticker}: {e}")

    if not collected:
        print("No financial data retrieved.")
        return pd.DataFrame()

    combined_financials = pd.concat(collected, ignore_index=True)
    combined_financials.to_csv("financial_reports.csv", index=False)
    print("Saved combined financial reports to financial_reports.csv")
    print(f"Financial reports columns: {combined_financials.columns.tolist()}")
    return combined_financials

# **Exploratory data analysis**

In [10]:
def perform_eda(df, financial_df, news_df, ticker, interval='5m'):
    """
    Print an exploratory summary for one ticker.

    Prints descriptive statistics of the ticker's stock rows, the head of
    each financial statement type, and news article counts per category.
    Only a notice is printed when ``df`` has no rows for the ticker.
    ``interval`` is accepted but not used.
    """
    stock_rows = df[df['Ticker'] == ticker].copy()

    if stock_rows.empty:
        print(f"No stock data available for ticker {ticker}")
        return

    print(f"\nSummary Statistics for {ticker} Stock Data:")
    print(stock_rows.describe())

    if not financial_df.empty:
        print(f"\nFinancial Statements Summary for {ticker}:")
        for statement in financial_df['Statement'].unique():
            is_ticker = financial_df['Ticker'] == ticker
            is_statement = financial_df['Statement'] == statement
            print(f"\n{statement} Head:")
            print(financial_df[is_ticker & is_statement].head())

    if not news_df.empty:
        ticker_news = news_df[news_df['Ticker'] == ticker]
        print(f"\nNews Article Counts by Category for {ticker}:")
        print(ticker_news['Category'].value_counts())

In [11]:
def create_gui():
    """
    Show a tkinter form and run the data pipeline for the submitted inputs.

    The form collects a ticker, a start date, an end date and an interval.
    Submit validates the inputs and closes the window; the pipeline (stock
    data, cleaning, financial reports, news, EDA) then runs on the calling
    thread after the Tk event loop has ended. Cancel closes the window
    without running anything.

    Files written: cleaned_stock_data.csv, financial_reports.csv and
    news_data.csv (the latter two only when data was retrieved). Errors
    during the pipeline are printed and shown in an error dialog.
    """
    # Filled by submit() once the inputs validate; read after mainloop() returns.
    request = {}

    def open_calendar(is_start_date):
        # Pop-up date picker that writes the chosen day into the start or end field.
        top = tk.Toplevel(root)
        top.title("Select Date")
        top.configure(bg='#f0f4f8')
        top.geometry("300x300")

        cal = Calendar(top, selectmode='day', date_pattern='yyyy-mm-dd', background='#ffffff', foreground='#333333', bordercolor='#cccccc', headersbackground='#e0e8f0', normalbackground='#f0f4f8')
        cal.pack(pady=10, padx=10)

        def select_date():
            selected_date = cal.get_date()
            if is_start_date:
                start_date_var.set(selected_date)
            else:
                end_date_var.set(selected_date)
            error_label.config(text="", fg="#d32f2f")
            top.destroy()

        tk.Button(top, text="Select", command=select_date, bg='#4CAF50', fg='white', font=('Helvetica', 10, 'bold'), padx=10, pady=5).pack(pady=10)

    def submit():
        # Validate the form; on success store the request and close the window.
        start_date_str = start_date_var.get()
        end_date_str = end_date_var.get()
        interval = interval_var.get()
        ticker = ticker_var.get().strip().upper()

        error_label.config(text="", fg="#d32f2f")

        if not ticker:
            error_label.config(text="Please enter a valid stock ticker (e.g., AAPL).", fg="#d32f2f")
            return

        try:
            start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
            end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
            start_date = validate_date(start_date)
            end_date = validate_date(end_date)
        except ValueError:
            error_label.config(text="Invalid date format. Use YYYY-MM-DD.", fg="#d32f2f")
            return

        if start_date is None or end_date is None:
            error_label.config(text="Invalid date selection. Please select valid dates.", fg="#d32f2f")
            return
        if not validate_interval(interval):
            error_label.config(text="Invalid interval. Choose from dropdown.", fg="#d32f2f")
            return
        if end_date < start_date:
            error_label.config(text="End date must be on or after start date.", fg="#d32f2f")
            return

        # Maximum selectable range, in days, for each interval.
        range_limits = {
            '1m': 7, '5m': 60, '30m': 60, '1d': 730, '1wk': 1825, '1mo': 7300}

        max_days = range_limits.get(interval, 60)
        days_diff = (end_date - start_date).days
        if days_diff > max_days:
            error_label.config(text=f"{interval} interval limited to {max_days} days.", fg="#d32f2f")
            return

        request.update(ticker=ticker, start_date=start_date, end_date=end_date, interval=interval)
        # Destroying the only window ends mainloop(); the pipeline runs afterwards.
        root.destroy()

    def cancel():
        root.destroy()

    def clear_error_on_interaction(*args):
        error_label.config(text="", fg="#d32f2f")

    def run_pipeline(ticker, start_date, end_date, interval):
        # Fetch, clean, save and summarise everything for one validated request.
        tickers = [ticker]
        # NOTE(review): a credential committed to source should be rotated and
        # removed; NEWS_API_KEY from the environment takes precedence here.
        news_api_key = os.environ.get('NEWS_API_KEY') or '6b423b01d98e47859e2ecbc296aa9b2b'

        print(f"\nFetching stock data for {tickers} from {start_date.date()} to {end_date.date()} at {interval} interval...")
        try:
            # NOTE(review): yfinance documents `end` as exclusive, so a request with
            # start == end may return no rows - confirm whether END DATE should be inclusive.
            historical_data = fetch_historical_stock_data(tickers, start_date, end_date, interval)
            if historical_data.empty:
                raise ValueError("No historical data retrieved")

            cleaned_data = clean_stock_data(historical_data)
            cleaned_data.to_csv("cleaned_stock_data.csv", index=False)

            print(f"\nFetching financial reports for {tickers}...")
            financial_data = fetch_financial_reports(tickers)
            if not financial_data.empty:
                financial_data.to_csv("financial_reports.csv", index=False)
                print("Financial reports saved to financial_reports.csv")

            print(f"\nFetching news data for {tickers} from {start_date.date()} to {end_date.date()}...")
            news_data = fetch_news_data(tickers, news_api_key, start_date, end_date)
            if not news_data.empty:
                news_data.to_csv("news_data.csv", index=False)
                print("News data saved to news_data.csv")

            print("\nPerforming exploratory data analysis...")
            perform_eda(cleaned_data, financial_data, news_data, ticker=ticker, interval=interval)

        except Exception as e:
            print(f"Error processing data: {e}")
            # The main window is gone, so a hidden root hosts the dialog.
            temp_root = tk.Tk()
            temp_root.withdraw()
            try:
                messagebox.showerror("Error", f"Error processing data: {e}", parent=temp_root)
            finally:
                temp_root.destroy()

    # Set up the main window
    root = tk.Tk()
    root.title("Stock Data Analyzer")
    root.geometry("400x600")  # Increased height to accommodate buttons
    root.configure(bg='#f0f4f8')

    # Title bar
    title_frame = tk.Frame(root, bg='#4a90e2', height=50)
    title_frame.pack(fill='x')
    tk.Label(title_frame, text="Stock Data Analyzer", font=('Helvetica', 16, 'bold'), fg='white', bg='#4a90e2', padx=20, pady=10).pack()

    # Main content frame
    content_frame = tk.Frame(root, bg='#f0f4f8', padx=20, pady=20)
    content_frame.pack(expand=True, fill='both')

    # Stock Ticker
    tk.Label(content_frame, text="STOCK TICKER (e.g., AAPL):", font=('Helvetica', 10), bg='#f0f4f8', fg='#333333').pack(pady=(10, 5))
    ticker_var = tk.StringVar(value='AAPL')
    ticker_entry = tk.Entry(content_frame, textvariable=ticker_var, font=('Helvetica', 10), width=30, borderwidth=2, relief='groove')
    ticker_entry.pack(pady=5)
    # trace_add replaces the deprecated Variable.trace('w', ...).
    ticker_var.trace_add('write', clear_error_on_interaction)

    # Start Date
    tk.Label(content_frame, text="START DATE", font=('Helvetica', 10), bg='#f0f4f8', fg='#333333').pack(pady=(10, 5))
    start_date_var = tk.StringVar(value=datetime.now().strftime('%Y-%m-%d'))
    start_date_entry = tk.Entry(content_frame, textvariable=start_date_var, font=('Helvetica', 10), width=30, state='readonly', borderwidth=2, relief='groove')
    start_date_entry.pack(pady=5)
    tk.Button(content_frame, text="Select Start Date", command=lambda: open_calendar(True), font=('Helvetica', 10), bg='#4CAF50', fg='white', padx=10, pady=5).pack(pady=5)
    start_date_var.trace_add('write', clear_error_on_interaction)

    # End Date
    tk.Label(content_frame, text="END DATE", font=('Helvetica', 10), bg='#f0f4f8', fg='#333333').pack(pady=(10, 5))
    end_date_var = tk.StringVar(value=datetime.now().strftime('%Y-%m-%d'))
    end_date_entry = tk.Entry(content_frame, textvariable=end_date_var, font=('Helvetica', 10), width=30, state='readonly', borderwidth=2, relief='groove')
    end_date_entry.pack(pady=5)
    tk.Button(content_frame, text="Select End Date", command=lambda: open_calendar(False), font=('Helvetica', 10), bg='#4CAF50', fg='white', padx=10, pady=5).pack(pady=5)
    end_date_var.trace_add('write', clear_error_on_interaction)

    # Interval
    tk.Label(content_frame, text="INTERVAL", font=('Helvetica', 10), bg='#f0f4f8', fg='#333333').pack(pady=(10, 5))
    interval_var = tk.StringVar(value='5m')
    interval_menu = ttk.OptionMenu(content_frame, interval_var, '5m', '1m', '5m', '30m', '1d', '1wk', '1mo', style='Custom.TMenubutton')
    interval_menu.pack(pady=5)
    interval_var.trace_add('write', clear_error_on_interaction)

    # Error Label
    error_label = tk.Label(content_frame, text="", font=('Helvetica', 10), fg="#d32f2f", bg='#f0f4f8', wraplength=400)
    error_label.pack(pady=10)

    # Button Frame
    button_frame = tk.Frame(content_frame, bg='#f0f4f8')
    button_frame.pack(pady=15, fill='x')  # Ensure it fills the width
    tk.Button(button_frame, text="Submit", command=submit, font=('Helvetica', 10, 'bold'), bg='#2196F3', fg='white', padx=10, pady=5).pack(side=tk.LEFT, padx=40)
    tk.Button(button_frame, text="Cancel", command=cancel, font=('Helvetica', 10, 'bold'), bg='#f44336', fg='white', padx=10, pady=5).pack(side=tk.RIGHT, padx=40)

    # Configure ttk style
    style = ttk.Style()
    style.theme_use('clam')
    style.configure('Custom.TMenubutton', background='#e0e8f0', foreground='#333333', font=('Helvetica', 10), width=27)

    root.mainloop()

    # Run on the thread that owns Tk: the error dialog creates Tk objects,
    # which should not be done from a worker thread.
    if request:
        run_pipeline(**request)

In [None]:
# Launch the GUI only when executed directly, not when this code is imported.
if __name__ == "__main__":
    create_gui()

# **Data Preprocessing**

# NOTE(review): ticker_var is assigned only inside create_gui(); nothing in
# this file defines it at module level, so this bare reference looks like a
# leftover scratch cell - confirm and remove.
ticker_var