# **Import modules**

In [9]:
import pandas as pd
import numpy as np
import yfinance as yf
from newsapi import NewsApiClient
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime, timedelta
import pytz
import tkinter as tk
from tkinter import messagebox, ttk
from tkcalendar import Calendar
import json

In [10]:
def validate_date(date_obj):
    """
    Validate and convert date object to datetime in US/Eastern timezone.

    Naive datetimes are interpreted as US/Eastern wall-clock time; datetimes
    that already carry a timezone are converted to US/Eastern.

    Args:
        date_obj (datetime): Date from Calendar.

    Returns:
        datetime: Datetime object in US/Eastern timezone, or None if invalid.
    """
    eastern = pytz.timezone('US/Eastern')
    try:
        if date_obj.tzinfo is not None:
            return date_obj.astimezone(eastern)
        # pytz zones must be attached with localize(): datetime.replace(tzinfo=...)
        # picks the zone's historical LMT offset (about -4:56) rather than EST/EDT.
        return eastern.localize(date_obj)
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Not a datetime (e.g. None, str, date) or not representable.
        return None

def validate_interval(interval):
    """
    Check an interval string against the set of intervals this module accepts.

    Args:
        interval (str): Candidate interval such as '1m', '5m' or '1d'.

    Returns:
        str: The same interval when it is supported, otherwise None.
    """
    supported = (
        '1m', '2m', '5m', '15m', '30m', '60m', '90m',
        '1d', '1wk', '1mo', '3mo',
    )
    if interval in supported:
        return interval
    return None

# **Fetch and clean historical stock data**

In [11]:
def _flatten_ticker_frame(frame, ticker, interval, required_columns):
    """Turn one ticker's OHLCV frame into the flat output layout, or return None if unusable."""
    missing_columns = [col for col in required_columns if col not in frame.columns]
    if missing_columns:
        print(f"Missing required columns for {ticker}: {missing_columns}")
        return None

    # Rows where every OHLCV field is NaN carry no information (for example a
    # ticker that returned nothing inside a multi-ticker download).
    frame = frame.dropna(how='all', subset=required_columns)
    if frame.empty:
        print(f"No data returned for {ticker}")
        return None

    # Move the timestamp index into a regular column called 'Date'.
    frame = frame.reset_index()
    if 'Date' not in frame.columns:
        index_column = frame.columns[0]
        if index_column not in ('Datetime', 'index', 'DateTime'):
            print(f"Unexpected index column for {ticker}: {index_column}")
            return None
        frame = frame.rename(columns={index_column: 'Date'})

    frame['Ticker'] = ticker
    # Fall back to 'Close' when the download has no 'Adj Close' column.
    frame['Adj Close'] = frame.get('Adj Close', frame['Close'])
    frame = frame[['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close', 'Ticker']]
    print(f"Fetched {len(frame)} data points for {ticker} at {interval} interval")
    print(f"Columns after processing: {frame.columns.tolist()}")
    return frame


def fetch_historical_stock_data(tickers, start_date, end_date, interval='5m'):
    """
    Fetch and structure historical stock data for a list of tickers at specified interval.

    Args:
        tickers (list or str): Stock ticker symbols (e.g., ['AAPL']). A single
            symbol may also be passed as a plain string.
        start_date (datetime or str): Start date for data collection.
        end_date (datetime or str): End date for data collection.
        interval (str): Data interval (e.g., '1m', '5m', '1d'). Default: '5m'.

    Returns:
        pd.DataFrame: Combined DataFrame with historical stock data (flat columns:
        Date, Open, High, Low, Close, Volume, Adj Close, Ticker). An empty
        DataFrame is returned when nothing could be fetched or on any error.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(tickers, str):
        tickers = [tickers]
    tickers = list(tickers) if tickers is not None else []
    if not tickers:
        print("No tickers supplied.")
        return pd.DataFrame()

    required_columns = ['Open', 'High', 'Low', 'Close', 'Volume']

    # datetime inputs are converted to US/Eastern; strings are passed through as given.
    eastern = pytz.timezone('US/Eastern')
    if isinstance(start_date, datetime):
        start_date = start_date.astimezone(eastern)
    if isinstance(end_date, datetime):
        end_date = end_date.astimezone(eastern)

    try:
        df = yf.download(tickers, start=start_date, end=end_date, interval=interval)
        print(f"Raw columns from yf.download: {df.columns.tolist()}")

        if df.empty:
            print("No data returned. Check if the date range includes trading hours or if tickers are valid.")
            return pd.DataFrame()

        historical_data = []
        if len(tickers) == 1:
            # A single-ticker download may still come back with (field, ticker)
            # MultiIndex columns; keep only the field level.
            if isinstance(df.columns, pd.MultiIndex):
                df.columns = [col[0] for col in df.columns]
            frame = _flatten_ticker_frame(df, tickers[0], interval, required_columns)
            if frame is None:
                return pd.DataFrame()
            historical_data.append(frame)
        else:
            # Multi-ticker downloads are expected to carry the ticker on column level 1.
            available = set(df.columns.get_level_values(1))
            for ticker in tickers:
                if ticker not in available:
                    print(f"No data returned for {ticker}")
                    continue
                frame = _flatten_ticker_frame(
                    df.xs(ticker, level=1, axis=1), ticker, interval, required_columns
                )
                if frame is not None:
                    historical_data.append(frame)

        if not historical_data:
            print("No data collected for any tickers. Possible non-trading day or invalid tickers.")
            return pd.DataFrame()

        combined_df = pd.concat(historical_data, ignore_index=True)
        combined_df['Date'] = pd.to_datetime(combined_df['Date'])
        print(f"Total combined data points: {len(combined_df)}")
        print(f"Final columns: {combined_df.columns.tolist()}")
        return combined_df

    except Exception as e:
        # Best-effort boundary: any download/parsing failure yields an empty frame.
        print(f"Error fetching data: {e}")
        return pd.DataFrame()

In [12]:
def clean_stock_data(data):
    """
    Clean stock data by handling missing values and ensuring data quality.

    The input frame is never modified; all work happens on a copy.

    Steps:
        1. Forward-fill missing prices within each ticker (never across tickers).
        2. Treat missing volume as 0.
        3. Drop rows that still contain missing values, then duplicate
           (Ticker, Date) rows, keeping the last occurrence.
        4. Clip negative prices/volumes to 0.
        5. Drop rows whose Close moved 5% or more versus the previous row of
           the same ticker. Rows with no previous row to compare are kept.

    Args:
        data (pd.DataFrame): Frame with columns Date, Open, High, Low, Close,
            Volume, Adj Close, Ticker.

    Returns:
        pd.DataFrame: Cleaned copy of the data. The input is returned unchanged
        when it is empty or lacks a required column.
    """
    if data.empty:
        print("No data to clean.")
        return data

    required_columns = ['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close', 'Ticker']
    if not all(col in data.columns for col in required_columns):
        print("Missing required columns.")
        return data

    price_columns = ['Open', 'High', 'Low', 'Close', 'Adj Close']
    numeric_columns = price_columns + ['Volume']

    # Work on a copy so the caller's raw data stays untouched.
    cleaned = data.copy()

    # Fill per ticker: a plain ffill would leak the last price of one ticker
    # into the first rows of the next ticker in the concatenated frame.
    cleaned[price_columns] = cleaned.groupby('Ticker')[price_columns].ffill()
    cleaned['Volume'] = cleaned['Volume'].fillna(0)

    cleaned = (
        cleaned.dropna()
        .drop_duplicates(subset=['Ticker', 'Date'], keep='last')
        .copy()
    )
    cleaned[numeric_columns] = cleaned[numeric_columns].clip(lower=0)

    # Outlier filter, adjusted for intraday data. The first row of each ticker
    # has no predecessor (pct_change is NaN) and must not be discarded.
    price_change = cleaned.groupby('Ticker')['Close'].pct_change()
    keep = price_change.isna() | (price_change.abs() < 0.05)
    return cleaned[keep]

# **Fetch news headlines**

In [13]:
def fetch_news_data(tickers, api_key, start_date, end_date):
    """
    Collect news articles for each ticker from NewsAPI over a date range.

    One "<ticker> stock" query is issued per ticker. A failure for one ticker is
    printed and does not stop the remaining tickers from being queried.

    Args:
        tickers (list): List of stock ticker symbols (e.g., ['AAPL']).
        api_key (str): NewsAPI key.
        start_date (datetime): Start date for news collection.
        end_date (datetime): End date for news collection.

    Returns:
        pd.DataFrame: One row per article with columns Ticker, Date, Title,
        Description, Source, URL. Empty when no article was collected.
    """
    client = NewsApiClient(api_key=api_key)
    collected = []

    # datetime bounds are converted to UTC before being formatted as dates.
    if isinstance(start_date, datetime):
        start_date = start_date.astimezone(pytz.UTC)
    if isinstance(end_date, datetime):
        end_date = end_date.astimezone(pytz.UTC)

    for ticker in tickers:
        try:
            response = client.get_everything(
                q=f"{ticker} stock",
                from_param=start_date.strftime('%Y-%m-%d'),
                to=end_date.strftime('%Y-%m-%d'),
                language='en',
                sort_by='publishedAt',
            )

            for item in response['articles']:
                row = {
                    'Ticker': ticker,
                    'Date': pd.to_datetime(item['publishedAt']),
                    'Title': item['title'],
                    # A missing description is stored as an empty string.
                    'Description': item['description'] or '',
                    'Source': item['source']['name'],
                    'URL': item['url'],
                }
                collected.append(row)

        except Exception as e:
            print(f"Error fetching news for {ticker}: {e}")

    news_df = pd.DataFrame(collected)
    if news_df.empty:
        print(f"No news articles found for {tickers} in the specified date range.")
    else:
        print(f"Fetched {len(news_df)} news articles for {tickers}")
        print(f"News columns: {news_df.columns.tolist()}")
    return news_df

# **Exploratory data analysis**

In [14]:
def perform_eda(df, ticker='AAPL', interval='5m'):
    """
    Print summary statistics for one ticker's rows.

    The price/volume and aggregated-average plots are currently disabled (kept
    below as commented-out code), so `interval` is not used by the active code.

    Args:
        df (pd.DataFrame): DataFrame with stock data (columns: Date, Open, High, Low, Close, Volume, Adj Close, Ticker).
        ticker (str): Ticker symbol to analyze (default: 'AAPL').
        interval (str): Data interval (e.g., '1m', '5m', '1d'). Default: '5m'.

    Returns:
        None. Output is printed.
    """
    selected = df.loc[df['Ticker'] == ticker].copy()

    # Nothing to summarise for this symbol.
    if selected.empty:
        print(f"No data available for ticker {ticker}")
        return

    # Summary statistics
    summary = selected.describe()
    print(f"\nSummary Statistics for {ticker}:")
    print(summary)

    # NOTE: the plotting code below is disabled and preserved as-is.
    # # Combined plot: Closing price, Opening price, and Volume trends
    # fig, ax1 = plt.subplots(figsize=(12, 6))
    
    # # Primary y-axis: Closing and Opening prices
    # ax1.plot(ticker_df['Date'], ticker_df['Close'], label='Close Price', color='blue')
    # ax1.plot(ticker_df['Date'], ticker_df['Open'], label='Open Price', color='green', linestyle='--')
    # ax1.set_xlabel('Time' if interval in ['1m', '2m', '5m', '15m', '30m', '60m', '90m'] else 'Date')
    # ax1.set_ylabel('Price (USD)', color='blue')
    # ax1.set_title(f'{ticker} Price and Volume Trends ({interval} Intervals)')
    # ax1.legend(loc='upper left')
    # ax1.tick_params(axis='y', labelcolor='blue')
    
    # # Secondary y-axis: Volume
    # bar_width = {
    #     '1m': 0.0002, '2m': 0.0003, '5m': 0.0005, '15m': 0.001, '30m': 0.002, '60m': 0.004, '90m': 0.006,
    #     '1d': 0.02, '1wk': 0.1, '1mo': 0.3, '3mo': 0.5
    # }
    # ax2 = ax1.twinx()
    # ax2.bar(ticker_df['Date'], ticker_df['Volume'], color='gray', alpha=0.3, width=bar_width.get(interval, 0.0005))
    # ax2.set_ylabel('Volume', color='gray')
    # ax2.tick_params(axis='y', labelcolor='gray')
    
    # # Adjust layout and display
    # plt.tight_layout()
    # plt.show()
    
    # # Aggregated average closing price
    # agg_settings = {
    #     '1m': ('5min', '5-Minute'), '2m': ('5min', '5-Minute'),
    #     '5m': ('H', 'Hourly'), '15m': ('H', 'Hourly'), '30m': ('H', 'Hourly'),
    #     '60m': ('H', 'Hourly'), '90m': ('H', 'Hourly'),
    #     '1d': ('D', 'Daily'), '1wk': ('W', 'Weekly'),
    #     '1mo': ('M', 'Monthly'), '3mo': ('M', 'Monthly')
    # }
    # freq, agg_label = agg_settings.get(interval, ('H', 'Hourly'))
    
    # ticker_df['Time_Agg'] = ticker_df['Date'].dt.floor(freq)
    # agg_avg = ticker_df.groupby('Time_Agg')['Close'].mean()
    
    # plt.figure(figsize=(12, 6))
    # agg_avg.plot(kind='line', marker='o')
    # plt.title(f'{ticker} {agg_label} Average Closing Price')
    # plt.xlabel(agg_label)
    # plt.ylabel('Average Close Price (USD)')
    # plt.grid(True)
    # plt.tight_layout()
    # plt.show()

In [15]:
def create_gui():
    """
    Create a tkinter GUI with calendar widgets for date input and interval selection.
    Fetches stock and news data, saves to CSV, and performs EDA.

    The window collects a start date, an end date (inclusive) and an interval.
    On submit the inputs are validated, the window is closed, and the pipeline
    runs: fetch stock data -> clean -> write CSVs -> fetch news -> write CSV ->
    EDA. Pipeline failures are printed and shown in an error dialog.

    The NewsAPI key is read from the NEWSAPI_KEY environment variable, falling
    back to the key previously embedded in this file.

    Returns:
        None. Blocks in the tkinter main loop until the window is closed.
    """
    interval_choices = ['1m', '2m', '5m', '15m', '30m', '60m', '90m', '1d', '1wk', '1mo', '3mo']
    # Maximum number of calendar days that may be requested per interval.
    range_limits = {
        '1m': 7, '2m': 60, '5m': 60, '15m': 60, '30m': 60, '60m': 60, '90m': 60,
        '1d': 730, '1wk': 1825, '1mo': 7300, '3mo': 7300
    }

    def show_error(message):
        error_label.config(text=message, fg="red")

    def clear_error_on_interaction(*args):
        error_label.config(text="", fg="red")

    def open_calendar(is_start_date):
        top = tk.Toplevel(root)
        top.title("Select Date")
        top.geometry("300x300")

        cal = Calendar(top, selectmode='day', date_pattern='yyyy-mm-dd')
        cal.pack(pady=10)

        def select_date():
            selected_date = cal.get_date()
            if is_start_date:
                start_date_var.set(selected_date)
            else:
                end_date_var.set(selected_date)
            clear_error_on_interaction()
            top.destroy()

        tk.Button(top, text="Select", command=select_date).pack(pady=10)

    def run_pipeline(start_date, end_date, interval):
        tickers = ['AAPL']
        # NOTE(security): prefer supplying the key via NEWSAPI_KEY. The embedded
        # fallback is kept only for backward compatibility and should be rotated.
        api_key = os.environ.get('NEWSAPI_KEY', '6b423b01d98e47859e2ecbc296aa9b2b')

        # yfinance treats `end` as exclusive; request one extra day so the
        # selected end date (including start == end) is part of the result.
        fetch_end = end_date + timedelta(days=1)

        print(f"\nFetching stock data for {tickers} from {start_date.date()} to {end_date.date()} at {interval} interval...")
        try:
            historical_data = fetch_historical_stock_data(tickers, start_date, fetch_end, interval)
            if historical_data.empty:
                raise ValueError("No historical data retrieved")

            cleaned_data = clean_stock_data(historical_data)

            historical_data.to_csv("historical_stock_data.csv", index=False)
            cleaned_data.to_csv("cleaned_stock_data.csv", index=False)
            print(f"Historical and cleaned stock data saved for {tickers}")

            print(f"\nFetching news data for {tickers} from {start_date.date()} to {end_date.date()}...")
            news_data = fetch_news_data(tickers, api_key, start_date, end_date)
            if not news_data.empty:
                news_data.to_csv("news_data.csv", index=False)
                print("News data saved to news_data.csv")
            else:
                print("No news data to save.")

            print("\nPerforming exploratory data analysis...")
            for ticker in tickers:
                perform_eda(cleaned_data, ticker=ticker, interval=interval)

        except Exception as e:
            print(f"Error processing data: {e}")
            # The main window is already destroyed; use a temporary hidden root
            # for the dialog and dispose of it afterwards.
            dialog_root = tk.Tk()
            dialog_root.withdraw()
            try:
                messagebox.showerror("Error", f"Error processing data: {e}", parent=dialog_root)
            finally:
                dialog_root.destroy()

    def submit():
        start_date_str = start_date_var.get()
        end_date_str = end_date_var.get()
        interval = interval_var.get()

        clear_error_on_interaction()

        try:
            start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
            end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
        except ValueError:
            show_error("Invalid date format. Use YYYY-MM-DD.")
            return

        start_date = validate_date(start_date)
        end_date = validate_date(end_date)
        if start_date is None or end_date is None:
            show_error("Invalid date selection. Please select valid dates.")
            return
        if not validate_interval(interval):
            show_error("Invalid interval. Choose from dropdown.")
            return
        if end_date < start_date:
            show_error("End date must be on or after start date.")
            return

        # Both endpoints are included, so a same-day selection spans one day.
        max_days = range_limits.get(interval, 60)
        days_span = (end_date - start_date).days + 1
        if days_span > max_days:
            show_error(f"{interval} interval limited to {max_days} days.")
            return

        root.destroy()
        run_pipeline(start_date, end_date, interval)

    def cancel():
        root.destroy()

    root = tk.Tk()
    root.title("Stock Data Input")
    root.geometry("400x400")

    today = datetime.now().strftime('%Y-%m-%d')

    tk.Label(root, text="Start Date:").pack(pady=10)
    start_date_var = tk.StringVar(value=today)
    start_date_entry = tk.Entry(root, textvariable=start_date_var, state='readonly')
    start_date_entry.pack()
    tk.Button(root, text="Select Start Date", command=lambda: open_calendar(True)).pack(pady=5)
    start_date_var.trace_add('write', clear_error_on_interaction)

    tk.Label(root, text="End Date:").pack(pady=10)
    end_date_var = tk.StringVar(value=today)
    end_date_entry = tk.Entry(root, textvariable=end_date_var, state='readonly')
    end_date_entry.pack()
    tk.Button(root, text="Select End Date", command=lambda: open_calendar(False)).pack(pady=5)
    end_date_var.trace_add('write', clear_error_on_interaction)

    tk.Label(root, text="Interval:").pack(pady=10)
    interval_var = tk.StringVar(value='5m')
    # ttk.OptionMenu takes the default value before the list of choices.
    interval_menu = ttk.OptionMenu(root, interval_var, '5m', *interval_choices)
    interval_menu.pack()
    interval_var.trace_add('write', clear_error_on_interaction)

    error_label = tk.Label(root, text="", fg="red", wraplength=350)
    error_label.pack(pady=10)

    button_frame = tk.Frame(root)
    button_frame.pack(pady=20)
    tk.Button(button_frame, text="Submit", command=submit).pack(side=tk.LEFT, padx=10)
    tk.Button(button_frame, text="Cancel", command=cancel).pack(side=tk.LEFT, padx=10)

    root.mainloop()

In [None]:
# Ensure inline plotting in Jupyter (IPython magic: only valid inside an IPython/Jupyter session)
%matplotlib inline

# Launch GUI; this call blocks in the tkinter main loop until the window is submitted or cancelled
create_gui()


Fetching stock data for ['AAPL'] from 2025-04-22 to 2025-05-02 at 30m interval...
