# **Import modules**

In [32]:
import pandas as pd
import numpy as np
import yfinance as yf
from newsapi import NewsApiClient
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime, timedelta

### **Fetch and Clean historical stock data**

In [33]:
def fetch_historical_stock_data(tickers, start_date, end_date):
    """
    Fetch historical stock data for a list of tickers using yfinance.
    Args:
        tickers (list): List of stock ticker symbols (e.g., ['AAPL']).
        start_date (datetime): Start date for data collection.
        end_date (datetime): End date for data collection.
    Returns:
        pd.DataFrame: Combined DataFrame with historical stock data.
    """
    historical_data= []
    
    for ticker in tickers:
        try:
            # validate ticker
            if not isinstance(ticker, str) or not ticker:
                raise ValueError(f"Invalid ticker: {ticker}")
            
            # download the data
            stock = yf.Ticker(ticker) # create a Ticker object
            df = stock.history(start=start_date, end=end_date, interval='1d') # fetch historical data
            
            # check if data is empty
            if df.empty:
                print(f"No data returned for {ticker}")
                continue
            
            df['Ticker'] = ticker # add a column for the ticker symbol
            
            # reset the index to make 'Date' a column
            df = df.reset_index()
            
            
            # add Adj Close (use Close if Adj Close not available)
            df['Adj Close'] = df.get('Adj Close', df['Close'])
            
            historical_data.append(df['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close', 'Ticker'])
        except Exception as e:
            print(f"Error fetching data for {ticker}: {e}")
            
    # Check if any data was collected
    if not historical_data:
        raise ValueError("No data collected for any tickers")
    
    # Combine data
    combined_df = pd.concat(historical_data, ignore_index=True)
    
    # Ensure Date is datetime
    combined_df['Date'] = pd.to_datetime(combined_df['Date'])
    
    return combined_df

In [34]:
def clean_stock_data(data):
    """
    Clean the stock data by removing duplicates and handling missing values and ensure data integrity.
    """
    # check for missing values
    print("Missing values before cleaning: ", data.isnull().sum())
    
    # handle missing values
    price_columns = ['open', 'high', 'low', 'close', 'volume', 'adj close']
    for col in price_columns:
        data[col] = data[col].fillna(method='ffill') # forward fill missing values
        data[col] = data[col].fillna(method='bfill') # backward fill missing values
    
    data['Volume'] = data['volume'].fillna(0) # fill missing volume with 0
    
    # drop any remaining rows with missing critical data
    data = data.dropna()
    
    # remove duplicates
    data = data.drop_duplicates(subset=['date', 'ticker'], keep='last')
    
    # ensure non-negative values for prices and volume
    for col in price_columns:
        data[col] = data[col].clip(lower=0) # set negative values to 0
    data['Volume'] = data['Volume'].clip(lower=0) # set negative volume to 0
    
    # check for outliers (e.g., extreme price changes)
    data['price_change'] = data.groupby('ticker')['close'].pct_change() # calculate price changes
    data = data[data['Price_Change'].abs() < 0.5]  # Remove >50% daily changes (likely errors)
    
    return data

# **Fetch real-time stock prices**

In [35]:
def fetch_realtime_stock_data(tickers, period='1mo'):
    """
    Fetch real-time stock data for a list of tickers using Yahoo Finance.
    """
    realtime_data = []
    
    for ticker in tickers:
        try:
            # download the data
            stock = yf.Ticker(ticker) # create a Ticker object
            df = stock.history(period=period, interval='1d') # fetch historical data
            df['ticker'] = ticker # add a column for the ticker symbol
            
            df = df.reset_index() # reset the index to make 'Date' a column
            
            df = df.rename(columns={
                'Date': 'date', 
                'Open': 'open', 
                'High': 'high', 
                'Low': 'low', 
                'Close': 'close', 
                'Volume': 'volume'
            })
            
            df['adj close'] = df['close'] # add Adj Close (approximated as Close if not available)
            realtime_data.append(df[['date', 'open', 'high', 'low', 'close', 'volume', 'adj close', 'ticker']])
            
        except Exception as e:
            print(f"Error fetching data for {ticker}: {e}")
            
    return pd.concat(realtime_data, ignore_index=True)

# **Fetch news headlines**

In [36]:
def fetch_news_data(tickers, api_key, days_back=7):
    """
    Fetch news articles related to the stock tickers using NewsAPI.
    """
    newsapi = NewsApiClient(api_key=api_key) # create a NewsApiClient object
    news_data = []
    
    end_date = datetime.now()
    start_date = end_date - timedelta(days=days_back) # calculate the start date
    
    for ticker in tickers:
        try:
            # Query news for the ticker
            query = f"{ticker} stock"
            articles = news_api.get_everything(
                q=query,
                from_param=start_date.strftime('%Y-%m-%d'),
                to=end_date.strftime('%Y-%m-%d'),
                language='en',
                sort_by='publishedAt'
            )
            
            for article in articles['articles']:
                news_data.append({
                    'Ticker': ticker,
                    'Date': pd.to_datetime(article['publishedAt']),
                    'Title': article['title'],
                    'Description': article['description'] or '',
                    'Source': article['source']['name'],
                    'URL': article['url']
                })
                
        except Exception as e:
            print(f"Error fetching news for {ticker}: {e}")
    
    return pd.DataFrame(news_data)

# **Exploratory data analysis**

In [37]:
def perform_eda(df, ticker='AAPL'):
    """
    Perform basic EDA: summary statistics and visualization for a specific ticker.
    """
    ticker_df = df[df['Ticker'] == ticker].copy()
    
    if ticker_df.empty:
        print(f"No data available for ticker {ticker}")
        return
    
    # Summary statistics
    print(f"\nSummary Statistics for {ticker}:")
    print(ticker_df.describe())
    
    # Plot closing price trend
    plt.figure(figsize=(12, 6))
    plt.plot(ticker_df['Date'], ticker_df['Close'], label='Close Price')
    plt.title(f'{ticker} Closing Price Trend')
    plt.xlabel('Date')
    plt.ylabel('Price (USD)')
    plt.legend()
    plt.tight_layout()
    plt.savefig(f'{ticker}_closing_price.png')
    plt.close()
    
    # Plot volume trend
    plt.figure(figsize=(12, 6))
    plt.bar(ticker_df['Date'], ticker_df['Volume'], color='gray', alpha=0.5)
    plt.title(f'{ticker} Trading Volume')
    plt.xlabel('Date')
    plt.ylabel('Volume')
    plt.tight_layout()
    plt.savefig(f'{ticker}_volume.png')
    plt.close()
    
    # Check for seasonality (monthly average closing price)
    ticker_df['Month'] = ticker_df['Date'].dt.to_period('M')
    monthly_avg = ticker_df.groupby('Month')['Close'].mean()
    
    plt.figure(figsize=(12, 6))
    monthly_avg.plot(kind='line')
    plt.title(f'{ticker} Monthly Average Closing Price')
    plt.xlabel('Month')
    plt.ylabel('Average Close Price (USD)')
    plt.tight_layout()
    plt.savefig(f'{ticker}_monthly_avg.png')
    plt.close()

In [38]:
if __name__ == "__main__":
    # define tickers and date range
    tickers = ['AAPL']
    end_date = datetime.now()
    start_date = datetime.now() - timedelta(days=365)  # 1 year back
    
    # fetch and clean historical stock data
    print("Fetching historical stock data...")
    try:
        historical_data = fetch_historical_stock_data(tickers, start_date, end_date)
        cleaned_data = clean_stock_data(historical_data)
        # save data with ticker-specific filenames
        historical_data.to_csv(f"{tickers[0]}_historical_stock_data.csv", index=False)
        cleaned_data.to_csv(f"{tickers[0]}_cleaned_stock_data.csv", index=False)
        print(f"Historical and cleaned data saved for {tickers[0]}")

    except Exception as e:
        print(f"Error processing historical data: {e}")
        exit(1)
        

Fetching historical stock data...
Error fetching data for AAPL: ('Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close', 'Ticker')
Error processing historical data: No data collected for any tickers
