In [61]:
import sys
sys.path.append("..")

import pandas as pd
from tabulate import tabulate

In [62]:
print("Loading stock data from CSV files")
tickers = ["AAPL", "AMZN", "GOOG", "META", "MSFT", "NVDA"]

stock_data = {}

for t in tickers:
    df = pd.read_csv(f"../data/finance_data/{t}.csv")
    stock_data[t] = df


Loading stock data from CSV files


In [63]:
print("Checking for missing values in each dataset:")
for ticker, df in stock_data.items():
    print(f"\nSummary statistics for {ticker}:")
    missing_summary = df.isna().sum().reset_index()
    missing_summary.columns = ['Column', 'Missing Values']
    print(tabulate(missing_summary, headers="keys", tablefmt="psql", showindex=False))

Checking for missing values in each dataset:

Summary statistics for AAPL:
+----------+------------------+
| Column   |   Missing Values |
|----------+------------------|
| Date     |                0 |
| Close    |                0 |
| High     |                0 |
| Low      |                0 |
| Open     |                0 |
| Volume   |                0 |
+----------+------------------+

Summary statistics for AMZN:
+----------+------------------+
| Column   |   Missing Values |
|----------+------------------|
| Date     |                0 |
| Close    |                0 |
| High     |                0 |
| Low      |                0 |
| Open     |                0 |
| Volume   |                0 |
+----------+------------------+

Summary statistics for GOOG:
+----------+------------------+
| Column   |   Missing Values |
|----------+------------------|
| Date     |                0 |
| Close    |                0 |
| High     |                0 |
| Low      |                0 |
|

In [None]:
print("Cleaning and aligning datasets:")

summary = []

date_sets = [set(pd.to_datetime(df["Date"])) for df in stock_data.values()]
common_dates = set.intersection(*date_sets)

for ticker, df in stock_data.items():
    before_rows = df.shape[0]

    # Convert Date column to datetime
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

    # Drop duplicates
    df.drop_duplicates(subset="Date", inplace=True)

    # Filter to common dates
    df = df[df["Date"].isin(common_dates)].copy()

    # Sort by date
    df.sort_values("Date", inplace=True)

    # Adjust index
    df.reset_index(drop=True, inplace=True)

    # Get after rows
    after_rows = df.shape[0]

    # Save back to dictionary
    stock_data[ticker] = df

    summary.append(
        {
            "Ticker": ticker,
            "Rows Before": before_rows,
            "Rows After": after_rows,
            "Duplicates Removed": before_rows - after_rows,
            "Date dtype": df["Date"].dtype,
        }
    )

print(tabulate(summary, headers="keys", tablefmt="psql"))

Cleaning and aligning datasets:
+----------+---------------+--------------+----------------------+----------------+
| Ticker   |   Rows Before |   Rows After |   Duplicates Removed | Date dtype     |
|----------+---------------+--------------+----------------------+----------------|
| AAPL     |          3774 |         3774 |                    0 | datetime64[ns] |
| AMZN     |          3774 |         3774 |                    0 | datetime64[ns] |
| GOOG     |          3774 |         3774 |                    0 | datetime64[ns] |
| META     |          2923 |         2923 |                    0 | datetime64[ns] |
| MSFT     |          3774 |         3774 |                    0 | datetime64[ns] |
| NVDA     |          3774 |         3774 |                    0 | datetime64[ns] |
+----------+---------------+--------------+----------------------+----------------+
