In [None]:
# Cell 1: Notebook Purpose
# This notebook performs initial exploratory data analysis (EDA) on raw and processed datasets.
# Outputs: Visualizations, summary statistics, and preliminary insights for feature engineering and modeling.

In [None]:

# Cell 2: Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

In [None]:
# Cell 3: Data Loading
# Read the raw and the cleaned price CSVs. In both files the first two columns are
# combined into a two-level row index (adjust `index_columns` if the layout differs).
raw_path = Path("../data/raw/raw_price_data.csv")
processed_path = Path("../data/processed/cleaned_price_data.csv")

index_columns = [0, 1]  # positional columns that form the MultiIndex
raw_df = pd.read_csv(raw_path, index_col=index_columns)
processed_df = pd.read_csv(processed_path, index_col=index_columns)

# Quick sanity check on what was loaded: (rows, columns) of each frame.
print("Raw data shape:", raw_df.shape)
print("Processed data shape:", processed_df.shape)


In [None]:
# Cell 4: Descriptive Statistics
# Render the summary-statistics table of the raw data first, then of the processed data,
# using the notebook's rich display so both tables appear under this cell.
display(
    raw_df.describe(),
    processed_df.describe(),
)


In [None]:
# Cell 5: Missing Values
# Print the number of missing entries per column for each dataset.
# sep="" is passed because each label already ends with a newline: with print's default
# single-space separator the first row of the Series would start one column to the right
# of the rows below it and the counts would not line up.
print("Raw missing values:\n", raw_df.isnull().sum(), sep="")
print("Processed missing values:\n", processed_df.isnull().sum(), sep="")


In [None]:
# Cell 6: Visualize Price Distributions
# Histogram (100 bins) with a KDE overlay of every non-null Close value in the
# processed data, drawn on an explicitly created 10x5 figure.
close_values = processed_df['Close'].dropna()

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(close_values, bins=100, kde=True, ax=ax)
ax.set_title("Distribution of Close Prices (Processed Data)")
plt.show()

In [None]:
# Cell 7: Time Series Plot for a Sample Ticker
# Use the first ticker that appears in index level 1 as the example series.
sample_ticker = processed_df.index.get_level_values(1).unique()[0]

# Cross-section for that ticker; the remaining index is level 0.
# .copy() so the index changes below can never touch processed_df.
sample = processed_df.xs(sample_ticker, level=1).copy()

# The CSV is read without date parsing, so level 0 arrives as plain strings and pandas
# would plot the points by position with text tick labels instead of on a real time axis.
# Convert string labels to datetimes and put them in chronological order.
# NOTE(review): assumes level 0 holds dates - confirm against the CSV layout. If the
# labels cannot be parsed they are left untouched and the plot falls back to the
# original positional behaviour.
if pd.api.types.is_string_dtype(sample.index):
    try:
        sample.index = pd.to_datetime(sample.index)
        sample = sample.sort_index()
    except (ValueError, TypeError):
        pass

ax = sample['Close'].plot(figsize=(12, 4), title=f"Close Price Time Series: {sample_ticker}")
ax.set_xlabel("Date")
ax.set_ylabel("Close")
plt.show()


In [None]:
# Cell 8: Correlation Matrix
# Reshape Close into a wide table (one row per level-0 label, one column per ticker) and
# compute the pairwise correlation between the ticker columns.
# The index is addressed by position (level 1 = ticker), matching how the data is loaded
# (index_col=[0, 1]) and how the sample ticker is selected elsewhere in this notebook.
# The previous pivot(index='date', columns='ticker') raised a KeyError whenever the CSV
# header used different names (e.g. 'Date' / 'Ticker'); for identically named levels the
# resulting table is the same.
# NOTE(review): correlations of price levels can be dominated by shared trends; consider
# repeating this on returns before drawing conclusions.
close_wide = processed_df['Close'].unstack(level=1)
corr = close_wide.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(corr, cmap='coolwarm', center=0, ax=ax)
ax.set_title("Cross-Sectional Correlation of Close Prices")
plt.show()


In [None]:
# Cell 9: Outlier Detection (Z-score)
# Flag rows whose Close is more than Z_THRESHOLD standard deviations away from the mean
# Close of the *same ticker* (index level 1).
# A single mean/std pooled over all tickers mixes instruments that trade at different
# price levels, so it mostly flags expensive tickers rather than unusual observations.
# Tickers with a single row or a constant price get a NaN z-score and are never flagged.
Z_THRESHOLD = 5  # single source for both the filter and the printed label

close_by_ticker = processed_df['Close'].groupby(level=1)
z_scores = (
    processed_df['Close'] - close_by_ticker.transform('mean')
) / close_by_ticker.transform('std')

# z_scores shares processed_df's index, so the boolean mask aligns row for row.
outliers = processed_df[z_scores.abs() > Z_THRESHOLD]
# The label states |z| because both tails are counted, not only z > threshold.
print(f"Number of outliers (|z| > {Z_THRESHOLD}):", len(outliers))
