# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ------------------------------------------------------------------------------
# 1. LOAD DATASET
# ------------------------------------------------------------------------------
# We use the Stock Market Dataset.
# Target: Close Price | Features: Open, High, Low, Volume

try:
    # Load with header=[0,1] for the multi-level headers in the CSV
    df_raw = pd.read_csv('stock_market_data.csv', header=[0,1])
except FileNotFoundError:
    print("ERROR: 'stock_market_data.csv' not found. Please check the file path.")
    # Stop here: without the file `df_raw` is never bound, and carrying on
    # would only fail a few lines later with a misleading NameError.
    raise
else:
    # Only report success once the read has actually succeeded.
    print("Dataset loaded successfully.")

# Create a clean DataFrame with just the Apple (AAPL) data.
# Maps each output column name to the top-level CSV header it is read from.
ticker = 'AAPL'
field_by_column = {
    'Target_Close': 'Close',
    'Feat_Open': 'Open',
    'Feat_High': 'High',
    'Feat_Low': 'Low',
    'Feat_Vol': 'Volume',
}
data = pd.DataFrame({
    column: df_raw[field][ticker]
    for column, field in field_by_column.items()
})

# ------------------------------------------------------------------------------
# 2. DATA CLEANING & TRAIN-TEST SPLIT CALCULATION
# ------------------------------------------------------------------------------
# Requirement: "80% of dataset keep for training and remaining 20% for testing."

# Step 1: discard every row that contains a NaN, reporting the shape
# before and after so the number of dropped rows is visible.
print(f"Original Shape: {data.shape}")
data = data.dropna(axis=0, how='any')
print(f"Cleaned Shape:  {data.shape}")

# Step 2: work out the row position of the 80% / 20% boundary.
# Nothing is sliced here; only the boundary index is computed and reported.
total_rows = data.shape[0]
split_idx = int(0.80 * total_rows)
test_rows = total_rows - split_idx

print("\n--- Train/Test Split Plan ---")
print(f"Total Samples: {total_rows}")
print(f"Training Set (First 80%): 0 to {split_idx} ({split_idx} samples)")
print(f"Testing Set  (Last 20%):  {split_idx} to {total_rows} ({test_rows} samples)")

# ------------------------------------------------------------------------------
# 3. STATISTICAL METRICS (FROM SCRATCH)
# ------------------------------------------------------------------------------
# We calculate Mean and Std Dev (via the population variance) with plain Python
# arithmetic, and report Min and Max alongside them (no .describe()).

def get_mean(values):
    """Return the arithmetic mean of a sized collection of numbers.

    Args:
        values: A collection that supports both iteration and ``len()``.

    Returns:
        The sum of the elements divided by their count.

    Raises:
        ZeroDivisionError: If ``values`` is empty.
    """
    total = sum(values)
    count = len(values)
    return total / count

def get_std_dev(values):
    """Return the population standard deviation of ``values``.

    The squared deviations from the mean are averaged over the number of
    elements (divisor ``n``, not ``n - 1``) and the square root is taken.

    Args:
        values: A collection that supports both iteration and ``len()``.

    Returns:
        The square root of the mean squared deviation from the mean.
    """
    center = get_mean(values)
    squared_deviations = [(value - center) ** 2 for value in values]
    population_variance = sum(squared_deviations) / len(squared_deviations)
    return population_variance ** 0.5

# Print a fixed-width table with one row of statistics per column of `data`.
print("\n--- Summary Statistics (Whole Dataset) ---")
print(f"{'Feature':<15} | {'Mean':<10} | {'Std Dev':<10} | {'Min':<10} | {'Max':<10}")
print("-" * 65)

for feature_name in data.columns:
    feature_values = data[feature_name].values
    # Same column order as the header: mean, std dev, min, max.
    row_stats = (
        get_mean(feature_values),
        get_std_dev(feature_values),
        min(feature_values),
        max(feature_values),
    )
    stat_cells = " | ".join(f"{stat:<10.2f}" for stat in row_stats)
    print(f"{feature_name:<15} | {stat_cells}")

# ------------------------------------------------------------------------------
# 4. VISUALIZATION
# ------------------------------------------------------------------------------
# A. Feature Distributions (Histograms)
# One side-by-side panel per entry: (column, bar colour, title, x-axis label).
histogram_specs = [
    ('Feat_Open', 'skyblue', 'Distribution: Open Price', 'Price'),
    ('Feat_Vol', 'salmon', 'Distribution: Volume', 'Volume'),
]

plt.figure(figsize=(12, 5))

for panel, (column, fill, heading, x_label) in enumerate(histogram_specs, start=1):
    plt.subplot(1, len(histogram_specs), panel)
    plt.hist(data[column], bins=30, color=fill, edgecolor='black')
    plt.title(heading)
    plt.xlabel(x_label)

plt.tight_layout()
plt.show()

# B. Correlation Heatmap (Math Logic)
def get_correlation(x, y):
    """Return the Pearson correlation coefficient of two equal-length sequences.

    Elements are paired by position through iteration rather than by
    ``x[i]`` / ``y[i]`` subscripting. Subscripting a pandas Series with an
    integer is a label lookup, so it fails or mispairs values once rows
    have been dropped and the index has gaps; iteration always walks the
    values in order.

    Args:
        x: First collection of numbers (must support iteration and ``len()``).
        y: Second collection of numbers, the same length as ``x``.

    Returns:
        The covariance sum divided by the square root of the product of the
        two squared-deviation sums, or ``0`` when that denominator is zero
        (at least one input is constant).

    Raises:
        ValueError: If ``x`` and ``y`` differ in length.
        ZeroDivisionError: If the inputs are empty (raised by ``get_mean``).
    """
    if len(x) != len(y):
        raise ValueError(
            f"x and y must have the same length (got {len(x)} and {len(y)})"
        )

    mu_x, mu_y = get_mean(x), get_mean(y)

    # Compute each deviation once and reuse it for numerator and denominator.
    dev_x = [a - mu_x for a in x]
    dev_y = [b - mu_y for b in y]

    numerator = sum(dx * dy for dx, dy in zip(dev_x, dev_y))
    denominator = (sum(dx ** 2 for dx in dev_x) * sum(dy ** 2 for dy in dev_y)) ** 0.5
    return numerator / denominator if denominator != 0 else 0

# Every column of `data` takes part in the correlation matrix.
cols = data.columns
n_cols = len(cols)

# Pull each column's underlying array once instead of on every pair.
column_arrays = [data[name].values for name in cols]

# Fill the full n_cols x n_cols matrix, entry (r, c) being the correlation
# between column r and column c.
corr_matrix = np.zeros((n_cols, n_cols))
for r, row_values in enumerate(column_arrays):
    for c, col_values in enumerate(column_arrays):
        corr_matrix[r, c] = get_correlation(row_values, col_values)

# Draw the correlation matrix as a colour-coded grid on a fixed [-1, 1] scale.
tick_positions = range(n_cols)

plt.figure(figsize=(6, 5))
plt.imshow(corr_matrix, cmap='coolwarm', vmin=-1, vmax=1)
plt.colorbar(label='Correlation')
plt.xticks(tick_positions, cols, rotation=45, ha='right')
plt.yticks(tick_positions, cols)
plt.title('Feature Correlation Matrix')

# Write each coefficient in the centre of its cell (x is the matrix column,
# y is the matrix row).
for (cell_row, cell_col), coefficient in np.ndenumerate(corr_matrix):
    plt.text(cell_col, cell_row, f"{coefficient:.2f}", ha='center', va='center', color='black')

plt.show()