# 01 - Data Exploration

Notebook ini berisi eksplorasi data saham yang akan digunakan untuk prediksi harga menggunakan LSTM.

## Steps:
1. Download data menggunakan yfinance
2. Exploratory Data Analysis (EDA)
3. Visualisasi trend harga
4. Analisis statistik dasar

In [None]:
# Import libraries
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.data_loader import DataLoader
from src.visualizer import StockVisualizer

# Lift pandas' limit on the number of displayed columns and its fixed display
# width, so wide frames are not truncated in the cell output.
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*".
# Older releases only ship the unprefixed name, so fall back to it rather than
# failing on those installations.
plot_style = 'seaborn-v0_8-whitegrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-whitegrid'
plt.style.use(plot_style)
sns.set_palette('husl')

print("Libraries imported successfully!")

## 1. Download Stock Data

Pilih salah satu ticker:
- **AAPL** - Apple Inc. (US Stock)
- **BBCA.JK** - Bank Central Asia (Indonesia Stock)
- **GOOGL** - Alphabet Inc.
- **MSFT** - Microsoft Corporation

In [None]:
# Stock symbol and look-back window; edit these two constants to explore
# a different stock or period.
TICKER = "AAPL"
PERIOD = "5y"

# Loader configured with the raw-data directory (relative to this notebook)
loader = DataLoader(data_dir='../data/raw')

# Fetch the price history for the chosen ticker and report its dimensions
data = loader.download_stock_data(period=PERIOD, ticker=TICKER)
print(f"\nData shape: {data.shape}")

In [None]:
# Preview the opening rows of the dataset (rendered as the cell output)
print("First 5 rows:")
data.head(n=5)

In [None]:
# Preview the closing rows of the dataset (rendered as the cell output)
print("Last 5 rows:")
data.tail(n=5)

## 2. Basic Data Information

In [None]:
# Heading plus a 50-character rule, followed by pandas' structural summary
# of the frame (columns, dtypes, non-null counts).
print("Data Info:", "=" * 50, sep="\n")
data.info()

In [None]:
# Count null entries per column once, then reuse the result for the total
missing_per_column = data.isnull().sum()

print("\nMissing Values:")
print("=" * 50)
print(missing_per_column)
print(f"\nTotal missing: {missing_per_column.sum()}")

In [None]:
# Heading plus a 50-character rule; the descriptive statistics table is
# rendered as the cell output.
print("\nStatistical Summary:", "=" * 50, sep="\n")
data.describe()

## 3. Date Range Analysis

In [None]:
# Parse the Date column into datetimes so min/max and subtraction work on dates
data['Date'] = pd.to_datetime(data['Date'])

# Endpoints of the covered period, computed once and reused below
first_day = data['Date'].min()
last_day = data['Date'].max()

print(f"Date Range: {first_day} to {last_day}")
# Row count is reported as trading days (assumes one row per trading day)
print(f"Total Trading Days: {len(data)}")
print(f"Total Calendar Days: {(last_day - first_day).days}")

## 4. Price Visualization

In [None]:
# Shared plotting helper, reused by the cells that follow
viz = StockVisualizer()

# Closing-price history for the selected ticker
close_title = f'{TICKER} Stock Price History'
viz.plot_stock_data(data, columns=['Close'], title=close_title)

In [None]:
# Overlay the four daily price columns on one chart
ohlc_columns = ['Open', 'High', 'Low', 'Close']
viz.plot_stock_data(data, columns=ohlc_columns, title=f'{TICKER} OHLC Prices')

In [None]:
# Price chart with moving averages over the listed window lengths
ma_windows = [7, 21, 50]
ma_title = f'{TICKER} Price with Moving Averages'
viz.plot_with_moving_averages(data, windows=ma_windows, title=ma_title)

In [None]:
# Trading-volume chart for the selected ticker
volume_title = f'{TICKER} Trading Volume'
viz.plot_volume(data, title=volume_title)

## 5. Returns Analysis

In [None]:
# Day-over-day percentage change of the closing price (first row is NaN)
data['Daily_Return'] = data['Close'].pct_change() * 100
daily_returns = data['Daily_Return']

# Two side-by-side panels: time series on the left, histogram on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_series, ax_hist = axes

# Left panel: returns over time with a zero reference line
ax_series.plot(data['Date'], daily_returns, alpha=0.7)
ax_series.axhline(y=0, color='r', linestyle='--')
ax_series.set_title(f'{TICKER} Daily Returns (%)')
ax_series.set_xlabel('Date')
ax_series.set_ylabel('Return (%)')
ax_series.grid(True, alpha=0.3)

# Right panel: distribution of returns (NaN rows removed before binning)
ax_hist.hist(daily_returns.dropna(), bins=50, alpha=0.7, edgecolor='white')
ax_hist.axvline(x=0, color='r', linestyle='--')
ax_hist.set_title(f'{TICKER} Returns Distribution')
ax_hist.set_xlabel('Return (%)')
ax_hist.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# Summary statistics, one line per measure
print("\nReturns Statistics:")
for stat_label, stat_value in (
    ("Mean", daily_returns.mean()),
    ("Std", daily_returns.std()),
    ("Max", daily_returns.max()),
    ("Min", daily_returns.min()),
):
    print(f"{stat_label} Daily Return: {stat_value:.4f}%")

## 6. Correlation Analysis

In [None]:
# Pairwise correlation between the price columns and volume
corr_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
correlation_matrix = data.loc[:, corr_cols].corr()

# Heatmap appearance: annotated square cells, diverging colours centred on 0
heatmap_options = dict(
    annot=True,
    cmap='RdYlGn',
    center=0,
    square=True,
    linewidths=0.5,
    fmt='.3f',
)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title(f'{TICKER} Feature Correlation Matrix')
plt.tight_layout()
plt.show()

## 7. Volatility Analysis

In [None]:
# 20-day rolling standard deviation of the daily percentage returns.
# Relies on the 'Daily_Return' column created in the returns-analysis cell.
data['Volatility_20'] = data['Daily_Return'].rolling(window=20).std()

# Pick the price-axis label from the ticker instead of always claiming USD:
# the notebook also offers BBCA.JK, which is quoted in rupiah.
# NOTE(review): tickers without an exchange suffix are assumed to be US
# listings priced in USD; other suffixes get a currency-neutral label.
if TICKER.upper().endswith('.JK'):
    price_label = 'Price (IDR)'
elif '.' not in TICKER:
    price_label = 'Price (USD)'
else:
    price_label = 'Price'

# Price panel on top (twice the height), volatility panel below
fig, axes = plt.subplots(2, 1, figsize=(14, 10), gridspec_kw={'height_ratios': [2, 1]})

# Price
axes[0].plot(data['Date'], data['Close'], color='blue', linewidth=1.5)
axes[0].set_title(f'{TICKER} Stock Price')
axes[0].set_ylabel(price_label)
axes[0].grid(True, alpha=0.3)

# Volatility (the first rows are NaN until the 20-day window is full)
axes[1].fill_between(data['Date'], data['Volatility_20'], alpha=0.5, color='orange')
axes[1].set_title('20-Day Rolling Volatility')
axes[1].set_xlabel('Date')
axes[1].set_ylabel('Volatility (%)')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Monthly Returns Heatmap

In [None]:
# Calendar keys used to bucket the daily rows by month
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month

# First and last close of every (Year, Month) bucket. Sorting by date first
# guarantees that 'first'/'last' really are the earliest/latest closes.
monthly_returns = (
    data.sort_values('Date')
    .groupby(['Year', 'Month'])['Close']
    .agg(['first', 'last'])
)

# A month's return is measured from the previous month's final close, so the
# move into the first trading day of the month is not dropped and the monthly
# figures compound to the full-period return. The earliest month has no
# predecessor, so it falls back to its own first close.
base_close = monthly_returns['last'].shift(1).fillna(monthly_returns['first'])
monthly_returns['Return'] = (monthly_returns['last'] / base_close - 1) * 100

# Years as rows, months as columns
monthly_pivot = monthly_returns['Return'].unstack(level=1)

# Plot heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(monthly_pivot, annot=True, cmap='RdYlGn', center=0,
            fmt='.1f', linewidths=0.5)
plt.title(f'{TICKER} Monthly Returns (%)')
plt.xlabel('Month')
plt.ylabel('Year')
plt.tight_layout()
plt.show()

## 9. Summary & Next Steps

### Key Insights:
1. Data sudah di-download dan di-save ke folder `data/raw/`
2. Tidak ada missing values (berdasarkan output pengecekan pada bagian 2; periksa ulang jika ticker atau periode diganti)
3. Korelasi tinggi antara Open, High, Low, Close (expected)
4. Volume memiliki korelasi lebih rendah dengan harga

### Next Steps:
1. **02_preprocessing.ipynb** - Normalisasi data dan create sequences untuk LSTM
2. **03_model_training.ipynb** - Build dan train LSTM model

In [None]:
# Report the CSV path for this ticker and the number of rows in memory.
# NOTE(review): nothing is written to disk in this cell; presumably
# loader.download_stock_data already stored the file at this path - confirm.
safe_ticker = TICKER.replace('.', '_')
print(f"\nData saved to: ../data/raw/{safe_ticker}_data.csv")
print(f"Total records: {len(data)}")
print("\nReady for preprocessing!")