# 02 - Data Preprocessing

Notebook ini berisi preprocessing data untuk LSTM model:

## Steps:
1. Load data yang sudah di-download
2. Normalisasi menggunakan MinMaxScaler
3. Create sequences (60 hari untuk prediksi hari ke-61)
4. Split train/test dengan ratio 80:20

In [None]:
# Import libraries
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.data_loader import DataLoader
from src.preprocessor import StockPreprocessor, MultiFeaturePreprocessor
from src.visualizer import StockVisualizer

# Seed NumPy's global random state so that any NumPy-based randomness
# in this notebook is repeatable from run to run.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Reaching this line means every import in this cell succeeded.
print("Libraries imported successfully!")

## 1. Load Data

In [None]:
# Point the loader at the raw-data directory (relative to this notebook).
loader = DataLoader(data_dir='../data/raw')

# Ticker symbol to preprocess; keep it identical to the exploration notebook.
TICKER = "AAPL"

# Prefer the copy already on disk; download five years of history only
# when the loader raises FileNotFoundError for this ticker.
try:
    data = loader.load_data(TICKER)
except FileNotFoundError:
    print("Data not found. Downloading...")
    data = loader.download_stock_data(TICKER, period="5y")

frame_shape = data.shape
print(f"\nData shape: {frame_shape}")

# Last expression of the cell: rendered by the notebook as a preview table.
data.head()

In [None]:
# Sanity-check the loaded frame: covered period and total number of
# missing cells across all columns.
first_date = data['Date'].min()
last_date = data['Date'].max()
missing_total = data.isnull().sum().sum()
print(f"Date range: {first_date} to {last_date}")
print(f"Missing values: {missing_total}")

# Descriptive statistics for the closing price column.
close_stats = data['Close'].describe()
print("\nClose Price Statistics:")
print(close_stats)

## 2. Preprocessing Parameters

In [None]:
# Parameters shared by every preprocessing step in this notebook.
SEQUENCE_LENGTH = 60      # look-back window: days of history per sample
TRAIN_RATIO = 0.8         # fraction of the samples used for training
TARGET_COLUMN = 'Close'   # column the model is trained to predict

# Express the split as percentages for the printout.
train_pct = TRAIN_RATIO * 100
test_pct = (1 - TRAIN_RATIO) * 100

print(f"Sequence Length: {SEQUENCE_LENGTH} days")
print(f"Train/Test Split: {train_pct:.0f}/{test_pct:.0f}")
print(f"Target Column: {TARGET_COLUMN}")

## 3. Basic Preprocessing (Single Feature - Close Price)

In [None]:
# Single-feature preprocessor: the target column is the only input feature.
preprocessor = StockPreprocessor(sequence_length=SEQUENCE_LENGTH,
                                 feature_columns=[TARGET_COLUMN])

# One call yields the four arrays used by the rest of the notebook.
train_test_arrays = preprocessor.prepare_data(
    data, target_column=TARGET_COLUMN, train_ratio=TRAIN_RATIO
)
X_train, y_train, X_test, y_test = train_test_arrays

In [None]:
# Report the array shapes and how the samples are divided between splits.
print("\n=== Data Shapes ===")
print(f"X_train: {X_train.shape} - (samples, timesteps, features)")
print(f"y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_test: {y_test.shape}")

# Count each split once and reuse the numbers for the percentages.
n_train = len(X_train)
n_test = len(X_test)
n_total = n_train + n_test

print(f"\nTotal samples: {n_total}")
print(f"Training samples: {n_train} ({n_train/n_total*100:.1f}%)")
print(f"Testing samples: {n_test} ({n_test/n_total*100:.1f}%)")

In [None]:
# Visualize one input window and compare the train/test target distributions.
# Index 100 is the preferred example; it is clamped to the last available
# window so a short history (fewer than 101 training samples) does not
# raise an IndexError.
sample_idx = min(100, len(X_train) - 1)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: first feature of the selected window across its time steps.
axes[0].plot(X_train[sample_idx, :, 0])
axes[0].set_title(f'Sample Input Sequence (Normalized) - Index {sample_idx}')
axes[0].set_xlabel('Time Steps')
axes[0].set_ylabel('Normalized Value')
axes[0].grid(True, alpha=0.3)

# Right panel: target distributions. Both histograms use the same 50 bin
# edges, computed over the combined targets, so bar positions and widths
# are directly comparable between the two splits.
shared_bins = np.histogram_bin_edges(
    np.concatenate([np.ravel(y_train), np.ravel(y_test)]), bins=50
)
axes[1].hist(y_train, bins=shared_bins, alpha=0.7, edgecolor='white', label='Train')
axes[1].hist(y_test, bins=shared_bins, alpha=0.7, edgecolor='white', label='Test')
axes[1].set_title('Target Values Distribution (Normalized)')
axes[1].set_xlabel('Normalized Close Price')
axes[1].set_ylabel('Frequency')
axes[1].legend()

plt.tight_layout()
plt.show()

## 4. Verify Normalization (MinMax Scaling)

In [None]:
# Print the value range of every prepared array, including X_test, so the
# scaling can be checked against the expected [0, 1] interval.
# NOTE(review): if the scaler is fitted on the training rows only, test
# values may legitimately fall outside [0, 1] - confirm against the
# preprocessor before treating that as an error.
print("=== Normalization Verification ===")
for label, values in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"\n{label}:")
    print(f"  Min: {values.min():.6f}")
    print(f"  Max: {values.max():.6f}")

In [None]:
# Round-trip check: take the first five normalized test targets, shape them
# as a single column, and map them back through the preprocessor.
sample_predictions = y_test[:5].reshape(-1, 1)
original_scale = preprocessor.inverse_transform_predictions(sample_predictions)

print("\n=== Inverse Transform Test ===")
print("Normalized -> Original Scale:")

# Flatten both arrays so they can be walked pairwise.
normalized_values = sample_predictions.flatten()
restored_values = original_scale.flatten()
for scaled, price in zip(normalized_values, restored_values):
    print(f"  {scaled:.4f} -> ${price:.2f}")

## 5. Multi-Feature Preprocessing (Optional)

Menggunakan multiple features seperti Open, High, Low, Close, Volume dan technical indicators.

In [None]:
# Preprocessor configured with the price and volume columns.
ohlcv_columns = ['Close', 'Open', 'High', 'Low', 'Volume']
multi_preprocessor = MultiFeaturePreprocessor(
    sequence_length=SEQUENCE_LENGTH,
    feature_columns=ohlcv_columns,
)

# Derive the technical-indicator columns from the loaded frame.
data_with_indicators = multi_preprocessor.add_technical_indicators(data)

# List every column now available for feature selection.
available_columns = data_with_indicators.columns.tolist()
print("\nFeatures available:")
print(available_columns)

In [None]:
# Preview the first rows of the frame returned by add_technical_indicators;
# as the cell's last expression it is rendered as the cell output.
data_with_indicators.head()

In [None]:
# Optional: build a multi-feature dataset in addition to the Close-only one.
USE_MULTI_FEATURE = False  # Set to True to use multiple features

if USE_MULTI_FEATURE:
    # Columns fed to the model; the indicator names must exist in
    # data_with_indicators.
    selected_features = ['Close', 'Volume', 'MA_7', 'MA_21', 'RSI']

    # Fail early with a readable message instead of an error raised
    # somewhere inside the preprocessor.
    missing_features = [
        name for name in selected_features
        if name not in data_with_indicators.columns
    ]
    if missing_features:
        raise KeyError(f"Features not found in data: {missing_features}")

    # NOTE(review): window-based indicators usually leave NaN in their
    # warm-up rows. Dropping those rows here keeps NaN out of the scaled
    # sequences; it is a no-op if the preprocessor already removed them -
    # confirm against MultiFeaturePreprocessor.
    multi_feature_data = data_with_indicators.dropna(subset=selected_features)

    multi_preprocessor_full = MultiFeaturePreprocessor(
        sequence_length=SEQUENCE_LENGTH,
        feature_columns=selected_features
    )

    X_train_multi, y_train_multi, X_test_multi, y_test_multi = multi_preprocessor_full.prepare_data(
        multi_feature_data,
        target_column='Close',
        train_ratio=TRAIN_RATIO
    )

    print(f"\nMulti-feature shapes:")
    print(f"X_train: {X_train_multi.shape}")
    print(f"X_test: {X_test_multi.shape}")

## 6. Visualize Train/Test Split

In [None]:
# Locate the train/test boundary as a row position in the original frame.
# NOTE(review): this formula assumes the preprocessor cuts at
# int(len(data) * TRAIN_RATIO) and that the first test target lies one
# look-back window later - confirm against StockPreprocessor.prepare_data.
split_point = int(len(data) * TRAIN_RATIO)

# Shift by the look-back window, then clamp to the last valid row so a
# short history cannot push the boundary past the end of the frame.
train_end_idx = min(split_point + SEQUENCE_LENGTH, len(data) - 1)

# Positional (.iloc) access keeps the slices correct whatever the index is.
dates = data['Date']
closes = data['Close']
split_date = dates.iloc[train_end_idx]

fig, ax = plt.subplots(figsize=(14, 7))

# Rows before the boundary are drawn as training data.
ax.plot(dates.iloc[:train_end_idx], closes.iloc[:train_end_idx],
        color='#1B998B', label='Training Data', linewidth=1.5)

# Rows from the boundary onward are drawn as test data.
ax.plot(dates.iloc[train_end_idx:], closes.iloc[train_end_idx:],
        color='#F46036', label='Test Data', linewidth=1.5)

# Dashed vertical marker at the boundary date.
ax.axvline(x=split_date, color='gray',
           linestyle='--', linewidth=2, label='Train/Test Split')

ax.set_title(f'{TICKER} Train/Test Split Visualization', fontsize=14, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Close Price (USD)')
ax.legend(loc='upper left')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# pd.to_datetime makes the formatting work even when the Date column was
# loaded as plain strings rather than timestamps.
print(f"\nSplit Date: {pd.to_datetime(split_date).strftime('%Y-%m-%d')}")

## 7. Save Preprocessed Data & Scaler

In [None]:
# Persist the fitted scaler and the prepared arrays for the next notebook.
import os

# Create both output directories up front so that neither the scaler file
# nor the array files fail on a checkout where the folders do not exist yet.
os.makedirs('../models', exist_ok=True)
os.makedirs('../data/processed', exist_ok=True)

# Save scaler for later use (inverse-transforming model predictions).
preprocessor.save_scaler(filepath='../models/scaler.pkl')

# Save each array under its own name as a .npy file.
for array_name, array in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    np.save(f'../data/processed/{array_name}.npy', array)

print("Preprocessed data saved to ../data/processed/")
print("Scaler saved to ../models/scaler.pkl")

## 8. Summary

### Preprocessing Complete!

| Parameter | Value |
|-----------|-------|
| Sequence Length | 60 days |
| Train/Test Ratio | 80/20 |
| Normalization | MinMaxScaler (0-1) |
| Target Column | Close Price |

### Files Created:
- `data/processed/X_train.npy` - Training features
- `data/processed/y_train.npy` - Training targets
- `data/processed/X_test.npy` - Test features
- `data/processed/y_test.npy` - Test targets
- `models/scaler.pkl` - Saved scaler for inverse transform

### Next Step:
→ **03_model_training.ipynb** - Build and train LSTM model

In [None]:
# Closing recap: configuration used and the resulting array shapes.
banner = "=" * 50
summary_lines = [
    f"\n{banner}",
    "PREPROCESSING SUMMARY",
    banner,
    f"Ticker: {TICKER}",
    f"Sequence Length: {SEQUENCE_LENGTH}",
    f"Features: {preprocessor.feature_columns}",
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    "\nReady for model training!",
]

# One print call per entry, matching a line-by-line printout.
for line in summary_lines:
    print(line)