# Stock Price Prediction Demo

This notebook demonstrates the complete workflow for stock price prediction using machine learning.

In [None]:
# Notebook setup: make the project's local modules importable, then pull in
# every dependency the notebook uses in a single place.
import sys
import os

# The local modules imported below are looked up via '../src' (relative to the
# directory the notebook is run from). Resolve it to an absolute path once, so
# a later change of working directory cannot break imports, and add it only
# when it is missing, so re-running this cell does not accumulate duplicate
# sys.path entries.
SRC_DIR = os.path.abspath('../src')
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_collector import StockDataCollector
from feature_engineering import FeatureEngineering
from ml_models import StockPredictor, compare_models

# Plot appearance shared by every figure in the notebook.
plt.style.use('default')   # start from matplotlib's default style sheet
sns.set_palette("husl")    # seaborn's "husl" colour palette

## 1. Data Collection

Let's start by collecting stock data for a popular stock like Apple (AAPL).

In [None]:
# Fetch two years of daily price history for one ticker through the project's
# data collector.
collector = StockDataCollector()

# `symbol` is reused by later cells (plot titles, saved model name).
symbol = 'AAPL'
data = collector.fetch_stock_data(symbol, period='2y', interval='1d')

# Fail fast with a readable message instead of letting an empty download show
# up later as blank charts or an obscure training error.
# NOTE(review): assumes a failed fetch yields None or an empty frame rather
# than raising on its own - confirm against StockDataCollector.
if data is None or data.empty:
    raise ValueError(f"No price data returned for {symbol!r}")

print(f"Data shape: {data.shape}")
print(f"Date range: {data.index.min()} to {data.index.max()}")
data.head()

## 2. Exploratory Data Analysis

In [None]:
# Four-panel overview of the raw data: closing price, traded volume, the
# distribution of daily returns, and price plotted against volume.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_price, ax_volume), (ax_returns, ax_scatter) = axes

close_prices = data['Close']
volumes = data['Volume']

# Top-left: closing price over time
ax_price.plot(data.index, close_prices, label='Close Price')
ax_price.set(title=f'{symbol} Stock Price', ylabel='Price ($)')
ax_price.legend()

# Top-right: traded volume for each row of the data
ax_volume.bar(data.index, volumes, alpha=0.7, color='orange')
ax_volume.set(title=f'{symbol} Trading Volume', ylabel='Volume')

# Bottom-left: histogram of row-over-row percentage changes in the close
# (the first row has no predecessor, so its NaN is dropped)
returns = close_prices.pct_change().dropna()
ax_returns.hist(returns, bins=50, alpha=0.7, color='green')
ax_returns.set(
    title='Daily Returns Distribution',
    xlabel='Daily Return',
    ylabel='Frequency',
)

# Bottom-right: closing price against volume
ax_scatter.scatter(volumes, close_prices, alpha=0.5)
ax_scatter.set(
    title='Price vs Volume',
    xlabel='Volume',
    ylabel='Close Price ($)',
)

plt.tight_layout()
plt.show()

## 3. Feature Engineering

Now let's create technical indicators and other features for our machine learning model.

In [None]:
# Turn the raw price frame into a model-ready table that includes a 'Target'
# column.
fe = FeatureEngineering()

# target_days=1 with target_type='direction': label is the next day's move
# (up/down)
processed_data = fe.prepare_features(data, target_days=1, target_type='direction')

n_columns = processed_data.shape[1]
print(f"Original data shape: {data.shape}")
print(f"Processed data shape: {processed_data.shape}")
# Column count minus one - presumably the one excluded column is 'Target'.
print(f"Number of features: {n_columns - 1}")

# Class balance of the label.
# NOTE(review): the percentage below assumes 'Target' is coded 1 for an up day
# and 0 otherwise - confirm against FeatureEngineering.
target = processed_data['Target']
print("\nTarget Distribution:")
print(target.value_counts())
print(f"Up days: {target.mean()*100:.1f}%")

## 4. Model Training and Evaluation

Let's train different machine learning models and compare their performance.

In [None]:
# Split the processed table into train and test parts using the predictor's
# own helper (called with time_series_split=True).
predictor = StockPredictor()
X_train, X_test, y_train, y_test = predictor.prepare_data(
    processed_data,
    time_series_split=True,
)

# Report the size of each split and the feature count
for label, count in (
    ("Training set size", X_train.shape[0]),
    ("Test set size", X_test.shape[0]),
    ("Number of features", X_train.shape[1]),
):
    print(f"{label}: {count}")

In [None]:
# Run every candidate model on the same split; `results` maps each model's
# name to a dict that includes its 'metrics' (and, as later cells show, the
# fitted 'model').
results = compare_models(X_train, X_test, y_train, y_test, task_type='classification')

# One row per model, one column per metric
metrics_by_model = {name: outcome['metrics'] for name, outcome in results.items()}
comparison_df = pd.DataFrame(metrics_by_model).transpose()

print("\nModel Comparison:")
print(comparison_df.round(decimals=4))

## 5. Feature Importance Analysis

In [None]:
# Inspect which features the Random Forest model ranks as most important.
# The two counts are named once so the chart title and the printed heading
# cannot drift out of sync with the values actually used.
TOP_N_PLOTTED = 15   # features shown in the bar chart
TOP_N_PRINTED = 10   # features listed in the text summary

rf_model = results['random_forest']['model']
feature_importance = rf_model.get_feature_importance(top_n=TOP_N_PLOTTED)

if feature_importance is not None:
    # Horizontal bar chart on an explicit Axes (one bar per feature)
    importance_fig, importance_ax = plt.subplots(figsize=(10, 8))
    sns.barplot(data=feature_importance, y='feature', x='importance', ax=importance_ax)
    importance_ax.set_title(f'Top {TOP_N_PLOTTED} Most Important Features')
    importance_ax.set_xlabel('Importance')
    importance_fig.tight_layout()
    plt.show()

    print(f"Top {TOP_N_PRINTED} Features:")
    print(feature_importance.head(TOP_N_PRINTED))
else:
    # Previously this case produced no output at all, which made the empty
    # cell look like a rendering problem.
    print("Feature importance is not available for this model.")

## 6. Model Predictions and Analysis

In [None]:
# Score the held-out period with the Random Forest and collect the outcome in
# one table for inspection and plotting.
best_model = results['random_forest']['model']

# The test period is taken to be the last len(X_test) rows of processed_data.
# NOTE(review): predictions are made from these processed_data rows rather
# than from X_test itself; if prepare_data transforms the features, confirm
# that predict/predict_proba apply the same transform.
test_rows = processed_data.iloc[-len(X_test):]
feature_cols = best_model.feature_columns
test_predictions = best_model.predict(test_rows[feature_cols])
test_probabilities = best_model.predict_proba(test_rows[feature_cols])

# One row per test date: true label, predicted label, the second
# predict_proba column (labelled as the probability of an up move) and the
# closing price
test_results = pd.DataFrame({
    'Date': test_rows.index,
    'Actual': y_test.values,
    'Predicted': test_predictions,
    'Probability_Up': test_probabilities[:, 1],
    'Close_Price': test_rows['Close'].values,
})

test_results.head(10)

In [None]:
# Two stacked panels: the test-period price with each day marked as a hit or a
# miss, and the model's predicted probability of an up move.
fig, axes = plt.subplots(2, 1, figsize=(15, 10))
price_ax, prob_ax = axes

# Top panel: price line first, then hit/miss markers drawn over it
price_ax.plot(test_results['Date'], test_results['Close_Price'], label='Close Price', color='blue')

is_correct = test_results['Actual'] == test_results['Predicted']
correct_predictions = test_results[is_correct]
wrong_predictions = test_results[~is_correct]

for subset, colour, label in (
    (correct_predictions, 'green', 'Correct Predictions'),
    (wrong_predictions, 'red', 'Wrong Predictions'),
):
    price_ax.scatter(subset['Date'], subset['Close_Price'],
                     color=colour, alpha=0.6, label=label, s=30)

price_ax.set(title='Stock Price with Prediction Accuracy', ylabel='Price ($)')
price_ax.legend()

# Bottom panel: probability of an up move, with a dashed reference line at 0.5
prob_ax.plot(test_results['Date'], test_results['Probability_Up'], label='Probability of Up Move')
prob_ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.7, label='Decision Threshold')
prob_ax.set(
    title='Model Confidence (Probability of Up Move)',
    ylabel='Probability',
    xlabel='Date',
)
prob_ax.legend()

plt.tight_layout()
plt.show()

# Share of test rows where the predicted label equals the actual label
accuracy = is_correct.mean()
print(f"\nTest Accuracy: {accuracy:.1%}")

## 7. Save the Model

Let's save the Random Forest model (the one used as `best_model` above) for future use.

In [None]:
# Persist the model held in `best_model` under a name derived from the ticker.
model_name = f"{symbol}_direction_predictor"
best_model.save_model(model_name)

print(f"Model saved as: {model_name}")

# Accuracy as reported for the Random Forest by the model comparison
rf_accuracy = results['random_forest']['metrics']['accuracy']
print(f"Model accuracy: {rf_accuracy:.1%}")
print(f"Model can be loaded later using: predictor.load_model('{model_name}')")

## 8. Next Steps

This demo shows a basic stock prediction workflow. To improve the model, consider:

1. **More data**: Use multiple stocks, longer time periods, or higher-frequency data
2. **Additional features**: 
   - Economic indicators (GDP, unemployment, interest rates)
   - Market sentiment data (VIX, news sentiment)
   - Sector- and industry-specific indicators
3. **Advanced models**: 
   - LSTM/GRU neural networks for sequence modeling
   - Transformer models
   - Ensemble methods
4. **Better evaluation**: 
   - Walk-forward analysis
   - Out-of-sample testing
   - Risk-adjusted returns
5. **Trading strategy**: 
   - Transaction costs
   - Position sizing
   - Risk management rules

Remember: Stock prediction is inherently difficult and uncertain. Always validate models thoroughly and consider the limitations of historical data.