# Stock Price Prediction System - Week 1 Prototype

## Overview
This notebook implements a comprehensive stock price prediction system that combines:
- **News Sentiment Analysis**: Using FinBERT to embed financial news articles
- **Technical Indicators**: Historical stock data and financial markers  
- **Machine Learning**: Neural network for price prediction
- **Cloud Storage**: GCS for scalable data storage
- **Cost Optimization**: Free tier utilization and efficient processing

## Architecture
```
NewsAPI + yfinance → Feature Engineering → FinBERT Embeddings → PyTorch MLP → Price Prediction
                        ↓
                   Cloud Storage (GCS)
```

## Target
Predict stock price 7 days in the future using:
- News sentiment from preceding 7 days
- Financial indicators and technical analysis
- Cost-optimized cloud infrastructure

## 1. Environment Setup and Dependencies

First, let's set up our environment with UV package manager and install all required dependencies.

In [None]:
# Check if we're in Colab
import sys
import os
from pathlib import Path

# If in Colab, install packages
if 'google.colab' in sys.modules:
    print("Running in Google Colab - installing packages...")
    !pip install yfinance newsapi-python transformers torch dask[complete] google-cloud-storage pandas pyarrow optuna scikit-learn matplotlib seaborn plotly
else:
    print("Running locally - assuming packages are installed via UV")

# Set up paths
if 'google.colab' in sys.modules:
    # In Colab, we'll work in a temporary directory
    project_root = Path('/content/stock_prediction')
    project_root.mkdir(exist_ok=True)
    os.chdir(project_root)
else:
    # Local development - navigate to project root
    project_root = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
    os.chdir(project_root)
    
print(f"Working directory: {os.getcwd()}")
print(f"Project root: {project_root}")

In [None]:
# Import required libraries
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from datetime import datetime, timedelta
import time
import json
import logging
from typing import List, Dict, Any, Optional, Tuple

# Financial data libraries
import yfinance as yf
from newsapi import NewsApiClient

# ML libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

# Transformers for FinBERT
from transformers import AutoTokenizer, AutoModel
import transformers
transformers.logging.set_verbosity_error()

# Cloud storage
try:
    from google.cloud import storage
    GCS_AVAILABLE = True
except ImportError:
    print("Google Cloud Storage not available - will save locally only")
    GCS_AVAILABLE = False

# Set up plotting
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

print("All libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

## 2. Data Collection Configuration

Set up configuration for our data collection including API keys, stock symbols, and date ranges.

In [None]:
# Configuration
CONFIG = {
    # API Keys - UPDATE THESE WITH YOUR ACTUAL KEYS
    'NEWS_API_KEY': 'XXX',  # Your NewsAPI key
    'GCP_PROJECT_ID': 'your-gcp-project-id',  # Update with your GCP project ID
    'GCP_BUCKET_NAME': 'stock-prediction-data',  # Update with your bucket name
    
    # Stock symbols for initial testing (5 major tech stocks)
    'STOCK_SYMBOLS': ['AAPL', 'GOOGL', 'MSFT', 'TSLA', 'NVDA'],
    
    # Date range for data collection
    'START_DATE': '2024-09-01',
    'END_DATE': '2025-08-31',
    
    # For prototype, use smaller date range
    'PROTOTYPE_START_DATE': (datetime.now() - timedelta(days=30)).strftime('%Y-%m-%d'),
    'PROTOTYPE_END_DATE': datetime.now().strftime('%Y-%m-%d'),
    
    # Prediction parameters
    'PREDICTION_HORIZON_DAYS': 7,  # Predict price 7 days in future
    'NEWS_LOOKBACK_DAYS': 7,       # Use news from past 7 days
    
    # API rate limits
    'NEWS_API_REQUESTS_PER_MINUTE': 50,
    'NEWS_API_REQUESTS_PER_DAY': 1000,
    
    # Model parameters
    'FINBERT_MODEL': 'ProsusAI/finbert',
    'EMBEDDING_DIM': 768,
    'FINANCIAL_FEATURES': 5,
    'TOTAL_FEATURES': 773,  # 768 + 5
    
    # Training parameters
    'BATCH_SIZE': 32,
    'EPOCHS': 20,
    'LEARNING_RATE': 0.001,
    'TRAIN_TEST_SPLIT': 0.8,
}

# Stock-specific keywords for news filtering
STOCK_KEYWORDS = {
    'AAPL': ['apple', 'iphone', 'ipad', 'mac', 'tim cook', 'app store'],
    'GOOGL': ['google', 'alphabet', 'search', 'youtube', 'android', 'sundar pichai'],
    'MSFT': ['microsoft', 'windows', 'office', 'azure', 'satya nadella'],
    'TSLA': ['tesla', 'elon musk', 'electric vehicle', 'ev', 'model s', 'model 3'],
    'NVDA': ['nvidia', 'gpu', 'artificial intelligence', 'ai', 'gaming', 'data center']
}

print("Configuration loaded successfully!")
print(f"Target stocks: {CONFIG['STOCK_SYMBOLS']}")
print(f"Prototype date range: {CONFIG['PROTOTYPE_START_DATE']} to {CONFIG['PROTOTYPE_END_DATE']}")
print(f"Full date range: {CONFIG['START_DATE']} to {CONFIG['END_DATE']}")

## 3. News Data Scraping and Processing

Implement NewsAPI integration to collect timestamped news articles for our target stocks.

In [None]:
class NewsCollector:
    """Collect news data from NewsAPI with rate limiting and filtering."""
    
    def __init__(self, api_key: str):
        self.client = NewsApiClient(api_key=api_key)
        self.rate_limit_delay = 60 / CONFIG['NEWS_API_REQUESTS_PER_MINUTE']
        
    def collect_news_for_stock(self, symbol: str, start_date: str, end_date: str) -> List[Dict]:
        """Collect news articles for a specific stock symbol."""
        articles = []
        keywords = STOCK_KEYWORDS.get(symbol, [])
        
        # Construct search query
        query_terms = [symbol] + keywords[:3]  # Limit to avoid query length issues
        query = f"({' OR '.join(query_terms)})"
        
        try:
            print(f"Collecting news for {symbol}...")
            
            response = self.client.get_everything(
                q=query,
                from_param=start_date,
                to=end_date,
                language='en',
                sort_by='publishedAt',
                page_size=100
            )
            
            if response['status'] == 'ok':
                articles = response['articles']
                print(f"Found {len(articles)} articles for {symbol}")
            else:
                print(f"API error for {symbol}: {response}")
                
            # Rate limiting
            time.sleep(self.rate_limit_delay)
            
        except Exception as e:
            print(f"Error collecting news for {symbol}: {e}")
            
        return articles
    
    def filter_relevant_articles(self, articles: List[Dict], symbol: str) -> List[Dict]:
        """Filter articles for relevance to the stock."""
        filtered_articles = []
        keywords = STOCK_KEYWORDS.get(symbol, [])
        
        for article in articles:
            title = article.get('title', '').lower()
            description = article.get('description', '').lower() if article.get('description') else ''
            
            text_content = f"{title} {description}"
            
            # Check relevance
            relevance_score = 0
            if symbol.lower() in text_content:
                relevance_score += 3
                
            for keyword in keywords:
                if keyword.lower() in text_content:
                    relevance_score += 1
            
            # Keep articles with minimum relevance
            if relevance_score >= 2:
                article['relevance_score'] = relevance_score
                article['symbol'] = symbol
                article['processed_text'] = f"{title}. {description}"
                filtered_articles.append(article)
                
        print(f"Filtered to {len(filtered_articles)} relevant articles for {symbol}")
        return filtered_articles

# Initialize news collector
news_collector = NewsCollector(CONFIG['NEWS_API_KEY'])
print("News collector initialized successfully!")

## 4. Financial Data Extraction

Use yfinance to collect historical stock data including prices, volume, and technical indicators.

In [None]:
class StockDataCollector:
    """Collect stock price data and calculate technical indicators."""
    
    def __init__(self):
        pass
    
    def collect_stock_data(self, symbol: str, start_date: str, end_date: str) -> pd.DataFrame:
        """Collect stock price data for a symbol."""
        try:
            print(f"Collecting stock data for {symbol}...")
            
            ticker = yf.Ticker(symbol)
            data = ticker.history(start=start_date, end=end_date, interval='1d')
            
            if data.empty:
                print(f"No stock data found for {symbol}")
                return pd.DataFrame()
            
            # Reset index to make Date a column
            data = data.reset_index()
            data['Symbol'] = symbol
            
            print(f"Collected {len(data)} days of stock data for {symbol}")
            return data
            
        except Exception as e:
            print(f"Error collecting stock data for {symbol}: {e}")
            return pd.DataFrame()
    
    def calculate_technical_indicators(self, data: pd.DataFrame) -> pd.DataFrame:
        """Calculate technical indicators for stock data."""
        if data.empty:
            return data
            
        try:
            # Simple Moving Average (20 days)
            data['SMA_20'] = data['Close'].rolling(window=20).mean()
            
            # RSI (14 days)
            delta = data['Close'].diff()
            gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
            loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
            rs = gain / loss
            data['RSI_14'] = 100 - (100 / (1 + rs))
            
            # MACD
            exp1 = data['Close'].ewm(span=12).mean()
            exp2 = data['Close'].ewm(span=26).mean()
            data['MACD'] = exp1 - exp2
            data['MACD_Signal'] = data['MACD'].ewm(span=9).mean()
            
            # Bollinger Bands
            data['BB_Middle'] = data['Close'].rolling(window=20).mean()
            bb_std = data['Close'].rolling(window=20).std()
            data['BB_Upper'] = data['BB_Middle'] + (bb_std * 2)
            data['BB_Lower'] = data['BB_Middle'] - (bb_std * 2)
            
            # Volume indicators
            data['Volume_SMA_20'] = data['Volume'].rolling(window=20).mean()
            
            # Volatility (20-day rolling standard deviation of returns)
            data['Returns'] = data['Close'].pct_change()
            data['Volatility_20'] = data['Returns'].rolling(window=20).std()
            
            # Price change indicators
            data['Price_Change'] = data['Close'].diff()
            data['Price_Change_Pct'] = data['Close'].pct_change()
            
            print(f"Calculated technical indicators for {data['Symbol'].iloc[0]}")
            return data
            
        except Exception as e:
            print(f"Error calculating technical indicators: {e}")
            return data

# Initialize stock data collector
stock_collector = StockDataCollector()
print("Stock data collector initialized successfully!")

## 5. Data Collection Execution

Now let's collect the actual data for our prototype using the past 30 days.

In [None]:
# Collect data for all symbols
all_news_data = []
all_stock_data = []

print("Starting data collection for prototype...")
print(f"Date range: {CONFIG['PROTOTYPE_START_DATE']} to {CONFIG['PROTOTYPE_END_DATE']}")
print("="*60)

for symbol in CONFIG['STOCK_SYMBOLS']:
    print(f"\nProcessing {symbol}...")
    
    # Collect news data
    try:
        news_articles = news_collector.collect_news_for_stock(
            symbol, 
            CONFIG['PROTOTYPE_START_DATE'], 
            CONFIG['PROTOTYPE_END_DATE']
        )
        
        # Filter relevant articles
        relevant_articles = news_collector.filter_relevant_articles(news_articles, symbol)
        all_news_data.extend(relevant_articles)
        
    except Exception as e:
        print(f"Failed to collect news for {symbol}: {e}")
    
    # Collect stock data
    try:
        stock_data = stock_collector.collect_stock_data(
            symbol, 
            CONFIG['PROTOTYPE_START_DATE'], 
            CONFIG['PROTOTYPE_END_DATE']
        )
        
        if not stock_data.empty:
            stock_data = stock_collector.calculate_technical_indicators(stock_data)
            all_stock_data.append(stock_data)
            
    except Exception as e:
        print(f"Failed to collect stock data for {symbol}: {e}")

print("\\n" + "="*60)
print("Data collection completed!")
print(f"Total news articles collected: {len(all_news_data)}")
print(f"Total stock symbols with data: {len(all_stock_data)}")

# Create DataFrames
if all_news_data:
    news_df = pd.DataFrame(all_news_data)
    print(f"News DataFrame shape: {news_df.shape}")
else:
    news_df = pd.DataFrame()
    print("No news data collected")

if all_stock_data:
    stock_df = pd.concat(all_stock_data, ignore_index=True)
    print(f"Stock DataFrame shape: {stock_df.shape}")
else:
    stock_df = pd.DataFrame()
    print("No stock data collected")

## 6. Data Exploration and Visualization

Let's explore the collected data to understand its structure and quality.

In [None]:
# Explore news data
if not news_df.empty:
    print("NEWS DATA ANALYSIS")
    print("="*50)
    print(f"Total articles: {len(news_df)}")
    print(f"Date range: {news_df['publishedAt'].min()} to {news_df['publishedAt'].max()}")
    print(f"Symbols covered: {news_df['symbol'].value_counts()}")
    print(f"Average relevance score: {news_df['relevance_score'].mean():.2f}")
    
    # News by symbol
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    
    # Articles per symbol
    news_df['symbol'].value_counts().plot(kind='bar', ax=axes[0])
    axes[0].set_title('News Articles per Symbol')
    axes[0].set_xlabel('Symbol')
    axes[0].set_ylabel('Article Count')
    
    # Relevance score distribution
    news_df['relevance_score'].hist(bins=10, ax=axes[1])
    axes[1].set_title('Relevance Score Distribution')
    axes[1].set_xlabel('Relevance Score')
    axes[1].set_ylabel('Frequency')
    
    plt.tight_layout()
    plt.show()
    
    # Show sample articles
    print("\\nSample news articles:")
    for i, row in news_df.head(3).iterrows():
        print(f"\\n{row['symbol']}: {row['title']}")
        print(f"Published: {row['publishedAt']}")
        print(f"Relevance: {row['relevance_score']}")
else:
    print("No news data to analyze")

In [None]:
# Explore stock data
if not stock_df.empty:
    print("STOCK DATA ANALYSIS")
    print("="*50)
    print(f"Total records: {len(stock_df)}")
    print(f"Symbols: {stock_df['Symbol'].unique()}")
    print(f"Date range: {stock_df['Date'].min()} to {stock_df['Date'].max()}")
    
    # Stock price trends
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Stock Prices', 'Volume', 'RSI', 'Volatility'),
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]]
    )
    
    colors = ['blue', 'red', 'green', 'orange', 'purple']
    
    for i, symbol in enumerate(stock_df['Symbol'].unique()):
        symbol_data = stock_df[stock_df['Symbol'] == symbol]
        color = colors[i % len(colors)]
        
        # Stock prices
        fig.add_trace(
            go.Scatter(x=symbol_data['Date'], y=symbol_data['Close'], 
                      name=f'{symbol} Close', line=dict(color=color)),
            row=1, col=1
        )
        
        # Volume
        fig.add_trace(
            go.Scatter(x=symbol_data['Date'], y=symbol_data['Volume'], 
                      name=f'{symbol} Volume', line=dict(color=color)),
            row=1, col=2
        )
        
        # RSI
        fig.add_trace(
            go.Scatter(x=symbol_data['Date'], y=symbol_data['RSI_14'], 
                      name=f'{symbol} RSI', line=dict(color=color)),
            row=2, col=1
        )
        
        # Volatility
        fig.add_trace(
            go.Scatter(x=symbol_data['Date'], y=symbol_data['Volatility_20'], 
                      name=f'{symbol} Vol', line=dict(color=color)),
            row=2, col=2
        )
    
    fig.update_layout(height=800, title_text="Stock Data Overview")
    fig.show()
    
    # Statistical summary
    print("\\nStock price statistics:")
    print(stock_df.groupby('Symbol')['Close'].agg(['min', 'max', 'mean', 'std']).round(2))
else:
    print("No stock data to analyze")

## 7. News Embedding with FinBERT

Load FinBERT model and create embeddings for news articles to capture sentiment and meaning.

In [None]:
class NewsEmbedder:
    """Create embeddings for news articles using FinBERT."""
    
    def __init__(self, model_name: str = 'ProsusAI/finbert'):
        print(f"Loading FinBERT model: {model_name}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)
        
        # Move to GPU if available
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        self.model.eval()
        
        print(f"FinBERT model loaded on {self.device}")
    
    def embed_texts(self, texts: List[str], batch_size: int = 16) -> np.ndarray:
        """Create embeddings for a list of texts."""
        embeddings = []
        
        print(f"Creating embeddings for {len(texts)} texts...")
        
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i + batch_size]
            
            # Tokenize
            inputs = self.tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            ).to(self.device)
            
            # Get embeddings
            with torch.no_grad():
                outputs = self.model(**inputs)
                # Use [CLS] token embedding
                batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                embeddings.append(batch_embeddings)
        
        embeddings = np.vstack(embeddings)
        print(f"Created embeddings with shape: {embeddings.shape}")
        return embeddings

# Initialize news embedder (this may take a while to download the model)
if not news_df.empty:
    print("Initializing FinBERT for news embedding...")
    news_embedder = NewsEmbedder(CONFIG['FINBERT_MODEL'])
else:
    print("Skipping FinBERT initialization - no news data available")

In [None]:
# Create news embeddings
if not news_df.empty and 'news_embedder' in locals():
    print("Creating news embeddings...")
    
    # Extract processed texts
    news_texts = news_df['processed_text'].fillna('').tolist()
    
    # Create embeddings
    news_embeddings = news_embedder.embed_texts(news_texts, batch_size=8)
    
    # Add embeddings to DataFrame
    for i in range(news_embeddings.shape[1]):
        news_df[f'embedding_{i}'] = news_embeddings[:, i]
    
    print(f"News embeddings created: {news_embeddings.shape}")
    print(f"News DataFrame now has {len(news_df.columns)} columns")
    
    # Analyze embedding statistics
    embedding_cols = [col for col in news_df.columns if col.startswith('embedding_')]
    embedding_stats = news_df[embedding_cols].describe()
    
    print("\\nEmbedding statistics:")
    print(f"Mean embedding magnitude: {np.mean(np.linalg.norm(news_embeddings, axis=1)):.4f}")
    print(f"Embedding dimensions: {len(embedding_cols)}")
    
else:
    print("Skipping news embedding creation - no news data or embedder available")
    news_embeddings = None

## 8. Feature Engineering and Dataset Creation

Combine news embeddings with financial indicators to create training datasets with proper time alignment.

In [None]:
class FeatureEngineering:
    """Create features for training by combining news and stock data."""
    
    def __init__(self):
        pass
    
    def create_training_dataset(self, stock_df: pd.DataFrame, news_df: pd.DataFrame) -> pd.DataFrame:
        """Create training dataset with proper time alignment."""
        if stock_df.empty:
            print("No stock data available for feature engineering")
            return pd.DataFrame()
        
        print("Creating training dataset...")
        
        # Convert dates
        stock_df['Date'] = pd.to_datetime(stock_df['Date'])
        if not news_df.empty:
            news_df['publishedAt'] = pd.to_datetime(news_df['publishedAt'])
        
        training_data = []
        
        for symbol in stock_df['Symbol'].unique():
            symbol_stock_data = stock_df[stock_df['Symbol'] == symbol].sort_values('Date')
            symbol_news_data = news_df[news_df['symbol'] == symbol] if not news_df.empty else pd.DataFrame()
            
            print(f"Processing {symbol}: {len(symbol_stock_data)} stock records, {len(symbol_news_data)} news articles")
            
            # Create features for each trading day
            for idx, row in symbol_stock_data.iterrows():
                current_date = row['Date']
                
                # Skip if we don't have enough historical data
                if idx < 20:  # Need 20 days for technical indicators
                    continue
                
                # Create feature vector
                features = {}
                features['Symbol'] = symbol
                features['Date'] = current_date
                features['Target_Date'] = current_date + pd.Timedelta(days=7)
                
                # Financial features (5 key indicators)
                features['Close_Price'] = row['Close']
                features['SMA_20'] = row.get('SMA_20', 0)
                features['RSI_14'] = row.get('RSI_14', 50)
                features['MACD'] = row.get('MACD', 0)
                features['Volatility_20'] = row.get('Volatility_20', 0)
                
                # News features (aggregate embeddings from past 7 days)
                if not symbol_news_data.empty:
                    start_date = current_date - pd.Timedelta(days=7)
                    relevant_news = symbol_news_data[
                        (symbol_news_data['publishedAt'] >= start_date) & 
                        (symbol_news_data['publishedAt'] <= current_date)
                    ]
                    
                    if len(relevant_news) > 0:
                        # Aggregate news embeddings (mean)
                        embedding_cols = [col for col in relevant_news.columns if col.startswith('embedding_')]
                        if embedding_cols:
                            news_embedding = relevant_news[embedding_cols].mean().values
                            for i, emb_val in enumerate(news_embedding):
                                features[f'news_emb_{i}'] = emb_val
                        
                        features['news_count'] = len(relevant_news)
                        features['avg_relevance'] = relevant_news['relevance_score'].mean()
                    else:
                        # No news - use zero embeddings
                        for i in range(768):  # FinBERT embedding dimension
                            features[f'news_emb_{i}'] = 0.0
                        features['news_count'] = 0
                        features['avg_relevance'] = 0
                else:
                    # No news data - use zero embeddings
                    for i in range(768):
                        features[f'news_emb_{i}'] = 0.0
                    features['news_count'] = 0
                    features['avg_relevance'] = 0
                
                # Target: stock price 7 days later
                target_date = current_date + pd.Timedelta(days=7)
                future_data = symbol_stock_data[symbol_stock_data['Date'] >= target_date]
                
                if len(future_data) > 0:
                    features['Target_Price'] = future_data['Close'].iloc[0]
                    features['Price_Change'] = features['Target_Price'] - features['Close_Price']
                    features['Price_Change_Pct'] = (features['Price_Change'] / features['Close_Price']) * 100
                    
                    training_data.append(features)
        
        if training_data:
            training_df = pd.DataFrame(training_data)
            print(f"Created training dataset with {len(training_df)} records")
            print(f"Features: {len(training_df.columns)} columns")
            return training_df
        else:
            print("No training data created")
            return pd.DataFrame()

# Create feature engineering instance and training dataset
feature_engineer = FeatureEngineering()
training_df = feature_engineer.create_training_dataset(stock_df, news_df)

if not training_df.empty:
    print(f"\\nTraining dataset summary:")
    print(f"Shape: {training_df.shape}")
    print(f"Date range: {training_df['Date'].min()} to {training_df['Date'].max()}")
    print(f"Symbols: {training_df['Symbol'].value_counts()}")
    
    # Remove rows with missing targets
    training_df = training_df.dropna(subset=['Target_Price'])
    print(f"After removing missing targets: {training_df.shape}")
else:
    print("No training dataset created")

## 9. Model Architecture and Training

Create a PyTorch MLP model that combines news embeddings and financial features for price prediction.

In [None]:
class StockPredictionMLP(nn.Module):
    """Multi-layer perceptron for stock price prediction."""
    
    def __init__(self, input_dim: int, hidden_dims: List[int] = [512, 256, 128], dropout_rate: float = 0.3):
        super().__init__()
        
        layers = []
        prev_dim = input_dim
        
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout_rate)
            ])
            prev_dim = hidden_dim
        
        # Output layer
        layers.append(nn.Linear(prev_dim, 1))
        
        self.network = nn.Sequential(*layers)
        
    def forward(self, x):
        return self.network(x)

class StockDataset(Dataset):
    """PyTorch dataset for stock prediction."""
    
    def __init__(self, features: np.ndarray, targets: np.ndarray):
        self.features = torch.FloatTensor(features)
        self.targets = torch.FloatTensor(targets)
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

def train_model(model, train_loader, val_loader, num_epochs: int = 20, learning_rate: float = 0.001):
    """Train the stock prediction model."""
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    train_losses = []
    val_losses = []
    
    print(f"Training on {device}")
    print(f"Training samples: {len(train_loader.dataset)}")
    print(f"Validation samples: {len(val_loader.dataset)}")
    
    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0
        
        for batch_features, batch_targets in train_loader:
            batch_features, batch_targets = batch_features.to(device), batch_targets.to(device)
            
            optimizer.zero_grad()
            outputs = model(batch_features).squeeze()
            loss = criterion(outputs, batch_targets)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
        
        # Validation
        model.eval()
        val_loss = 0
        
        with torch.no_grad():
            for batch_features, batch_targets in val_loader:
                batch_features, batch_targets = batch_features.to(device), batch_targets.to(device)
                outputs = model(batch_features).squeeze()
                loss = criterion(outputs, batch_targets)
                val_loss += loss.item()
        
        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        
        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)
        
        if epoch % 5 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}')
    
    return train_losses, val_losses

print("Model classes defined successfully!")

In [None]:
# Prepare data for training
if not training_df.empty:
    print("Preparing data for training...")
    
    # Separate features and targets
    feature_cols = [col for col in training_df.columns if col not in ['Symbol', 'Date', 'Target_Date', 'Target_Price', 'Price_Change', 'Price_Change_Pct']]
    
    X = training_df[feature_cols].values
    y = training_df['Target_Price'].values
    
    print(f"Feature matrix shape: {X.shape}")
    print(f"Target vector shape: {y.shape}")
    
    # Handle missing values
    X = np.nan_to_num(X, nan=0.0)
    
    # Normalize features
    scaler_X = StandardScaler()
    X_scaled = scaler_X.fit_transform(X)
    
    # Split data temporally (80% train, 20% validation)
    split_idx = int(0.8 * len(X_scaled))
    X_train, X_val = X_scaled[:split_idx], X_scaled[split_idx:]
    y_train, y_val = y[:split_idx], y[split_idx:]
    
    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Validation set: {X_val.shape[0]} samples")
    
    # Create datasets and data loaders
    train_dataset = StockDataset(X_train, y_train)
    val_dataset = StockDataset(X_val, y_val)
    
    train_loader = DataLoader(train_dataset, batch_size=CONFIG['BATCH_SIZE'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=CONFIG['BATCH_SIZE'], shuffle=False)
    
    # Initialize model
    input_dim = X_scaled.shape[1]
    model = StockPredictionMLP(input_dim=input_dim)
    
    print(f"Model initialized with input dimension: {input_dim}")
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    # Train model
    print("\\nStarting training...")
    train_losses, val_losses = train_model(
        model, train_loader, val_loader, 
        num_epochs=CONFIG['EPOCHS'], 
        learning_rate=CONFIG['LEARNING_RATE']
    )
    
    print("Training completed!")
    
else:
    print("No training data available for model training")

## 10. Model Evaluation and Visualization

Evaluate the trained model and visualize predictions vs actual prices.

In [None]:
# Evaluate model performance
if not training_df.empty and 'model' in locals():
    print("Evaluating model performance...")
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.eval()
    
    # Make predictions on validation set
    with torch.no_grad():
        val_predictions = []
        val_actuals = []
        
        for batch_features, batch_targets in val_loader:
            batch_features = batch_features.to(device)
            outputs = model(batch_features).squeeze()
            val_predictions.extend(outputs.cpu().numpy())
            val_actuals.extend(batch_targets.numpy())
    
    val_predictions = np.array(val_predictions)
    val_actuals = np.array(val_actuals)
    
    # Calculate metrics
    mae = mean_absolute_error(val_actuals, val_predictions)
    rmse = np.sqrt(mean_squared_error(val_actuals, val_predictions))
    r2 = r2_score(val_actuals, val_predictions)
    
    # Calculate directional accuracy
    actual_direction = np.sign(np.diff(val_actuals))
    pred_direction = np.sign(np.diff(val_predictions))
    directional_accuracy = np.mean(actual_direction == pred_direction) * 100
    
    print(f"\\nModel Performance Metrics:")
    print(f"Mean Absolute Error (MAE): ${mae:.2f}")
    print(f"Root Mean Square Error (RMSE): ${rmse:.2f}")
    print(f"R² Score: {r2:.4f}")
    print(f"Directional Accuracy: {directional_accuracy:.2f}%")
    
    # Plot training curves
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    
    # Training/Validation Loss
    axes[0].plot(train_losses, label='Training Loss', color='blue')
    axes[0].plot(val_losses, label='Validation Loss', color='red')
    axes[0].set_title('Training and Validation Loss')
    axes[0].set_xlabel('Epoch')
    axes[0].set_ylabel('Loss')
    axes[0].legend()
    axes[0].grid(True)
    
    # Predictions vs Actuals
    axes[1].scatter(val_actuals, val_predictions, alpha=0.6, color='blue')
    axes[1].plot([val_actuals.min(), val_actuals.max()], [val_actuals.min(), val_actuals.max()], 'r--', lw=2)
    axes[1].set_xlabel('Actual Prices')
    axes[1].set_ylabel('Predicted Prices')
    axes[1].set_title(f'Predictions vs Actuals (R² = {r2:.4f})')
    axes[1].grid(True)
    
    plt.tight_layout()
    plt.show()
    
    # Plot time series predictions
    if len(val_predictions) > 10:
        plt.figure(figsize=(12, 6))
        indices = range(len(val_predictions))
        plt.plot(indices, val_actuals, label='Actual', color='blue', linewidth=2)
        plt.plot(indices, val_predictions, label='Predicted', color='red', linewidth=2, alpha=0.7)
        plt.title('Time Series: Actual vs Predicted Stock Prices')
        plt.xlabel('Time Index')
        plt.ylabel('Stock Price ($)')
        plt.legend()
        plt.grid(True)
        plt.show()
    
else:
    print("No trained model available for evaluation")

## 11. Data Storage and Cloud Integration

Save processed data locally and demonstrate Google Cloud Storage integration for scalability.

In [None]:
# Create data directory
import os
os.makedirs('data', exist_ok=True)
os.makedirs('data/raw', exist_ok=True)
os.makedirs('data/processed', exist_ok=True)
os.makedirs('models', exist_ok=True)

print("Created local data directories")

# Save data locally
if not news_df.empty:
    news_file = 'data/raw/prototype_news_data.parquet'
    news_df.to_parquet(news_file, index=False)
    print(f"Saved news data: {news_file} ({os.path.getsize(news_file) / 1024:.1f} KB)")

if not stock_df.empty:
    stock_file = 'data/raw/prototype_stock_data.parquet'
    stock_df.to_parquet(stock_file, index=False)
    print(f"Saved stock data: {stock_file} ({os.path.getsize(stock_file) / 1024:.1f} KB)")

if not training_df.empty:
    training_file = 'data/processed/prototype_training_data.parquet'
    training_df.to_parquet(training_file, index=False)
    print(f"Saved training data: {training_file} ({os.path.getsize(training_file) / 1024:.1f} KB)")

# Save model
if 'model' in locals():
    model_file = 'models/prototype_model.pth'
    torch.save({
        'model_state_dict': model.state_dict(),
        'scaler': scaler_X,
        'feature_columns': feature_cols,
        'model_config': {
            'input_dim': input_dim,
            'hidden_dims': [512, 256, 128],
            'dropout_rate': 0.3
        }
    }, model_file)
    print(f"Saved model: {model_file} ({os.path.getsize(model_file) / 1024:.1f} KB)")

print("\\nLocal data saving completed!")

# Demonstrate GCS integration (if available)
if GCS_AVAILABLE and 'google.colab' not in sys.modules:
    print("\\n" + "="*50)
    print("GOOGLE CLOUD STORAGE INTEGRATION")
    print("="*50)
    print("To enable GCS integration:")
    print("1. Set up GCP project and enable Cloud Storage API")
    print("2. Create a service account and download credentials")
    print("3. Save credentials as 'gcp-credentials.json'")
    print("4. Update CONFIG['GCP_PROJECT_ID'] and CONFIG['GCP_BUCKET_NAME']")
    print("5. Uncomment the code below to upload data to GCS")
    
    # Uncomment this section when GCS is properly configured
    # try:
    #     from google.cloud import storage
    #     
    #     # Initialize GCS client
    #     client = storage.Client(project=CONFIG['GCP_PROJECT_ID'])
    #     bucket = client.bucket(CONFIG['GCP_BUCKET_NAME'])
    #     
    #     # Upload files
    #     files_to_upload = [
    #         ('data/raw/prototype_news_data.parquet', 'raw_data/prototype_news_data.parquet'),
    #         ('data/raw/prototype_stock_data.parquet', 'raw_data/prototype_stock_data.parquet'),
    #         ('data/processed/prototype_training_data.parquet', 'processed_data/prototype_training_data.parquet'),
    #         ('models/prototype_model.pth', 'models/prototype_model.pth')
    #     ]
    #     
    #     for local_file, gcs_path in files_to_upload:
    #         if os.path.exists(local_file):
    #             blob = bucket.blob(gcs_path)
    #             blob.upload_from_filename(local_file)
    #             print(f"Uploaded {local_file} to GCS: gs://{CONFIG['GCP_BUCKET_NAME']}/{gcs_path}")
    #     
    #     print("GCS upload completed!")
    #     
    # except Exception as e:
    #     print(f"GCS upload failed: {e}")
    #     print("Make sure GCP credentials are properly configured")

else:
    print("\\nGCS integration not available (running in Colab or missing dependencies)")

## 12. Cost Monitoring and Next Steps

Implement cost monitoring strategies and outline next steps for full system deployment.

In [None]:
# Calculate resource usage and cost estimates
print("RESOURCE USAGE AND COST ANALYSIS")
print("="*60)

# Data usage
total_data_size = 0
data_files = [
    'data/raw/prototype_news_data.parquet',
    'data/raw/prototype_stock_data.parquet', 
    'data/processed/prototype_training_data.parquet',
    'models/prototype_model.pth'
]

for file_path in data_files:
    if os.path.exists(file_path):
        size_kb = os.path.getsize(file_path) / 1024
        total_data_size += size_kb
        print(f"{file_path}: {size_kb:.1f} KB")

print(f"\\nTotal data size: {total_data_size:.1f} KB ({total_data_size/1024:.2f} MB)")

# Estimate full-scale costs
print("\\nFULL-SCALE COST ESTIMATES (1 year of data)")
print("-" * 50)

# NewsAPI costs
daily_articles_per_stock = 50
total_stocks = len(CONFIG['STOCK_SYMBOLS'])
days_per_year = 365
total_api_calls = daily_articles_per_stock * total_stocks * days_per_year

print(f"NewsAPI Usage:")
print(f"- Articles per stock per day: {daily_articles_per_stock}")
print(f"- Total stocks: {total_stocks}")
print(f"- Total API calls per year: {total_api_calls:,}")
print(f"- Cost: FREE (within 1000 requests/day limit)")

# GCS storage costs
estimated_full_data_size_gb = (total_data_size * 365) / (1024 * 1024)  # Scale up for full year
gcs_storage_cost_per_gb_per_month = 0.020  # Standard storage
gcs_monthly_cost = estimated_full_data_size_gb * gcs_storage_cost_per_gb_per_month

print(f"\\nGoogle Cloud Storage:")
print(f"- Estimated full dataset size: {estimated_full_data_size_gb:.2f} GB")
print(f"- Monthly storage cost: ${gcs_monthly_cost:.2f}")
print(f"- Annual storage cost: ${gcs_monthly_cost * 12:.2f}")

# Compute costs (training)
print(f"\\nCompute Costs (Training):")
print(f"- Colab Pro: $9.99/month (100 compute units)")
print(f"- Alternative: Kaggle (30 hours/week free GPU)")
print(f"- GCP Compute Engine: ~$0.50/hour (preemptible GPU)")

# Total estimated costs
total_monthly_cost = gcs_monthly_cost + 9.99  # GCS + Colab Pro
print(f"\\nESTIMATED TOTAL MONTHLY COST: ${total_monthly_cost:.2f}")
print(f"ESTIMATED TOTAL ANNUAL COST: ${total_monthly_cost * 12:.2f}")

print("\\n" + "="*60)
print("COST OPTIMIZATION STRATEGIES")
print("="*60)
print("1. Use free tiers extensively:")
print("   - GCP Free Tier: $300 credit + always-free resources")
print("   - Colab Free: Limited but sufficient for prototyping")
print("   - NewsAPI Free: 1000 requests/day")

print("\\n2. Efficient data processing:")
print("   - Batch process news embeddings")
print("   - Use parquet format for compression")
print("   - Implement data partitioning by date/symbol")

print("\\n3. Smart resource usage:")
print("   - Use spot/preemptible instances")
print("   - Scale compute resources dynamically") 
print("   - Cache processed embeddings")

print("\\n4. Set up billing alerts:")
print("   - Daily spending alerts")
print("   - Budget limits with automatic shutdowns")
print("   - Monitor API usage quotas")

print("\\n" + "="*60)
print("NEXT STEPS FOR FULL IMPLEMENTATION")
print("="*60)
print("Week 2-3: Full Data Collection & Model Training")
print("- Extend date range to full year (Sep 2024 - Aug 2025)")
print("- Implement parallel processing for multiple stocks")
print("- Scale up to 10-20 stocks for better diversification")
print("- Implement advanced hyperparameter tuning with Optuna")
print("- Add more sophisticated feature engineering")

print("\\nWeek 3-4: Deployment & Monitoring")
print("- Create FastAPI wrapper for model serving")
print("- Deploy to Cloud Run or Hugging Face Spaces")
print("- Build Streamlit dashboard for visualization")
print("- Implement real-time prediction pipeline")
print("- Set up automated model retraining")

print("\\nWeek 4+: Production & Optimization")
print("- Implement A/B testing for model versions")
print("- Add more data sources (Twitter, Reddit, SEC filings)")
print("- Implement ensemble methods")
print("- Create backtesting framework")
print("- Add risk management features")

print(f"\\n{'='*60}")
print("PROTOTYPE COMPLETED SUCCESSFULLY!")
print(f"{'='*60}")