In [15]:
import yfinance as yf
import pandas as pd
from google.colab import files
import numpy as np
from sklearn.model_selection import train_test_split

In [20]:
def fetch_and_clean_stock_data(ticker, start_date, end_date, interval="1d"):
    """
    Fetches historical stock data, keeps only the 'Close' column, and fills missing values.

    The frame returned by ``yf.download`` is reduced to its 'Close' column, rows
    that are entirely NaN are dropped, and remaining gaps are forward-filled
    and then back-filled.

    Args:
        ticker (str): Stock ticker symbol.
        start_date (str): Start date for data fetching.
        end_date (str): End date for data fetching.
        interval (str): Data interval (e.g., "1d").

    Returns:
        pd.DataFrame: Cleaned stock data with only the 'Close' column. When the
        download yields exactly one close column, that column is labelled
        plainly 'Close' so that ``frame['Close']`` is a Series.
    """
    raw_data = yf.download(ticker, start=start_date, end=end_date, interval=interval)
    close_data = raw_data[['Close']]

    # Some yfinance versions label columns with a (field, ticker) MultiIndex even
    # for a single ticker. In that case `frame['Close']` is a one-column DataFrame
    # rather than a Series, so collapse the label back to a flat 'Close'.
    # Frames with several close columns are left untouched.
    if isinstance(close_data.columns, pd.MultiIndex) and close_data.shape[1] == 1:
        close_data = close_data.copy()
        close_data.columns = ['Close']

    # Forward-fill first so gaps take the previous value; the back-fill only
    # covers NaNs that have no earlier value to copy from.
    return close_data.dropna(how='all').ffill().bfill()

def create_sliding_window(data, window_size=5):
    """
    Creates a dataset using a sliding window approach and returns corresponding dates.

    Sample ``i`` takes the ``window_size`` consecutive 'Close' values starting at
    row ``i`` as features and the 'Close' value at row ``i + window_size`` as the
    target. Rows are used in the order they appear in ``data``.

    Args:
        data (pd.DataFrame): DataFrame containing a 'Close' column. If
            ``data['Close']`` is itself a one-column DataFrame (MultiIndex
            columns), that single column is used.
        window_size (int): Number of previous rows to consider in each sample.

    Returns:
        X (np.ndarray): Features with shape (num_samples, window_size); column 0
            holds the oldest price of each window.
        y (np.ndarray): Targets with shape (num_samples,), the closing price
            that follows each window.
        dates (pd.Series): Index labels of ``data`` corresponding to each target.

        When ``data`` has ``window_size`` rows or fewer, ``num_samples`` is 0 and
        ``X`` still has shape (0, window_size).

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size}")

    prices = np.asarray(data['Close'])
    # A one-column 2-D block (MultiIndex columns) is treated as a plain 1-D
    # price series so that y ends up 1-D as documented.
    if prices.ndim == 2 and prices.shape[1] == 1:
        prices = prices[:, 0]

    num_samples = max(len(prices) - window_size, 0)

    # Row i of `positions` is [i, i+1, ..., i+window_size-1]; indexing with it
    # gathers every window in one vectorised step (and returns a fresh copy).
    positions = np.arange(num_samples)[:, None] + np.arange(window_size)[None, :]
    windows = prices[positions]
    # Flatten any trailing axes per sample (mirrors the per-window flatten);
    # the explicit width keeps the reshape valid when there are zero samples.
    X = windows.reshape(num_samples, int(np.prod(windows.shape[1:])))

    # Copy so the returned targets never alias the caller's DataFrame buffer.
    y = prices[window_size:].copy()
    dates = pd.Series(list(data.index[window_size:]))

    return X, y, dates

def save_sliding_window_data(X, y, dates, ticker, window_size, test_size=0.2, shuffle=True):
    """
    Splits the data into training and testing sets, and saves them to CSV files with dates.

    Two files are written to the current working directory:
    ``{ticker}_train_data.csv`` and ``{ticker}_test_data.csv``. Each has the
    columns ``Close_lag_1 .. Close_lag_{window_size}``, ``Target`` and ``Date``.
    Columns follow the order of ``X``, so with windows built by
    ``create_sliding_window`` ``Close_lag_1`` is the oldest price in the window.

    Args:
        X (np.ndarray): Feature matrix with shape (num_samples, window_size).
        y (np.ndarray): Target vector with shape (num_samples,).
        dates (pd.Series): Series containing dates for each sample.
        ticker (str): Stock ticker symbol, used for naming the file.
        window_size (int): Number of previous days in each sample.
        test_size (float): Proportion of the data to use as the test set.
        shuffle (bool): Forwarded to ``train_test_split``. The default (True)
            keeps the original randomised split with ``random_state=42``.
            Pass False for a chronological split in which the last samples
            form the test set. Because consecutive sliding windows overlap,
            a shuffled split places prices from the test period inside
            training rows, which is look-ahead leakage for forecasting use.

    Returns:
        tuple[str, str]: ``(train_filename, test_filename)`` of the written CSVs.
    """
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test, dates_train, dates_test = train_test_split(
        X, y, dates, test_size=test_size, random_state=42, shuffle=shuffle
    )

    # Both files share the same feature column names.
    lag_columns = [f'Close_lag_{i+1}' for i in range(window_size)]

    def _build_frame(features, targets, sample_dates):
        # Assemble one output table: lag features, then target, then date.
        frame = pd.DataFrame(features, columns=lag_columns)
        frame['Target'] = targets
        # .values drops the shuffled Series index so rows align positionally.
        frame['Date'] = sample_dates.values
        return frame

    train_df = _build_frame(X_train, y_train, dates_train)
    test_df = _build_frame(X_test, y_test, dates_test)

    # Save DataFrames to CSV files
    train_filename = f"{ticker}_train_data.csv"
    test_filename = f"{ticker}_test_data.csv"
    train_df.to_csv(train_filename, index=False)
    test_df.to_csv(test_filename, index=False)

    print(f"Training data saved as {train_filename}")
    print(f"Testing data saved as {test_filename}")

    return train_filename, test_filename

def save_file(filename):
    """
    Sends a file to the local machine through Colab's download helper.

    Args:
        filename (str): Path of the file that should be downloaded.
    """
    # Thin wrapper: all of the work is done by google.colab's files.download.
    files.download(filename)

In [22]:
# --- Run configuration ---
ticker = "AAPL"
start_date, end_date = "2019-01-01", "2023-12-31"
window_size = 365  # number of consecutive rows in each sample window

# --- Pipeline: download -> sliding windows -> split and write CSVs ---
stock_data = fetch_and_clean_stock_data(
    ticker=ticker, start_date=start_date, end_date=end_date
)
X, y, dates = create_sliding_window(data=stock_data, window_size=window_size)
train_filename, test_filename = save_sliding_window_data(
    X=X, y=y, dates=dates, ticker=ticker, window_size=window_size
)

[*********************100%***********************]  1 of 1 completed


Training data saved as AAPL_train_data.csv
Testing data saved as AAPL_test_data.csv


In [23]:
# Download the training CSV whose path was returned by save_sliding_window_data.
save_file(train_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [24]:
# Download the testing CSV whose path was returned by save_sliding_window_data.
save_file(test_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>