In [56]:
import yfinance as yf
import pandas as pd
from google.colab import files
import numpy as np
from sklearn.model_selection import train_test_split

In [57]:
def fetch_and_clean_stock_data(ticker, start_date, end_date, interval="1d"):
    """
    Downloads price history with yfinance and returns a cleaned 'Close'-only frame.

    Args:
        ticker: Ticker symbol(s) forwarded to `yf.download`.
        start_date: Forwarded to `yf.download` as `start`.
        end_date: Forwarded to `yf.download` as `end`.
        interval (str): Sampling interval forwarded to `yf.download`.

    Returns:
        pd.DataFrame: The result of selecting `['Close']` from the download,
        with all-NaN rows removed and any remaining gaps forward-filled and
        then back-filled.
    """
    raw_prices = yf.download(ticker, start=start_date, end=end_date, interval=interval)

    # Keep the list-style selection so the result is a DataFrame, not a Series.
    close_prices = raw_prices[['Close']]

    # Remove rows with no value at all, then patch whatever gaps are left:
    # forward-fill first, back-fill to cover leading gaps.
    cleaned = close_prices.dropna(how='all').ffill().bfill()
    return cleaned

def create_sliding_window(data, window_size=5):
    """
    Creates a dataset using a sliding window approach.

    Each sample consists of `window_size` consecutive closing prices (in the
    order they appear in `data`), and its target is the closing price that
    immediately follows the window.

    Args:
        data (pd.DataFrame): DataFrame whose 'Close' selection holds a single
            price series. `data['Close']` may be a Series or a one-column
            DataFrame (as happens when the frame has multi-level columns).
        window_size (int): Number of previous rows to consider in each sample.

    Returns:
        X (np.ndarray): Features with shape (num_samples, window_size), where
            num_samples = max(len(data) - window_size, 0).
        y (np.ndarray): Targets with shape (num_samples,).

    Raises:
        ValueError: If `window_size` is smaller than 1, or if `data['Close']`
            contains more than one column.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be a positive integer, got {window_size}")

    closes = np.asarray(data['Close'])
    if closes.ndim == 2 and closes.shape[1] == 1:
        # One-column DataFrame: collapse to 1-D so y is (n,) rather than (n, 1).
        closes = closes[:, 0]
    elif closes.ndim != 1:
        raise ValueError(
            "data['Close'] must contain exactly one price series, "
            f"got an array of shape {closes.shape}"
        )

    num_samples = closes.shape[0] - window_size
    if num_samples <= 0:
        # Not enough rows for a single (window, target) pair; keep the
        # documented shapes so callers can still inspect X.shape[1].
        return (
            np.empty((0, window_size), dtype=closes.dtype),
            np.empty((0,), dtype=closes.dtype),
        )

    # Row i of the index grid is [i, i+1, ..., i+window_size-1]; fancy indexing
    # builds every window at once and returns a fresh (writable) array.
    window_index = np.arange(num_samples)[:, None] + np.arange(window_size)[None, :]
    X = closes[window_index]
    y = closes[window_size:].copy()

    return X, y

def _windows_to_frame(X, y, window_size):
    """Builds a DataFrame with one `Close_lag_<k>` column per window position plus `Target`."""
    # Columns are numbered by position within each row of X (1-based), so
    # `Close_lag_1` is whatever value sits first in the window.
    columns = [f'Close_lag_{i+1}' for i in range(window_size)]
    frame = pd.DataFrame(X, columns=columns)
    # ravel() keeps a (n, 1) target from being treated as a 2-D block.
    frame['Target'] = np.ravel(y)
    return frame


def save_sliding_window_data(X, y, ticker, window_size, test_size=0.2, shuffle=False):
    """
    Splits the data into training and testing sets, and saves them to CSV files.

    The split is chronological by default (`shuffle=False`): the last
    `test_size` fraction of the samples becomes the test set. Consecutive
    sliding-window samples share almost all of their values, so a shuffled
    split places near-duplicate rows (and test targets that appear as training
    features) on both sides, which inflates test scores. Pass `shuffle=True`
    to get a random split seeded with `random_state=42`.

    Side effects: writes `<ticker>_train_data.csv` and `<ticker>_test_data.csv`
    to the current working directory, prints both file names, and calls
    `files.download` on each file (Colab browser download).

    Args:
        X (np.ndarray): Feature matrix with shape (num_samples, window_size).
        y (np.ndarray): Target vector with shape (num_samples,).
        ticker (str): Stock ticker symbol, used for naming the file.
        window_size (int): Number of previous days in each sample.
        test_size (float): Proportion of the data to use as the test set.
        shuffle (bool): Whether to shuffle samples before splitting.
    """
    # Split the data into training and testing sets. random_state only has an
    # effect when shuffle=True.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, shuffle=shuffle, random_state=42
    )

    train_df = _windows_to_frame(X_train, y_train, window_size)
    test_df = _windows_to_frame(X_test, y_test, window_size)

    # Save the DataFrames to CSV files
    train_filename = f"{ticker}_train_data.csv"
    test_filename = f"{ticker}_test_data.csv"
    train_df.to_csv(train_filename, index=False)
    test_df.to_csv(test_filename, index=False)

    print(f"Training data saved as {train_filename}")
    print(f"Testing data saved as {test_filename}")

    # Download the files to the local machine (Colab)
    files.download(train_filename)
    files.download(test_filename)

In [58]:
# Run configuration for the end-to-end export below.
ticker = "AAPL"  # Example stock ticker
start_date = "2019-01-01"
end_date = "2023-12-31"
# Window length is counted in rows of the downloaded data (the fetch below uses
# the default "1d" interval), not in calendar days.
window_size = 365

# Fetch, clean, and prepare data
stock_data = fetch_and_clean_stock_data(ticker, start_date, end_date)
# X holds one row of `window_size` consecutive closes per sample; y holds the
# close that follows each window.
X, y = create_sliding_window(stock_data, window_size=window_size)

# Split into train/test, write both CSVs, and trigger the Colab downloads.
save_sliding_window_data(X, y, ticker=ticker, window_size=window_size)

[*********************100%***********************]  1 of 1 completed


Training data saved as AAPL_train_data.csv
Testing data saved as AAPL_test_data.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>