In [1]:
!pip install yfinance pandas



In [3]:
import logging
import pandas as pd
import yfinance as yf
from datetime import datetime

In [4]:
# Date window for the download, as ISO "YYYY-MM-DD" strings.
# The end of the window is whatever "today" is when this cell runs.
start_date = "2024-01-01"
end_date = datetime.today().date().isoformat()

# Symbols to process, in the order they will be fetched.
tickers = [
    "AAPL",
    "GOOGL",
    "MSFT",
]

In [5]:
# Container for the per-ticker results, keyed by ticker symbol.
stock_data_dictionary = dict()

# Announce the run parameters; the trailing "\n" leaves a blank line
# after the date range in the cell output.
print(
    f"Starting extraction for: {tickers}",
    f"Date Range: {start_date} to {end_date}\n",
    sep="\n",
)

Starting extraction for: ['AAPL', 'GOOGL', 'MSFT']
Date Range: 2024-01-01 to 2026-02-13



In [6]:
# --- 2. MAIN LOOP (The Fetcher & Cleaner) ---
# For every ticker: download its history, reduce it to a Price/Returns
# frame indexed by plain dates, and store it in stock_data_dictionary.
# A failure on one ticker is reported and does not stop the others.
for ticker in tickers:
    print(f"Processing {ticker}...")

    try:
        # A. Download the raw history for the configured window.
        # NOTE(review): confirm whether yfinance treats `end` as exclusive;
        # if it does, the bar for end_date itself is never returned.
        stock = yf.Ticker(ticker)
        df = stock.history(start=start_date, end=end_date)

        if not df.empty:
            # B. Keep only the closing price (renamed to "Price"), add the
            #    day-over-day percentage change, and drop rows containing
            #    NaN (the first row has no previous price to compare to).
            df = (
                df[["Close"]]
                .rename(columns={"Close": "Price"})
                .assign(Returns=lambda prices: prices["Price"].pct_change())
                .dropna()
            )

            # C. Replace the timestamp index with bare calendar dates and
            #    label it "Date".
            df.index = pd.Index(df.index.date, name="Date")

            # D. Store the cleaned frame under its ticker symbol.
            stock_data_dictionary[ticker] = df
            print(f"✅ {ticker} successfully added.")
        else:
            # Nothing came back for this symbol; move on to the next one.
            print(f"⚠️ Warning: No data found for {ticker}")

    except Exception as e:
        print(f"❌ Error processing {ticker}: {e}")

Processing AAPL...
✅ AAPL successfully added.
Processing GOOGL...
✅ GOOGL successfully added.
Processing MSFT...
✅ MSFT successfully added.


In [7]:
# --- 3. INSPECT THE RESULTS ---
summary_rule = "=" * 30
print(f"\n{summary_rule}")
print("FINAL DATASET SUMMARY")
print(summary_rule)

# Iterating a dict yields its keys, i.e. the tickers that were stored.
print(f"Stocks stored: {list(stock_data_dictionary)}")

# Preview the first requested ticker, but only if its download succeeded.
first_ticker = tickers[0]
first_frame = stock_data_dictionary.get(first_ticker)
if first_frame is not None:
    print(f"\nSample Data for {first_ticker}:")
    print(first_frame.head())

    print(f"\nShape of {first_ticker} dataframe: {first_frame.shape}")


FINAL DATASET SUMMARY
Stocks stored: ['AAPL', 'GOOGL', 'MSFT']

Sample Data for AAPL:
                 Price   Returns
Date                            
2024-01-03  182.355606 -0.007488
2024-01-04  180.039688 -0.012700
2024-01-05  179.317154 -0.004013
2024-01-08  183.652130  0.024175
2024-01-09  183.236435 -0.002263

Shape of AAPL dataframe: (530, 2)


In [8]:
import pandas as pd
from datetime import timedelta

# --- CONFIGURATION ---
LOOKBACK_WINDOW = 30  # How many days of past data you want to keep for the model

print("Starting Data Processing...")

# distinct_dates: one set of index dates per ticker (input to the intersection).
# normalized_data: per-ticker copies whose index holds plain date objects.
distinct_dates = []
normalized_data = {}

for ticker, df in stock_data_dictionary.items():
    # Work on a copy so the frames in stock_data_dictionary stay untouched.
    df_copy = df.copy()

    # Force the index to be just date objects (no time). Building the Index
    # explicitly keeps the original index name; assigning the bare array
    # would silently reset it to None.
    df_copy.index = pd.Index(
        pd.to_datetime(df_copy.index).date, name=df.index.name
    )

    # Store for the alignment step and record this ticker's dates.
    normalized_data[ticker] = df_copy
    distinct_dates.append(set(df_copy.index))

# set.intersection(*[]) raises TypeError, so handle the case where no
# ticker was stored (e.g. every download failed or came back empty).
if distinct_dates:
    common_dates = sorted(set.intersection(*distinct_dates))
else:
    print("⚠️ Warning: No stock data available to align.")
    common_dates = []

print(f"Found {len(common_dates)} common dates across all tickers.")
# Restrict every normalized frame to the dates shared by all tickers.
# Label-based .loc selection returns the rows in common_dates order.
aligned_stock_data = {
    ticker: df.loc[common_dates] for ticker, df in normalized_data.items()
}

print("✅ Data Alignment Complete.")

Starting Data Processing...
Found 530 common dates across all tickers.
✅ Data Alignment Complete.


In [9]:
print(f"\nExtracting recent {LOOKBACK_WINDOW} days of prices...")

# Maps each ticker to a plain list of its most recent prices.
recent_prices = {}

for ticker, df in aligned_stock_data.items():
    if not df.empty:
        # The window is measured in calendar days back from the last
        # available date, so it spans weekends and usually holds fewer
        # than LOOKBACK_WINDOW rows.
        window_start = df.index[-1] - timedelta(days=LOOKBACK_WINDOW)
        in_window = df.index >= window_start

        # Only the "Price" column is kept, converted to built-in floats.
        recent_prices[ticker] = list(
            map(float, df.loc[in_window, "Price"].tolist())
        )
    else:
        # An empty frame has no last date to anchor the window on.
        recent_prices[ticker] = []

print("✅ Recent prices extracted.")


# --- 3. INSPECTION ---
print("\n" + "="*30)
print("PROCESSING SUMMARY")
print("="*30)

# First aligned ticker, or None when nothing was aligned. Indexing
# list(...)[0] would raise IndexError on an empty dictionary.
first_ticker = next(iter(aligned_stock_data), None)

if first_ticker is None:
    print("⚠️ Warning: No aligned data available to inspect.")
else:
    print(f"Aligned Data for {first_ticker} (Tail):")
    print(aligned_stock_data[first_ticker].tail())

    print(f"\nRecent Price List for {first_ticker} (Length: {len(recent_prices[first_ticker])}):")
    print(recent_prices[first_ticker])


Extracting recent 30 days of prices...
✅ Recent prices extracted.

PROCESSING SUMMARY
Aligned Data for AAPL (Tail):
                 Price   Returns
2026-02-06  277.859985  0.008010
2026-02-09  274.619995 -0.011661
2026-02-10  273.679993 -0.003423
2026-02-11  275.500000  0.006650
2026-02-12  261.730011 -0.049982

Recent Price List for AAPL (Length: 22):
[260.8059387207031, 259.71697998046875, 257.9685974121094, 255.29112243652344, 246.46937561035156, 247.41848754882812, 248.1178436279297, 247.80812072753906, 255.17123413085938, 258.0285339355469, 256.2002868652344, 258.0385437011719, 259.2374267578125, 269.7575988769531, 269.22808837890625, 276.23150634765625, 275.6520690917969, 277.8599853515625, 274.6199951171875, 273.67999267578125, 275.5, 261.7300109863281]
