In [1]:
import yfinance as yf
import logging
import os
import sys
from datetime import datetime
import pandas as pd
import numpy as np

In [2]:
!pip install yfinance



In [3]:
# Track daily price data for Apple, Microsoft, Google, Tesla, IBM, Oracle and Amazon.
tickers = ['AAPL', 'MSFT', 'GOOGL', 'TSLA', 'IBM', 'ORCL', 'AMZN']

# group_by='ticker' returns a wide frame with one column group per ticker.
# auto_adjust is passed explicitly so the prices do not depend on whichever
# default the installed yfinance version happens to use.
# NOTE(review): True is assumed to match the values recorded in this notebook - confirm.
df = yf.download(
    tickers,
    start="2024-01-01",
    end=datetime.today(),
    group_by='ticker',
    auto_adjust=True,
)

# Flatten to long format: one frame per ticker, tagged with the ticker it
# belongs to, with Date moved from the index into a regular column.
all_dfs = [
    df[ticker].assign(Ticker=ticker).reset_index()
    for ticker in tickers
]
combined_df = pd.concat(all_dfs, ignore_index=True)

# Keep only the columns used downstream, in a fixed order.
columns_order = ['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume']
combined_df = combined_df[columns_order]

  df = yf.download(tickers, start="2024-01-01", end=datetime.today(), group_by='ticker')
[*********************100%***********************]  7 of 7 completed


In [4]:
# Preview the combined long-format frame (one row per Date/Ticker pair).
combined_df

Price,Date,Ticker,Open,High,Low,Close,Volume
0,2024-01-02,AAPL,185.399096,186.677036,182.169601,183.903229,82488700
1,2024-01-03,AAPL,182.496496,184.140970,181.713879,182.526215,58414500
2,2024-01-04,AAPL,180.445860,181.377068,179.187752,180.208115,71983600
3,2024-01-05,AAPL,180.287390,181.050175,178.484409,179.484955,62379700
4,2024-01-08,AAPL,180.386422,183.863594,179.801946,183.823959,59144500
...,...,...,...,...,...,...,...
3677,2026-01-30,AMZN,239.889999,243.320007,237.639999,239.300003,46585000
3678,2026-02-02,AMZN,238.309998,245.630005,238.169998,242.960007,37546100
3679,2026-02-03,AMZN,244.979996,246.350006,235.449997,238.619995,53831300
3680,2026-02-04,AMZN,238.860001,238.860001,231.820007,232.990005,51299900


In [5]:
# pre-processing the data

def data_clean_and_validation(df):
  """Clean a long-format price frame: drop bad rows, dedupe, sort, fill gaps.

  Args:
    df: DataFrame with at least 'Date', 'Ticker', 'Open', 'High', 'Low' and
      'Close' columns, one row per (Date, Ticker).

  Returns:
    A new DataFrame sorted by Date then Ticker with a fresh 0..n-1 index.
    Rows containing a negative price are removed, duplicate (Date, Ticker)
    rows keep their first occurrence, and missing values are filled from the
    same ticker's previous row. Rows whose prices are still missing after
    that (no earlier row of that ticker to copy from) are dropped.
  """
  price_cols = ['Open', 'High', 'Low', 'Close']

  # check for anomalies: remove rows with a negative price.
  # Testing `< 0` (instead of requiring `>= 0`) keeps rows whose prices are
  # NaN, so they reach the fill step below rather than being silently
  # discarded here (any comparison with NaN is False).
  has_negative = (df[price_cols] < 0).any(axis=1)
  df = df[~has_negative]

  # check for duplicates (duplicate timestamps for the same stock)
  df_unique = df.drop_duplicates(subset=['Date', 'Ticker'], keep='first')

  # sort by timestamp (and ticker) ascending
  df_unique = df_unique.sort_values(by=['Date', 'Ticker'])

  # fill null values with previous day data.
  # The frame is ordered by Date first, so the row directly above usually
  # belongs to a different ticker; grouping by Ticker makes each gap take its
  # value from the same stock's previous row instead.
  fill_cols = [col for col in df_unique.columns if col not in ('Date', 'Ticker')]
  df_unique[fill_cols] = df_unique.groupby('Ticker')[fill_cols].ffill()

  # A ticker's leading rows have nothing to fill from; never return NaN prices.
  df_unique = df_unique.dropna(subset=price_cols)

  # reset index
  df_unique = df_unique.reset_index(drop=True)

  return df_unique


In [6]:
# Run the cleaning step on the combined frame and preview the result.
# Note: this rebinds `df`, which until now held the raw yfinance download.
df = data_clean_and_validation(combined_df)
df

Price,Date,Ticker,Open,High,Low,Close,Volume
0,2024-01-02,AAPL,185.399096,186.677036,182.169601,183.903229,82488700
1,2024-01-02,AMZN,151.539993,152.380005,148.389999,149.929993,47339400
2,2024-01-02,GOOGL,137.510990,138.404235,135.456506,137.133835,23711200
3,2024-01-02,IBM,153.190998,153.623759,150.961299,151.939728,3825000
4,2024-01-02,MSFT,368.367665,370.377704,361.381827,365.421600,25258600
...,...,...,...,...,...,...,...
3677,2026-02-05,GOOGL,312.220001,332.690002,306.459991,331.250000,87810100
3678,2026-02-05,IBM,286.100006,291.809998,285.100006,289.890015,5463900
3679,2026-02-05,MSFT,407.440002,408.299988,392.320007,393.670013,65999900
3680,2026-02-05,ORCL,145.600006,146.550003,135.250000,136.479996,42539100


In [7]:
# feature engineering (for downstream algorithms)
def feature_engineering(df):
  """Append per-ticker technical-indicator columns to a long-format price frame.

  Each ticker's rows are handled on their own, ordered by Date, so rolling,
  lagged and exponentially weighted values never mix different tickers.

  Args:
    df: DataFrame with 'Date' (datetime-like, required by `.dt`), 'Ticker',
      'High', 'Low', 'Close' and 'Volume' columns.

  Returns:
    A new DataFrame sorted by Date then Ticker with a fresh index and the
    added columns SMA_20, SMA_50, EMA_12, EMA_26, MA_Signal, EMA_Signal,
    daily_return, Close_lag1, volume_ma_20, volume_ratio, price_range,
    volatility_20 and day_of_week.
  """

  def _add_indicators(ticker_rows):
    # Builds every feature column for a single ticker's date-ordered rows.
    close = ticker_rows['Close']
    volume = ticker_rows['Volume']

    # Simple moving averages: short (20 rows) and long (50 rows) horizon.
    sma_short = close.rolling(window=20).mean()
    sma_long = close.rolling(window=50).mean()

    # Exponential moving averages react faster to recent prices than SMAs.
    ema_fast = close.ewm(span=12, adjust=False).mean()
    ema_slow = close.ewm(span=26, adjust=False).mean()

    avg_volume = volume.rolling(20).mean()

    return ticker_rows.assign(
        SMA_20=sma_short,
        SMA_50=sma_long,
        EMA_12=ema_fast,
        EMA_26=ema_slow,
        # Buy (+1) when the short average is above the long one, otherwise
        # sell (-1). Comparisons involving NaN are False, so rows without a
        # full window also come out as -1.
        MA_Signal=np.where(sma_short > sma_long, 1, -1),
        EMA_Signal=np.where(ema_fast > ema_slow, 1, -1),
        # Features intended for a future LSTM model.
        daily_return=close.pct_change(),
        Close_lag1=close.shift(1),          # previous row's close
        volume_ma_20=avg_volume,
        volume_ratio=volume / avg_volume,   # volume relative to its 20-row mean
        price_range=ticker_rows['High'] - ticker_rows['Low'],  # intraday range
        volatility_20=close.rolling(20).std(),
        day_of_week=ticker_rows['Date'].dt.dayofweek,  # captures weekly patterns
    )

  per_ticker = [
      _add_indicators(df[df['Ticker'] == symbol].sort_values(by=['Date']))
      for symbol in df['Ticker'].unique()
  ]

  # Stitch the per-ticker frames back together in Date/Ticker order.
  return (
      pd.concat(per_ticker, ignore_index=True)
      .sort_values(['Date', 'Ticker'])
      .reset_index(drop=True)
  )


In [8]:
# Add the engineered feature columns to the cleaned frame and preview the result.
# Note: `df` is rebound to the feature-enriched frame from here on.
df = feature_engineering(df)
df

Price,Date,Ticker,Open,High,Low,Close,Volume,SMA_20,SMA_50,EMA_12,EMA_26,MA_Signal,EMA_Signal,daily_return,Close_lag1,volume_ma_20,volume_ratio,price_range,volatility_20,day_of_week
0,2024-01-02,AAPL,185.399096,186.677036,182.169601,183.903229,82488700,,,183.903229,183.903229,-1,-1,,,,,4.507435,,1
1,2024-01-02,AMZN,151.539993,152.380005,148.389999,149.929993,47339400,,,149.929993,149.929993,-1,-1,,,,,3.990005,,1
2,2024-01-02,GOOGL,137.510990,138.404235,135.456506,137.133835,23711200,,,137.133835,137.133835,-1,-1,,,,,2.947729,,1
3,2024-01-02,IBM,153.190998,153.623759,150.961299,151.939728,3825000,,,151.939728,151.939728,-1,-1,,,,,2.662461,,1
4,2024-01-02,MSFT,368.367665,370.377704,361.381827,365.421600,25258600,,,365.421600,365.421600,-1,-1,,,,,8.995877,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3677,2026-02-05,GOOGL,312.220001,332.690002,306.459991,331.250000,87810100,332.853500,321.611461,334.146892,328.827417,1,1,-0.005375,333.040009,36000310.0,2.439148,26.230011,5.151094,3
3678,2026-02-05,IBM,286.100006,291.809998,285.100006,289.890015,5463900,299.960002,302.407601,297.863121,299.428793,-1,-1,0.002906,289.049988,5378130.0,1.015948,6.709991,7.796245,3
3679,2026-02-05,MSFT,407.440002,408.299988,392.320007,393.670013,65999900,451.778999,470.337001,434.229368,451.664163,-1,-1,-0.049542,414.190002,39964755.0,1.651453,15.979980,26.124629,3
3680,2026-02-05,ORCL,145.600006,146.550003,135.250000,136.479996,42539100,176.996999,189.992124,163.194789,176.227550,-1,-1,-0.069476,146.669998,29036680.0,1.465013,11.300003,18.198948,3


In [9]:
# Persist the cleaned, feature-enriched frame to CSV in the current working directory.
# NOTE(review): with the default index=True the row index is written as an extra
# unnamed first column; pass index=False if downstream readers do not expect it - confirm.
df.to_csv('cleaned_yfinance_data.csv')

In [10]:
# Git setup: set the author name and e-mail in git's global (user-level) config.
# NOTE(review): --global changes the configuration for every repository on this
# machine/runtime, not only the lab repo.
!git config --global user.name "Riya Berry"
!git config --global user.email "riyaberr@usc.edu"

In [11]:
# Clone repo
# NOTE(review): the %cd below enters /content/560_labs_no_plan before cloning, so
# git targets a nested 560_labs_no_plan directory inside the existing checkout;
# the recorded run failed because that destination already existed. A later
# cell removes the checkout and re-clones from /content instead.
%cd /content/560_labs_no_plan
!git clone https://github.com/18rberry/560_labs_no_plan.git

/content/560_labs_no_plan
fatal: destination path '560_labs_no_plan' already exists and is not an empty directory.


In [14]:
# Stage everything under Lab4/ (relative to the current directory) and commit it.
# This only records the commit locally; no push is issued in this cell.
!git add Lab4/*
!git commit -m "first commit for lab4!"

On branch main
Your branch is ahead of 'origin/main' by 1 commit.
  (use "git push" to publish your local commits)

nothing to commit, working tree clean


In [16]:
# Step out of the repository and delete the local checkout so it can be cloned
# again from scratch. rm -rf is irreversible: any work in 560_labs_no_plan that
# was not pushed is lost.
%cd /content
!rm -rf 560_labs_no_plan

/content


In [17]:
# Clone the repository into /content (not from inside an existing checkout),
# which creates /content/560_labs_no_plan.
%cd /content
!git clone https://github.com/18rberry/560_labs_no_plan.git

/content
Cloning into '560_labs_no_plan'...
remote: Enumerating objects: 102, done.[K
remote: Counting objects: 100% (102/102), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 102 (delta 31), reused 72 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (102/102), 1.32 MiB | 8.22 MiB/s, done.
Resolving deltas: 100% (31/31), done.
