### Apple Data

In [23]:
import yfinance as yf
import pandas as pd

# Download Apple stock data from Yahoo Finance (last 5 years)
ticker = "AAPL"
data = yf.download(ticker, period="5y")

# Reset index to make 'Date' a column and ensure correct structure
data.reset_index(inplace=True)

# Save it to a CSV file
data.to_csv("AAPL_cleaned_data.csv", index=False)

# Show the first few rows of the cleaned data to confirm
print(data.head())


[*********************100%***********************]  1 of 1 completed

Price        Date      Close       High        Low       Open     Volume
Ticker                  AAPL       AAPL       AAPL       AAPL       AAPL
0      2020-03-05  71.085556  72.694512  70.719108  71.716516  187572800
1      2020-03-06  70.141525  70.575923  68.248632  68.435492  226176800
2      2020-03-09  64.593880  67.486610  63.824584  64.006593  286744800
3      2020-03-10  69.246040  69.512989  65.370456  67.256077  285290000
4      2020-03-11  66.841087  68.246201  65.974721  67.316744  255598800





### Prepared Apple Data

In [25]:
import pandas as pd
import numpy as np

# Load the cleaned data (make sure to replace the file name with your actual CSV path)
data = pd.read_csv("AAPL_cleaned_data.csv")

# Convert all columns to numeric where possible (this will coerce errors like 'AAPL' into NaN)
data['Close'] = pd.to_numeric(data['Close'], errors='coerce')
data['Open'] = pd.to_numeric(data['Open'], errors='coerce')
data['High'] = pd.to_numeric(data['High'], errors='coerce')
data['Low'] = pd.to_numeric(data['Low'], errors='coerce')
data['Volume'] = pd.to_numeric(data['Volume'], errors='coerce')

# Remove rows where any of the numeric columns have NaN values (caused by coercion)
data.dropna(subset=['Close', 'Open', 'High', 'Low', 'Volume'], inplace=True)

# 1. Add Moving Averages (SMA)
data['SMA_10'] = data['Close'].rolling(window=10).mean()
data['SMA_50'] = data['Close'].rolling(window=50).mean()
data['SMA_200'] = data['Close'].rolling(window=200).mean()

# 2. Add Exponential Moving Averages (EMA)
data['EMA_10'] = data['Close'].ewm(span=10, adjust=False).mean()
data['EMA_50'] = data['Close'].ewm(span=50, adjust=False).mean()

# 3. Add Relative Strength Index (RSI)
delta = data['Close'].diff()
gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()

rs = gain / loss
data['RSI'] = 100 - (100 / (1 + rs))

# 4. Add Future Price (Target/Label) - Here we're predicting the next day's closing price
data['Target'] = data['Close'].shift(-1)

# Drop rows with missing values due to rolling operations
data.dropna(inplace=True)

# Save the prepared data for machine learning
data.to_csv("AAPL_prepared_data.csv", index=False)

# Show the first few rows of the data
print(data.head())


           Date       Close        High         Low        Open       Volume  \
200  2020-12-16  124.842369  125.389364  123.621393  124.451663   98208600.0   
201  2020-12-17  125.711739  126.571311  125.067059  125.907092   94359800.0   
202  2020-12-18  123.719093  126.102442  123.191631  125.965693  192541500.0   
203  2020-12-21  125.252640  125.330784  120.583627  122.117173  121251600.0   
204  2020-12-22  128.817902  131.289157  126.639669  128.554167  168904800.0   

         SMA_10      SMA_50    SMA_200      EMA_10      EMA_50        RSI  \
200  120.946978  115.607111  94.827037  121.001448  115.924141  77.142749   
201  121.509605  115.877059  95.100167  121.857864  116.307968  77.485184   
202  121.940368  116.109299  95.368055  122.196269  116.598600  67.607518   
203  122.377966  116.333205  95.671349  122.751973  116.937975  64.120907   
204  123.110554  116.483518  95.969208  123.854869  117.403854  69.298180   

         Target  
200  125.711739  
201  123.719093  
20

### 100 most popular tickers

In [41]:
import yfinance as yf
import pandas as pd

# List of 100 popular S&P 500 stocks
tickers = [
    "AAPL", "MSFT", "GOOGL", "AMZN", "NVDA", "META", "TSLA", "BRK-B", "V", "JNJ",
    "UNH", "XOM", "JPM", "PG", "MA", "HD", "LLY", "CVX", "ABBV", "AVGO", "PFE", "COST", "PEP", "KO",
    "MRK", "NFLX", "TMO", "WMT", "DIS", "AMD", "BAC", "CSCO", "ABT", "ACN", "DHR", "NKE", "LIN",
    "INTC", "MCD", "TXN", "VZ", "HON", "CRM", "GS", "MS", "NEE", "UNP", "BMY", "SCHW", "RTX", "AMGN",
    "LOW", "C", "LMT", "CVS", "DE", "CAT", "ADBE", "IBM", "PYPL", "INTU", "MDT", "SBUX", "QCOM",
    "PLD", "T", "NOW", "SPGI", "ISRG", "VRTX", "BKNG", "BLK", "CHTR", "MO", "GILD", "TGT", "EL",
    "SYK", "MDLZ", "ZTS", "DUK", "SO", "PGR", "HUM", "CI", "MMC", "FIS", "CB", "ADI", "ICE", "PNC",
    "BDX", "CSX", "NSC", "ITW", "EQIX", "CME", "EW", "TRV", "SHW", "HCA"
]

# List to store processed data
all_data = []

for ticker in tickers:
    print(f"Processing {ticker}...")

    # Download stock data
    data = yf.download(ticker, period="5y")

    # Skip if no data
    if data.empty:
        print(f"❌ No data for {ticker}. Skipping.")
        continue

    # Reset index to keep 'Date' as a column
    data.reset_index(inplace=True)

    # Ensure columns are numeric
    for col in ['Close', 'Open', 'High', 'Low', 'Volume']:
        if col in data.columns:
            try:
                data[col] = pd.to_numeric(data[col], errors='coerce')
            except Exception as e:
                print(f"Error processing {col} for {ticker}: {e}")

    # Drop rows with missing values
    data.dropna(subset=['Close', 'Open', 'High', 'Low', 'Volume'], inplace=True)

    # Add Moving Averages
    data['SMA_10'] = data['Close'].rolling(window=10).mean()
    data['SMA_50'] = data['Close'].rolling(window=50).mean()
    data['SMA_200'] = data['Close'].rolling(window=200).mean()

    # Add Exponential Moving Averages
    data['EMA_10'] = data['Close'].ewm(span=10, adjust=False).mean()
    data['EMA_50'] = data['Close'].ewm(span=50, adjust=False).mean()

    # Add Relative Strength Index (RSI)
    delta = data['Close'].diff()
    gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    data['RSI'] = 100 - (100 / (1 + rs))

    # Add Future Price (Target)
    data['Target'] = data['Close'].shift(-1)

    # Drop rows with missing values
    data.dropna(inplace=True)

    # Add ticker column
    data['Ticker'] = ticker

    # Store the processed data
    all_data.append(data)

# Combine all stock data into one CSV
if all_data:
    final_data = pd.concat(all_data, ignore_index=True)
    final_data.to_csv("popular_stocks_prepared_data.csv", index=False)
    print("✅ Data saved to popular_stocks_prepared_data.csv")
else:
    print("❌ No valid data found for any ticker.")


Processing AAPL...


[*********************100%***********************]  1 of 1 completed

Error processing Close for AAPL: arg must be a list, tuple, 1-d array, or Series
Error processing Open for AAPL: arg must be a list, tuple, 1-d array, or Series
Error processing High for AAPL: arg must be a list, tuple, 1-d array, or Series
Error processing Low for AAPL: arg must be a list, tuple, 1-d array, or Series
Error processing Volume for AAPL: arg must be a list, tuple, 1-d array, or Series





KeyError: ['Close', 'Open', 'High', 'Low', 'Volume']