In [319]:
import yfinance as yf
import pandas as pd
import os
import warnings
import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import seaborn as sb
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor


# Root folder that holds one sub-folder of CSVs per artefact type ("metadata",
# "feature", "correlation", ...). Set the AI_TRADER_DATA_DIR environment variable
# to run the notebook on another machine; the original location is the fallback.
BASE_DIR = os.environ.get("AI_TRADER_DATA_DIR", "/Users/dannyyu/Desktop/AI_Trader/data")
# Symbols iterated by every per-ticker loop in this notebook.
TICKERS = ['AAPL', 'MSFT', 'GOOGL', 'AMZN', 'META', 'NVDA', 'TSLA', 'INTC', 'AMD', 'IBM']
# Date-range constants; no cell visible here reads them (presumably the
# download window used when the metadata CSVs were built -- TODO confirm).
START_DATE = "2022-01-01"
END_DATE = "2025-5-18"

In [12]:
"""
Returns the most recent available closing price on or before target_date
by checking historical data within the past `lookback_days`.
"""
def get_closest_price(ticker_obj, target_date, lookback_days=3):
    """Return the latest non-missing close in a short window ending at ``target_date``.

    Parameters
    ----------
    ticker_obj : object
        Must expose ``history(start=..., end=...)`` returning a DataFrame with a
        ``"Close"`` column, and a ``ticker`` attribute used in log messages.
    target_date : str | datetime.date | datetime.datetime | pandas.Timestamp
        Anything ``pandas.Timestamp`` can parse.
    lookback_days : int, default 3
        How many calendar days before ``target_date`` the query window starts.

    Returns
    -------
    The last non-NaN ``Close`` value in the returned history, or ``None`` when
    the window holds no usable price or the lookup raised.
    """
    try:
        # Normalise first: a plain string or datetime.date would otherwise break
        # the Timedelta arithmetic and the `.date()` calls used for logging.
        target_ts = pd.Timestamp(target_date)
        start_date = target_ts - pd.Timedelta(days=lookback_days)
        end_date = target_ts + pd.Timedelta(days=1)

        hist = ticker_obj.history(start=start_date, end=end_date)

        # Skip rows whose Close is NaN so a missing value is never returned as a price.
        closes = None if hist.empty else hist["Close"].dropna()
        if closes is None or closes.empty:
            print(f"No price data found for {ticker_obj.ticker} between {start_date.date()} and {end_date.date()}")
            return None

        return closes.iloc[-1]

    except Exception as e:
        # Format defensively: target_date may be the very value that failed to
        # parse, so the handler must not assume it has a .date() method.
        when = target_date.date() if hasattr(target_date, "date") else target_date
        print(f"Error retrieving price for {ticker_obj.ticker} near {when}: {e}")
        return None

def read_CSV_File(ticker,folderName):
    """Read ``<BASE_DIR>/<folderName>/<ticker>_<folderName>.csv`` into a DataFrame.

    The first CSV column is used as the index.
    """
    file_name = ticker + "_" + folderName + ".csv"
    csv_path = "/".join([BASE_DIR, folderName, file_name])
    return pd.read_csv(csv_path, index_col=0, low_memory=False)

In [182]:
def lag_feature(table, lag, column="Close"):
    """Return ``table[column]`` shifted down by ``lag`` rows.

    The shift counts rows (entries), not calendar days, so gaps in the index
    are not accounted for. The caller decides which column to store the
    returned Series in; ``table`` itself is not modified.
    """
    return table[column].shift(lag)
    
def simple_moving_average(table,window_in,column="Close"):
    """Rolling arithmetic mean of ``table[column]`` over ``window_in`` rows."""
    series = table[column]
    return series.rolling(window=window_in).mean()

def rolling_median(table,window_in,column="Close"):
    """Rolling median of ``table[column]`` over ``window_in`` rows."""
    series = table[column]
    return series.rolling(window=window_in).median()
    
def exponential_moving_average(table,window,column="Close"):
    """Exponentially weighted mean of ``table[column]`` with ``span=window``.

    Computed with ``adjust=False``, i.e. the recursive form.
    """
    weighted = table[column].ewm(span=window, adjust=False)
    return weighted.mean()

def relative_strength_index(table,window=14,column="Close"):
    """Relative Strength Index built from simple rolling means of gains and losses.

    Parameters
    ----------
    table : DataFrame containing ``column``.
    window : int, default 14
        Number of row-to-row changes averaged in each rolling window.
    column : str, default "Close"
        Price column to use; mirrors the ``column`` argument of the other
        indicator helpers in this notebook.

    Returns
    -------
    Series of RSI values in [0, 100]. The first row of ``table`` is absent from
    the result (it has no previous row to diff against) and the first
    ``window - 1`` remaining entries are NaN. A window with no price movement
    at all yields NaN (0 / 0).
    """
    # diff() leaves NaN in the first row; drop it on a new Series rather than in place.
    change = table[column].diff().dropna()

    # Split the changes into gains (>= 0) and losses (<= 0).
    gains = change.clip(lower=0)
    losses = change.clip(upper=0)

    avg_up = gains.rolling(window).mean()
    avg_down = losses.rolling(window).mean().abs()
    return 100 * avg_up / (avg_up + avg_down)


def plot_RSI(table,window=14):
    """Plot the close price and the matching RSI series in two stacked panels.

    ``table`` must contain ``"Close"``, ``"Company"`` and an ``RSI_<window>``
    column; a missing column raises ``KeyError``. Dashed lines at 30 and 70
    mark the oversold and overbought thresholds.
    """
    # Apply the style and figure size to this figure only. Setting them through
    # plt.style.use / rcParams would silently restyle every later plot.
    with plt.style.context('fivethirtyeight'):
        plt.figure(figsize=(20, 20))
        ax1 = plt.subplot2grid((10,1), (0,0), rowspan = 4, colspan = 1)
        ax2 = plt.subplot2grid((10,1), (5,0), rowspan = 4, colspan = 1)
        ax1.plot(table['Close'], linewidth=2)
        # Second row as before, falling back to the first row for a one-row table.
        company = table["Company"].iloc[min(1, len(table) - 1)]
        ax1.set_title(f'{company} Close Price')
        ax2.set_title('Relative Strength Index')
        ax2.plot(table[f"RSI_{window}"], color='orange', linewidth=1)
        # Oversold threshold
        ax2.axhline(30, linestyle='--', linewidth=1.5, color='green')
        # Overbought threshold
        ax2.axhline(70, linestyle='--', linewidth=1.5, color='red')
        plt.show()


def bollinger_band(table,window=15,column="Close",k=2):
    """Compute Bollinger bands for ``table[column]``.

    Parameters
    ----------
    table : DataFrame containing ``column``.
    window : int, default 15
        Rolling window length in rows, used for both mean and standard deviation.
    column : str, default "Close"
        Series the bands are built on.
    k : number, default 2
        Number of rolling standard deviations between the middle and outer bands.

    Returns
    -------
    DataFrame indexed like ``table`` with columns ``Bollinger_Middle_<window>``,
    ``Bollinger_Upper_<window>`` and ``Bollinger_Lower_<window>``.
    """
    # Mean and std come from the same rolling view, so the bands are always
    # centred on `column`. The middle band used to be computed on "Close"
    # regardless of the `column` argument.
    rolling = table[column].rolling(window=window)
    sma = rolling.mean()
    rolling_std = rolling.std()

    bollinger = pd.DataFrame({f"Bollinger_Middle_{window}": sma})
    bollinger[f"Bollinger_Upper_{window}"] = sma + (k * rolling_std)
    bollinger[f"Bollinger_Lower_{window}"] = sma - (k * rolling_std)
    return bollinger
    
def plot_bollinger_band(table,window=15):
    """Draw the close price together with its Bollinger bands.

    Expects ``table`` to hold ``"Close"`` plus the three
    ``Bollinger_{Upper,Lower,Middle}_<window>`` columns. X tick marks are
    suppressed; the index is still used for the x positions.
    """
    plt.figure(figsize=(14, 7))

    x_values = table.index
    upper_name = f"Bollinger_Upper_{window}"
    lower_name = f"Bollinger_Lower_{window}"
    middle_name = f"Bollinger_Middle_{window}"

    # (column, legend label, colour, line width), listed in draw order
    line_specs = (
        ('Close', 'Stock Price', 'blue', 1.5),
        (upper_name, 'Upper Band', 'green', 1),
        (lower_name, 'Lower Band', 'green', 1),
        (middle_name, 'Rolling Mean', 'red', 1.5),
    )
    for col_name, legend_label, line_color, line_width in line_specs:
        plt.plot(x_values, table[col_name], label=legend_label, color=line_color, linewidth=line_width)

    # Light grey fill between the lower and upper band
    plt.fill_between(x_values, table[lower_name], table[upper_name], color='gray', alpha=0.2)

    plt.title(f'Bollinger Bands (Window = {window})')
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.legend()
    plt.xticks([], [])
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    
def moving_average_convergence_divergence(table, short_window=12, long_window=26, signal_window=9, column='Close'):
    """Compute MACD line, signal line and histogram for ``table[column]``.

    Parameters
    ----------
    table : DataFrame containing ``column``.
    short_window, long_window : int
        Spans of the fast and slow exponential moving averages.
    signal_window : int
        Span of the EMA applied to the MACD line to obtain the signal line.
    column : str, default 'Close'
        Price series to analyse.

    Returns
    -------
    DataFrame indexed like ``table`` with columns ``MACD_Line``,
    ``MACD_Signal`` and ``MACD_Histogram`` (line minus signal).
    """
    # Use the requested column. Both EMAs used to be computed on "Close"
    # regardless of the `column` argument.
    prices = table[column]
    short_ema = prices.ewm(span=short_window, adjust=False).mean()
    long_ema = prices.ewm(span=long_window, adjust=False).mean()

    macd = pd.DataFrame({"MACD_Line": short_ema - long_ema})
    macd['MACD_Signal'] = macd['MACD_Line'].ewm(span=signal_window, adjust=False).mean()
    macd['MACD_Histogram'] = macd['MACD_Line'] - macd['MACD_Signal']
    return macd

def plot_MACD(table):
    """Plot the MACD line, its signal line and the histogram bars.

    ``table`` must provide ``MACD_Line``, ``MACD_Signal`` and
    ``MACD_Histogram`` columns. X tick marks are hidden.
    """
    plt.figure(figsize=(14, 7))
    x_values = table.index

    # The two curves: (column, legend label, colour)
    for col_name, legend_label, line_color in (
        ('MACD_Line', 'MACD', 'blue'),
        ('MACD_Signal', 'Signal Line', 'red'),
    ):
        plt.plot(x_values, table[col_name], label=legend_label, color=line_color, linewidth=1.5)

    # Histogram drawn as translucent grey bars of unit width
    plt.bar(x_values, table['MACD_Histogram'], label='Histogram', color='gray', alpha=0.4, width=1)

    plt.title('MACD Indicator')
    plt.xlabel('Date')
    plt.ylabel('MACD')
    plt.legend()
    plt.grid(True)
    plt.xticks([], [])
    plt.tight_layout()
    plt.show()


def volatility(table, window=20, column="Close"):
    """Rolling standard deviation of the row-to-row percentage change of ``table[column]``."""
    pct_returns = table[column].pct_change()
    return pct_returns.rolling(window=window).std()

# Ad-hoc load of AAPL's "metadata" CSV. `aapl` is not referenced by any other
# cell visible in this notebook (presumably kept for interactive inspection).
aapl=read_CSV_File("AAPL","metadata")


In [217]:
# For every ticker: load its metadata table, append the technical-indicator
# columns (in a fixed order) and save the result under the "feature" folder.
for ticker in TICKERS:
    data = read_CSV_File(ticker, "metadata")

    # Row-lagged closes
    for lag in (1, 3, 5):
        data[f"Lag_{lag}"] = lag_feature(data, lag)

    # Simple moving averages, then the rolling median
    for span in (15, 30):
        data[f"SMA_{span}"] = simple_moving_average(data, span)
    data["Rolling_Median_15"] = rolling_median(data, 15)

    # Exponential moving averages
    for span in (15, 30):
        data[f"EMA_{span}"] = exponential_moving_average(data, span)

    # Relative strength index
    for span in (15, 30):
        data[f"RSI_{span}"] = relative_strength_index(data, span)

    # Bollinger bands: copy the three band columns over by name
    bollinger = bollinger_band(data, window=15)
    for band_col in ("Bollinger_Middle_15", "Bollinger_Upper_15", "Bollinger_Lower_15"):
        data[band_col] = bollinger[band_col]

    # MACD with the helper's default windows
    macd = moving_average_convergence_divergence(data)
    for macd_col in ("MACD_Line", "MACD_Signal", "MACD_Histogram"):
        data[macd_col] = macd[macd_col]

    data["Volatility_20"] = volatility(data, 20)

    data.to_csv(os.path.join(BASE_DIR, "feature", f"{ticker}_feature.csv"), index=True)



    


In [397]:
# Average the per-ticker correlation matrices into a single master matrix.
correlation_matrices = []
for ticker in TICKERS:
    data = read_CSV_File(ticker, "correlation")
    correlation_matrices.append(data)

# The first ticker's row labels fix the ordering of both axes for every matrix.
base_index = correlation_matrices[0].index
correlation_matrices = [
    matrix.loc[base_index, base_index] for matrix in correlation_matrices
]

# Stack into one 3-D array (ticker axis first) and average across tickers.
# np.mean propagates NaN: a cell that is NaN for any ticker is NaN in the result.
stacked = np.stack([matrix.to_numpy() for matrix in correlation_matrices])
avg_matrix = stacked.mean(axis=0)

# Re-attach the labels and save alongside the per-ticker files.
avg_corr = pd.DataFrame(avg_matrix, index=base_index, columns=base_index)
avg_corr.to_csv(os.path.join(BASE_DIR, "correlation", "MASTER_correlation.csv"), index=True)

In [301]:
# Columns that take part in the pairwise correlation (identical for every ticker).
selected = ['Close', 'Lag_1', 'Lag_3', 'Lag_5',
   'SMA_15', 'SMA_30', 'Rolling_Median_15', 'EMA_15', 'EMA_30', 'RSI_15',
   'RSI_30', 'Bollinger_Middle_15', 'Bollinger_Upper_15',
   'Bollinger_Lower_15', 'MACD_Line', 'MACD_Signal', 'MACD_Histogram',
   'Volatility_20','DilutedEPS', 'PE', 'Revenue', 'CashFlow', 'EBITDA',
   'GrossProfit', 'OperatingMargin', 'ROE', 'DebtToEquity', 'Revenue_perShare', 'CashFlow_perShare',
   'EBITDA_perShare', 'GrossProfit_perShare']

for ticker in TICKERS:
    data = read_CSV_File(ticker, "feature")
    correlation_matrix = data[selected].corr()
    # True on and below the diagonal; those cells are blanked to NaN, so only
    # the strict upper triangle of the matrix is written out.
    mask = np.tril(np.ones(correlation_matrix.shape, dtype=bool))
    correlation_matrix = correlation_matrix.mask(mask)
    out_path = os.path.join(BASE_DIR, "correlation", f"{ticker}_correlation.csv")
    correlation_matrix.to_csv(out_path, index=True)
# Disabled heatmap code kept as a bare string literal (the cell echoes it as its
# output). NOTE: the mask above removes the lower triangle, so the
# "Lower Triangle Only" title would be inaccurate if this were re-enabled.
"""
correlation_matrix=sb.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True)
plt.title("Feature Correlation Matrix (Lower Triangle Only)")
plt.tight_layout()
plt.show()  """

'\ncorrelation_matrix=sb.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", square=True)\nplt.title("Feature Correlation Matrix (Lower Triangle Only)")\nplt.tight_layout()\nplt.show()  '

In [365]:
# Rank the non-price columns of each ticker's feature table by how much a random
# forest relies on them to fit Close, and save one ranking per ticker.
for ticker in TICKERS:
    data=read_CSV_File(ticker,"feature")
    # Remove the identifier, the other raw price columns, every engineered
    # price feature and Year; whatever remains (besides Close) is a regressor.
    data=data.drop(columns=['Company',"High","Low","Open",'Lag_1',
           'Lag_3', 'Lag_5', 'SMA_15', 'SMA_30', 'Rolling_Median_15', 'EMA_15',
           'EMA_30', 'RSI_15', 'RSI_30', 'Bollinger_Middle_15',
           'Bollinger_Upper_15', 'Bollinger_Lower_15', 'MACD_Line', 'MACD_Signal',
           'MACD_Histogram', 'Volatility_20',"Year"])
    y = data['Close']
    X = data.drop(columns=['Close'])
    # Fixed seed: an unseeded forest gives different importances on every run,
    # so the saved rankings could not be reproduced.
    model = RandomForestRegressor(random_state=42)
    model.fit(X, y)
    importances = model.feature_importances_
    feature_importance = pd.DataFrame({
        'Indicator': X.columns,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    # index=True keeps the pre-sort row position as the first CSV column, which
    # read_CSV_File later consumes as the index.
    feature_importance.to_csv(os.path.join(BASE_DIR, "indicator_importance", f"{ticker}_indicator_importance.csv"), index=True)



In [387]:
# Merge the per-ticker indicator rankings into one table with an
# Importance_<TICKER> column per ticker and their row-wise average.
combined = None
for ticker in TICKERS:
    data = read_CSV_File(ticker, "indicator_importance")
    # Name the column after its ticker before merging, so labels never depend
    # on merge suffixes, on TICKERS[0] being AAPL, or on the first file being
    # non-empty.
    data = data.rename(columns={"Importance": f"Importance_{ticker}"})
    if combined is None:
        combined = data
    else:
        combined = combined.merge(data, on="Indicator", how="outer")

# Average over the importance columns only (NaNs from the outer merge are skipped).
importance_columns = [f"Importance_{ticker}" for ticker in TICKERS]
combined['Avg_Importance'] = combined[importance_columns].mean(axis=1)
combined = combined.sort_values("Avg_Importance", ascending=False)

# Put the average first, followed by the remaining columns in their existing order.
remaining_columns = [col for col in combined.columns if col != 'Avg_Importance']
combined = combined[['Avg_Importance'] + remaining_columns]
combined.to_csv(os.path.join(BASE_DIR, "indicator_importance", "MASTER_indicator_importance.csv"), index=True)

In [361]:
# Rank the engineered price features of each ticker by how much a random forest
# relies on them to fit Close, and save one ranking per ticker.
for ticker in TICKERS:
    data=read_CSV_File(ticker,"feature")
    # Remove the identifier, raw market columns, the fundamentals and Lag_1;
    # whatever remains (besides Close) is a regressor.
    data=data.drop(columns=['Company','Volume','High', 'Low', 'Open', 'Dividends', 'Stock Splits', 'Year', 'DilutedEPS',
           'PE', 'Revenue', 'CashFlow', 'EBITDA', 'GrossProfit', 'OperatingMargin',
           'ROE', 'DebtToEquity', 'HasDividend', 'Revenue_perShare',
           'CashFlow_perShare', 'EBITDA_perShare', 'GrossProfit_perShare','Lag_1'])
    y = data['Close']
    X = data.drop(columns=['Close'])
    # Fixed seed: an unseeded forest gives different importances on every run,
    # so the saved rankings could not be reproduced.
    model = RandomForestRegressor(random_state=42)
    model.fit(X, y)
    importances = model.feature_importances_
    feature_importance = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)
    # index=True keeps the pre-sort row position as the first CSV column, which
    # read_CSV_File later consumes as the index.
    feature_importance.to_csv(os.path.join(BASE_DIR, "feature_importance", f"{ticker}_feature_importance.csv"), index=True)

In [391]:
# Merge the per-ticker feature rankings into one table with an
# Importance_<TICKER> column per ticker and their row-wise average.
combined = None
for ticker in TICKERS:
    data = read_CSV_File(ticker, "feature_importance")
    # Name the column after its ticker before merging, so labels never depend
    # on merge suffixes, on TICKERS[0] being AAPL, or on the first file being
    # non-empty.
    data = data.rename(columns={"Importance": f"Importance_{ticker}"})
    if combined is None:
        combined = data
    else:
        combined = combined.merge(data, on="Feature", how="outer")

# Average over the importance columns only (NaNs from the outer merge are skipped).
importance_columns = [f"Importance_{ticker}" for ticker in TICKERS]
combined['Avg_Importance'] = combined[importance_columns].mean(axis=1)
combined = combined.sort_values("Avg_Importance", ascending=False)

# Put the average first, followed by the remaining columns in their existing order.
remaining_columns = [col for col in combined.columns if col != 'Avg_Importance']
combined = combined[['Avg_Importance'] + remaining_columns]
combined.to_csv(os.path.join(BASE_DIR, "feature_importance", "MASTER_feature_importance.csv"), index=True)