<a href="https://colab.research.google.com/github/AdamWittmann/StockModel/blob/main/FinancialOutlookModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Machine Learning Study Project
## Using various libraries, I will create a model to forecast stocks using concepts learned in class

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yfinance as yf
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score


In [None]:
ticker1 = "RR"
ticker2 = "VOO"

start= "2021-9-01"

# NOTE: despite its name, `stock2` holds the price history of `ticker1`;
# `ticker2` is defined here but not downloaded in this cell.
# `start` is passed by keyword and `auto_adjust` is stated explicitly instead
# of relying on the library default (the saved output shows a warning pointing
# at this call -- presumably yfinance's notice about the changed
# `auto_adjust` default).
stock2 = yf.download(ticker1, start=start, auto_adjust=True)

stock2.head(20)

  stock2 = yf.download(ticker1, start)
[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,RR,RR,RR,RR,RR
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2023-11-17,5.25,6.329,4.5,5.0,2056300
2023-11-20,5.66,6.0,4.789,4.96,1229300
2023-11-21,5.55,6.54,5.5,5.84,583600
2023-11-22,5.21,5.6,5.21,5.37,225400
2023-11-24,4.62,5.22,4.5,5.09,130200
2023-11-27,4.56,4.933,4.54,4.56,210200
2023-11-28,4.66,4.96,4.54,4.75,164000
2023-11-29,4.19,4.76,4.19,4.74,142900
2023-11-30,4.27,4.46,4.15,4.21,66800
2023-12-01,4.48,4.75,4.32,4.37,80400


##Inspecting shape, null values, and column headers

In [None]:
# Dimensions as (rows, columns), followed by per-column dtypes and non-null counts.
n_rows, n_cols = stock2.shape
print((n_rows, n_cols))

stock2.info()

(470, 5)
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 470 entries, 2023-11-17 to 2025-10-03
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   (Close, RR)   470 non-null    float64
 1   (High, RR)    470 non-null    float64
 2   (Low, RR)     470 non-null    float64
 3   (Open, RR)    470 non-null    float64
 4   (Volume, RR)  470 non-null    int64  
dtypes: float64(4), int64(1)
memory usage: 22.0 KB


# Create features for the model to use as indicators
## Moving averages, daily return, delta volume, volatility, price change (high - low)
## Using pandas methods like rolling(window=x) and mean() to smooth out value changes

In [None]:
# Work on a copy so the downloaded frame in `stock2` is not modified.
df = stock2.copy()
close = df["Close"]

# Trend: 5-day and 20-day simple moving averages of the close.
for window in (5, 20):
    df[f"MA_{window}"] = close.rolling(window=window).mean()

# Momentum: day-over-day percentage change of the close.
df["Daily_Return"] = close.pct_change()

# Lagged feature: the previous row's close.
df["lagged"] = close.shift(1)

# Volatility: rolling 10-row standard deviation of the daily returns.
df["Volatility"] = df["Daily_Return"].rolling(window=10).std()

# Intraday range: high minus low.
df["High_Low_Rng"] = df["High"] - df["Low"]


## Now that we have our features, null values are present
## Clean the data (2 options: impute / dropna)
### We could impute the missing data here, but since imputed values would not be real data, I'm going to drop the affected rows using pandas' `dropna`

In [None]:
# The first rows contain NaNs because the moving averages, lagged close, daily
# return and volatility all depend on earlier rows that do not exist yet.
df.head()

Price,Close,High,Low,Open,Volume,MA_5,MA_20,Daily_Return,lagged,Volatility,High_Low_Rng
Ticker,RR,RR,RR,RR,RR,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
2023-11-17,5.25,6.329,4.5,5.0,2056300,,,,,,1.829
2023-11-20,5.66,6.0,4.789,4.96,1229300,,,0.078095,5.25,,1.211
2023-11-21,5.55,6.54,5.5,5.84,583600,,,-0.019435,5.66,,1.04
2023-11-22,5.21,5.6,5.21,5.37,225400,,,-0.061261,5.55,,0.39
2023-11-24,4.62,5.22,4.5,5.09,130200,5.258,,-0.113244,5.21,,0.72


In [None]:
# Now look at the tail of the data: no missing values, since every rolling
# window has enough preceding rows by this point.
df.tail()

Price,Close,High,Low,Open,Volume,MA_5,MA_20,Daily_Return,lagged,Volatility,High_Low_Rng
Ticker,RR,RR,RR,RR,RR,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
2025-09-29,4.1,4.59,4.06,4.46,55185500,4.334,3.69575,-0.02381,4.2,0.108721,0.53
2025-09-30,4.29,4.31,3.85,4.02,51088100,4.218,3.76575,0.046341,4.1,0.109129,0.46
2025-10-01,4.76,4.81,4.13,4.2,48283000,4.288,3.86975,0.109557,4.29,0.113146,0.68
2025-10-02,5.16,5.26,4.73,4.88,51263300,4.502,4.00075,0.084034,4.76,0.109413,0.53
2025-10-03,6.18,6.3,5.02,5.305,102059100,4.898,4.18325,0.197674,5.16,0.122281,1.28


##Drop missing values

In [None]:
# Drop every row that contains at least one NaN (the warm-up rows of the
# rolling / lagged features).
df_clean = df.dropna()

# Only the last expression of a cell is displayed, so the summary statistics
# are the single output here; `count` should be identical for every column.
df_clean.describe()

Price,Close,High,Low,Open,Volume,MA_5,MA_20,Daily_Return,lagged,Volatility,High_Low_Rng
Ticker,RR,RR,RR,RR,RR,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
count,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0
mean,2.074144,2.232213,1.934634,2.082984,11080900.0,2.071419,2.079212,0.007646,2.070929,0.091244,0.297579
std,1.528297,1.676019,1.361791,1.526501,19292400.0,1.47848,1.374062,0.114035,1.521149,0.067397,0.514137
min,0.37,0.508,0.302,0.482,10300.0,0.5122,0.60505,-0.739437,0.37,0.011609,0.02
25%,1.23,1.3,1.16,1.23,264550.0,1.24,1.218625,-0.048107,1.23,0.049069,0.0955
50%,1.79,1.91,1.71,1.81,5028300.0,1.836,1.89925,-0.008909,1.79,0.071775,0.167
75%,2.265,2.45,2.135,2.27,11503550.0,2.324,2.30775,0.045149,2.265,0.107743,0.31
max,11.1,12.29,9.71,11.71,188115400.0,9.236,7.4057,0.697095,11.1,0.361204,6.89


## Now you can see all the rows with null values got dropped and we have consistent counts across the board

# Let's add our target: the next day's closing price
## Since we get our target from the next day's closing price, we'll lose our last row as its target becomes null

In [None]:
# Rebuild the cleaned frame as an independent copy so that adding a column
# does not raise pandas' SettingWithCopyWarning.
df_clean = df.dropna().copy()

# Target = the close of the following row; the final row has no successor,
# so its target is NaN.
next_day_close = df_clean["Close"].shift(periods=-1)
df_clean["Target"] = next_day_close

df_clean.describe()

Price,Close,High,Low,Open,Volume,MA_5,MA_20,Daily_Return,lagged,Volatility,High_Low_Rng,Target
Ticker,RR,RR,RR,RR,RR,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
count,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,451.0,450.0
mean,2.074144,2.232213,1.934634,2.082984,11080900.0,2.071419,2.079212,0.007646,2.070929,0.091244,0.297579,2.068287
std,1.528297,1.676019,1.361791,1.526501,19292400.0,1.47848,1.374062,0.114035,1.521149,0.067397,0.514137,1.524922
min,0.37,0.508,0.302,0.482,10300.0,0.5122,0.60505,-0.739437,0.37,0.011609,0.02,0.37
25%,1.23,1.3,1.16,1.23,264550.0,1.24,1.218625,-0.048107,1.23,0.049069,0.0955,1.23
50%,1.79,1.91,1.71,1.81,5028300.0,1.836,1.89925,-0.008909,1.79,0.071775,0.167,1.79
75%,2.265,2.45,2.135,2.27,11503550.0,2.324,2.30775,0.045149,2.265,0.107743,0.31,2.26
max,11.1,12.29,9.71,11.71,188115400.0,9.236,7.4057,0.697095,11.1,0.361204,6.89,11.1


In [None]:
# Discard the final row, whose Target is NaN because no following close exists.
df_final = df_clean.iloc[:-1]
df_final.tail()

Price,Close,High,Low,Open,Volume,MA_5,MA_20,Daily_Return,lagged,Volatility,High_Low_Rng,Target
Ticker,RR,RR,RR,RR,RR,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2025-09-26,4.2,4.3,3.92,4.24,38614100,4.622,3.643,0.026895,4.09,0.120712,0.38,4.1
2025-09-29,4.1,4.59,4.06,4.46,55185500,4.334,3.69575,-0.02381,4.2,0.108721,0.53,4.29
2025-09-30,4.29,4.31,3.85,4.02,51088100,4.218,3.76575,0.046341,4.1,0.109129,0.46,4.76
2025-10-01,4.76,4.81,4.13,4.2,48283000,4.288,3.86975,0.109557,4.29,0.113146,0.68,5.16
2025-10-02,5.16,5.26,4.73,4.88,51263300,4.502,4.00075,0.084034,4.76,0.109413,0.53,6.18


In [None]:
# Predictors: the engineered indicator columns only.
feature_columns = ["MA_5", "MA_20", "Daily_Return", "lagged", "Volatility", "High_Low_Rng"]
X = df_final[feature_columns]

# Response: the next row's closing price.
y = df_final["Target"]


## Normally we would use scikit-learn's `train_test_split` with a random state to randomly divide the data into training and test sets, but for stocks it makes more sense to train on the past data and test on the most recent data.
###For this I'll use 70% Train - 30% Test
###OR 2021-2023 and July2023-Now

In [None]:
# Chronological split (no shuffling): the first 70% of the rows are used for
# training and the remaining, most recent rows are held out for testing.
split_idx = int(len(df_final)*0.7)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

X_train.tail()

Price,MA_5,MA_20,Daily_Return,lagged,Volatility,High_Low_Rng
Ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2025-03-14,1.624,1.91175,-0.017442,1.72,0.074644,0.2
2025-03-17,1.786,1.88625,0.325444,1.69,0.123548,0.58
2025-03-18,1.837,1.8535,-0.149554,2.24,0.135051,0.31
2025-03-19,1.937,1.8325,0.11811,1.905,0.138423,0.29
2025-03-20,2.021,1.816,0.004695,2.13,0.133655,0.41


##Because the data values in each column are vastly different numerical values we need to scale them so volatility which is often a small number (especially for VOO) has an actual impact against the moving avgs.
##This is done with Scaling

###Output should be small numbers

In [None]:
# StandardScaler is already imported in the notebook's import cell, so it is
# not imported a second time here.
scaler = StandardScaler()

# Fit on the training split only and reuse those statistics for the test
# split; fitting on the test data would leak information into the evaluation.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:

# Report the per-feature mean and standard deviation of each scaled split.
# The training split should come out at ~0 / ~1; the test split will only be
# close, because it is transformed with the training statistics.
for header, scaled in (
    ("=== X_train_scaled Statistics ===", X_train_scaled),
    ("\n=== X_test_scaled Statistics ===", X_test_scaled),
):
    print(header)
    print(f"Mean: {np.mean(scaled, axis=0)}")
    print(f"Std: {np.std(scaled, axis=0)}")

=== X_train_scaled Statistics ===
Mean: [-4.51138245e-17  2.25569122e-17 -1.12784561e-17 -4.51138245e-17
 -9.02276490e-17 -1.12784561e-17]
Std: [1. 1. 1. 1. 1. 1.]

=== X_test_scaled Statistics ===
Mean: [ 0.25203552  0.13037491  0.02390981  0.26086727 -0.29332841 -0.10498775]
Std: [0.41798945 0.2830823  0.61169172 0.43321979 0.40827564 0.32655994]


###Make sure columns are all there

In [None]:
# Sanity check: the scaled matrix must have one column per selected feature.
n_features = X_train_scaled.shape[1]
feature_names = X_train.columns.tolist()

print(f"Number of features: {n_features}")
print(f"Feature names: {feature_names}")

Number of features: 6
Feature names: [('MA_5', ''), ('MA_20', ''), ('Daily_Return', ''), ('lagged', ''), ('Volatility', ''), ('High_Low_Rng', '')]


## Next we need a cost function (the term is used interchangeably with "loss function"), which measures how far off the predictions are.
##This serves as a baseline before training

In [None]:
# Augmenting X_train is not needed when handling the bias term separately in the gradient descent function.
# Keeping this cell here as a placeholder, but it will be empty.

In [None]:
def gradfn(weights, X, y):
    '''
    Gradient of the cost sum((X @ weights - y)**2) / (2n) with respect to the weights.

    weights: the current guess for the weight vector (one entry per column of X)
          X: matrix of shape (n,m) of input features
          y: target values, one per row of X
    Returns (X^T (X @ weights - y)) / n, i.e. one partial derivative per weight,
    evaluated at the current weights. No bias term is handled here.
    '''
    n_samples, _ = np.shape(X)  # unpacking requires X to be 2-D
    residual = (X @ weights) - y
    return (np.transpose(X) @ residual) / float(n_samples)

In [None]:
def solve_via_gradient_descent(X, y, print_every=100000, niter=1000000, eta=0.0005):
    '''
    Fit the linear model yhat = X @ w + b by batch gradient descent on the
    cost sum((yhat - y)**2) / (2n), starting from w = 0 and b = 0.

    X: matrix of shape (n,m) of input features (scaled)
    y: target y values, one per row of X (1-D, or a single-column 2-D array)
    print_every: print the cost every `print_every` iterations;
                 0 or None disables the progress output
    niter: number of gradient descent iterations to run
    eta: learning rate (step size)
    Return the learned weights `w` (ndarray of shape (m,)) and bias `b`
    Raises ValueError if X is not 2-D, has no rows, or its row count differs
    from the number of target values
    '''
    X = np.asarray(X, dtype=float)
    # Flatten y so an (n, 1) column vector cannot broadcast against the (n,)
    # predictions into an (n, n) error matrix.
    y = np.asarray(y, dtype=float).ravel()

    if X.ndim != 2:
        raise ValueError(f"X must be 2-D with shape (n, m), got shape {X.shape}")
    n, m = X.shape
    if n == 0:
        raise ValueError("X must contain at least one row")
    if y.shape[0] != n:
        raise ValueError(f"X has {n} rows but y has {y.shape[0]} values")

    w = np.zeros(m)
    b = 0.0

    for k in range(niter):
        yhat = X @ w + b
        error = yhat - y
        dw = (X.T @ error) / n
        db = np.sum(error) / n

        # Update weights and bias
        w = w - eta * dw
        b = b - eta * db

        # The reported cost is based on the error computed before this
        # iteration's update. A falsy print_every turns reporting off and
        # avoids a modulo-by-zero.
        if print_every and k % print_every == 0:
            cost = np.sum(error**2) / (2*n)
            print(f"Iteration {k}: cost={cost:.4f}")

    return w, b

In [None]:
# Learn the weights and bias by gradient descent on the scaled training features.
w, b = solve_via_gradient_descent(X=X_train_scaled, y=y_train)

print('\nLearned parameters:')
print(f'Bias (b) = {b:.5f}')
for position, weight in enumerate(w, start=1):
    print(f'Weight w{position} = {weight:.5f}')

Iteration 0: cost=3.2929
Iteration 100000: cost=0.1159
Iteration 200000: cost=0.1126
Iteration 300000: cost=0.1119
Iteration 400000: cost=0.1117
Iteration 500000: cost=0.1117
Iteration 600000: cost=0.1117
Iteration 700000: cost=0.1117
Iteration 800000: cost=0.1117
Iteration 900000: cost=0.1117

Learned parameters:
Bias (b) = 1.91308
Weight w1 = -0.22668
Weight w2 = -0.03551
Weight w3 = 0.26481
Weight w4 = 2.12285
Weight w5 = 0.00107
Weight w6 = -0.35578


##VS Sklearn

In [None]:
# LinearRegression is already imported in the notebook's import cell, so it is
# not imported a second time here.
# Fit scikit-learn's linear regression on the same scaled training data as a
# reference for the gradient-descent parameters.
reg = LinearRegression().fit(X_train_scaled, y_train)

print(f'intercept={reg.intercept_:.5f}')
for position, coef in enumerate(reg.coef_, start=1):
    print(f'w{position}={coef:.5f}')

intercept=1.91308
w1=-0.22711
w2=-0.03544
w3=0.26483
w4=2.12325
w5=0.00108
w6=-0.35581


## Now that we have our model, let's use linear regression to predict data

In [None]:
def linreg(w, b, X):
    """Linear-model prediction: multiply the feature matrix X by the weights w, then add the bias b."""
    weighted_sum = X @ w
    return weighted_sum + b

In [None]:
# Take the parameters straight from the fitted model instead of hand-copying
# rounded values out of the printed output.
w1, w2, w3, w4, w5, w6 = reg.coef_
w = np.array([w1, w2, w3, w4, w5, w6])

# The coefficients were learned on standardized features, so they must be
# applied to X_test_scaled. Applying them to the raw X_test gives predictions
# on the wrong scale.
y_pred = pd.Series(
    linreg(w, reg.intercept_, X_test_scaled),
    index=X_test.index,  # keep the dates so predictions line up with y_test
)
print(y_pred)

Date
2025-03-21     5.819681
2025-03-24     6.076137
2025-03-25     6.757678
2025-03-26     6.446563
2025-03-27     6.151219
                ...    
2025-09-26     9.290408
2025-09-29     9.520690
2025-09-30     9.375715
2025-10-01     9.698016
2025-10-02    10.689308
Length: 135, dtype: float64


In [None]:
# Show the most recent rows of the raw downloaded prices.
stock2.tail()

Price,Close,High,Low,Open,Volume
Ticker,RR,RR,RR,RR,RR
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2025-09-29,4.1,4.59,4.06,4.46,55185500
2025-09-30,4.29,4.31,3.85,4.02,51088100
2025-10-01,4.76,4.81,4.13,4.2,48283000
2025-10-02,5.16,5.26,4.73,4.88,51263300
2025-10-03,6.18,6.3,5.02,5.305,102059100
