## Packages

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [3]:
from scipy.stats import t

In [14]:
from fractions import Fraction

## Collecting Data

In [None]:
eurusd = yf.Ticker("EURUSD=X").history(start="2024-01-01", end="2024-02-01", interval="1d")
gbpusd = yf.Ticker("GBPUSD=X").history(start="2024-01-01", end="2024-02-01", interval="1d")
# eurusd.to_csv('../00-data/eurusd.csv')
# gbpusd.to_csv('../00-data/gbpusd.csv')

In [2]:
# apple = yf.Ticker("AAPL").history(start="2025-06-01", end="2025-06-20", interval="1d")
apple = yf.Ticker("AAPL").history(start="1996-06-01", end="1996-06-16", interval="1d")
microsoft = yf.Ticker("MSFT").history(start="1996-06-01", end="1996-06-16", interval="1d")

google = yf.Ticker("GOOG").history(start="2021-06-01", end="2025-06-16", interval="1d")
amazon = yf.Ticker("AMZN").history(start="2025-06-01", end="2025-06-16", interval="1d")

In [85]:
byd = yf.Ticker("BYDDY").history(start="2025-09-01", end="2025-09-10", interval="1d")
tesla = yf.Ticker("TSLA").history(start="2025-09-01", end="2025-09-10", interval="1d")

In [116]:
coke = yf.Ticker("COKE").history(start="2025-09-01", end="2025-09-10", interval="1d")
coke.Open.round()

Date
2025-09-02 00:00:00-04:00    117.0
2025-09-03 00:00:00-04:00    117.0
2025-09-04 00:00:00-04:00    117.0
2025-09-05 00:00:00-04:00    119.0
2025-09-08 00:00:00-04:00    121.0
2025-09-09 00:00:00-04:00    123.0
Name: Open, dtype: float64

In [117]:
pepsi = yf.Ticker("PEP").history(start="2025-09-01", end="2025-09-10", interval="1d")
pepsi.Open.round()

Date
2025-09-02 00:00:00-04:00    156.0
2025-09-03 00:00:00-04:00    148.0
2025-09-04 00:00:00-04:00    148.0
2025-09-05 00:00:00-04:00    146.0
2025-09-08 00:00:00-04:00    146.0
2025-09-09 00:00:00-04:00    141.0
Name: Open, dtype: float64

In [126]:
pepper = yf.Ticker("KDP").history(start="2025-09-01", end="2025-09-10", interval="1d")
pepper.Open.round()

Date
2025-09-02 00:00:00-04:00    29.0
2025-09-03 00:00:00-04:00    28.0
2025-09-04 00:00:00-04:00    29.0
2025-09-05 00:00:00-04:00    29.0
2025-09-08 00:00:00-04:00    28.0
2025-09-09 00:00:00-04:00    27.0
Name: Open, dtype: float64

In [127]:
monster = yf.Ticker("MNST").history(start="2025-09-01", end="2025-09-10", interval="1d")
monster.Open.round()

Date
2025-09-02 00:00:00-04:00    62.0
2025-09-03 00:00:00-04:00    62.0
2025-09-04 00:00:00-04:00    64.0
2025-09-05 00:00:00-04:00    64.0
2025-09-08 00:00:00-04:00    62.0
2025-09-09 00:00:00-04:00    63.0
Name: Open, dtype: float64

## 1 OLS

The Gauss-Markov theorem states 
that Ordinary Least Squares (OLS) estimators 
are the best linear unbiased estimators (BLUE) 
in a linear regression model, 
meaning they have the minimum variance among all linear unbiased estimators.

## 2 Hypothesis Testing

### t-Distribution

In [1]:
from scipy.stats import t

In [4]:
# Define degrees of freedom
df = 29 # For a sample size of 30 (n-1)

In [5]:
# Calculate PDF at a specific value
x_val = 1.5
pdf_value = t.pdf(x_val, df)
print(f"PDF at x={x_val}: {pdf_value}")

PDF at x=1.5: 0.12893966337578505


In [6]:
# Calculate CDF at a specific value
cdf_value = t.cdf(x_val, df)
print(f"CDF at x={x_val}: {cdf_value}")

CDF at x=1.5: 0.9277881519798071


In [None]:
# Find the critical t-value for a 95% confidence interval (two-tailed)
# This means alpha/2 = 0.025 in each tail, so we look for 0.975
critical_t_value = t.ppf(0.975, df)
# Calculates the Percent Point Function (PPF), also known as the quantile function or inverse CDF. 
# This gives the value x for a given cumulative probability q.
print(f"Critical t-value for 95% CI: {critical_t_value}")

Critical t-value for 95% CI: 2.045229642132703


### 2D Playground

In [128]:
x = coke.Open.round().values #.reshape(-1, 1)
y = pepsi.Open.round().values

In [129]:
print(f"\u03A3 xy = {np.inner(x, y)}")
print(f"\u03A3 x = {np.sum(x)}")
print(f"\u03A3 y = {np.sum(y)}")
print(f"\u03A3 x^2 = {sum([num ** 2 for num in x])}")
print(f"\u03A3 y^2 = {sum([num ** 2 for num in y])}")

Σ xy = 105267.0
Σ x = 714.0
Σ y = 885.0
Σ x^2 = 84998.0
Σ y^2 = 130657.0


In [138]:
beta = (len(x)*np.inner(x, y)-np.sum(x)*np.sum(y))/(len(x)*sum([num ** 2 for num in x]) - np.sum(x)*np.sum(x))
beta

np.float64(-1.5)

In [131]:
alpha = (np.sum(y) - beta * np.sum(x))/len(x)
alpha

np.float64(326.0)

In [132]:
e = y - alpha - beta * x # unbiased estimate of error
sigma2 = sum([num ** 2 for num in e])/(len(x)-2)
sigma = np.sqrt(sum([num ** 2 for num in e])/(len(x)-2))

print(f"\u03A3  e^2 = {sum([num ** 2 for num in e])}")
print(f"\u03C3^2 = 1/(n-2)\u03A3 e^2 =  {sum([num ** 2 for num in e])/(len(x)-2)}")
print(f"\u03C3 = {np.sqrt(sum([num ** 2 for num in e])/(len(x)-2))}")

Σ  e^2 = 47.5
σ^2 = 1/(n-2)Σ e^2 =  11.875
σ = 3.4460121880225554


In [133]:
sigma_b = np.sqrt( sigma2/sum( [ num**2 for num in (x-x.mean()) ] ) )

print(f"\u03C3^2_b = \u03C3^2 / \u03A3 (x-x_avg)^2=  {sigma2/sum( [num**2 for num in (x-x.mean())])}")
print(f"\u03C3_b =  {np.sqrt(sigma2/sum( [num**2 for num in (x-x.mean())]))}")

σ^2_b = σ^2 / Σ (x-x_avg)^2=  0.37109375
σ_b =  0.6091746465505602


In [134]:
sigma_a = np.sqrt( sigma2 * sum ([num**2 for num in x]) / (len(x) * sum([num**2 for num in x-x.mean()])) )

print(f"\u03C3^2_a = \u03C3^2\u03A3 x^2/ n\u03A3 (x-x_avg)^2=  {sigma_a**2}")
print(f"\u03C3_a =  {sigma_a}")

σ^2_a = σ^2Σ x^2/ nΣ (x-x_avg)^2=  5257.037760416668
σ_a =  72.50543262691885


In [125]:
# 2. Add a constant to the independent variable for the intercept
# This creates the design matrix for statsmodels
X_with_intercept = sm.add_constant(x)

# 3. Create and fit the OLS (Ordinary Least Squares) model
model = sm.OLS(y, X_with_intercept)
results = model.fit()

# 4. Print the detailed summary
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.603
Model:                            OLS   Adj. R-squared:                  0.503
Method:                 Least Squares   F-statistic:                     6.063
Date:                Sat, 04 Oct 2025   Prob (F-statistic):             0.0695
Time:                        12:43:14   Log-Likelihood:                -14.721
No. Observations:                   6   AIC:                             33.44
Df Residuals:                       4   BIC:                             33.02
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        326.0000     72.505      4.496      0.0

  warn("omni_normtest is not valid with less than 8 observations; %i "


In [136]:
print(f"{beta + t.ppf(0.025, len(x)-2)*sigma_b}, {beta - t.ppf(0.025, len(x)-2)*sigma_b}")

-3.1913399656259025, 0.1913399656259025


In [137]:
t.ppf(0.025, len(x)-2)

np.float64(-2.7764451051977996)

In [135]:
results.tvalues

array([ 4.49621481, -2.46234805])

In [72]:
t.ppf(0.05, len(x)-2)

np.float64(-1.8595480375228428)

In [64]:
print(f"\u03C3_b = {sigma_b},    \u03C3_a = {sigma_a}")

σ_b = 8.096540558524469,    σ_a = 1.4982625879987566


In [67]:
print(f"{alpha + t.ppf(0.025, len(x)-2)*sigma_a}, {alpha - t.ppf(0.025, len(x)-2)*sigma_a}")

1.5513600590613592, 8.461359506155016


In [60]:
print(f"{beta + t.ppf(0.025, len(x)-2)*sigma_b}, {beta - t.ppf(0.025, len(x)-2)*sigma_b}")

-20.730438617498578, 16.610873400112773


In [66]:
print(f"t.ppf = {t.ppf(0.025, len(x)-2)},    t.sf = {t.sf(0.025, len(x)-2)}")

t.ppf = -2.3060041352041662,    t.sf = 0.490333657237232


In [59]:
beta + t.ppf(0.025, len(x)-2)*sigma_b

np.float64(-20.730438617498578)

In [71]:
beta - t.ppf(0.025, len(x)-2)*sigma_b

np.float64(16.610873400112773)

### 3D Playground

In [150]:
# x = coke.Open.round().values #.reshape(-1, 1)
y = np.array([60, 36, 36, 15, 90])
x1 = np.array([40, 55, 45, 30, 30])
x2 = np.array([3, 6, 5, 3.5, 1.5])

In [151]:
x = np.array([x1, x2]) @ np.array([x1, x2]).T

In [164]:
x

array([[8450. ,  825. ],
       [ 825. ,   84.5]])

In [152]:
np.linalg.inv(x)

array([[ 0.00252994, -0.0247006 ],
       [-0.0247006 ,  0.25299401]])

In [154]:
np.linalg.inv(x) @ x

array([[1.00000000e+00, 3.05311332e-16],
       [6.10622664e-16, 1.00000000e+00]])

In [158]:
# Convert to pandas DataFrame for easier handling, especially with the formula API
df = pd.DataFrame(np.array([x1, x2]).T, columns=['x1', 'x2'])
df['y'] = y

In [179]:
x @ np.array([[169/66800, -33/1336], [-33/1336, 169/668]])

array([[ 1.00000000e+00,  2.27040609e-14],
       [-2.98372438e-16,  1.00000000e+00]])

In [176]:
np.linalg.inv(x) @ np.array([x1, x2]) @ y

array([  4.29004491, -32.8495509 ])

In [184]:
beta = np.array([[169/66800, -33/1336], [-33/1336, 169/668]]) @ np.array([x1, x2]) @ y

In [185]:
beta

array([  4.29004491, -32.8495509 ])

In [188]:
np.sqrt(sum(num**2 for num in y - np.array([x1, x2]).T @ beta)/(len(y)-2))


np.float64(10.70391178704824)

In [159]:
df

Unnamed: 0,x1,x2,y
0,40.0,3.0,60
1,55.0,6.0,36
2,45.0,5.0,36
3,30.0,3.5,15
4,30.0,1.5,90


In [170]:
# 2. Add a constant to the independent variables for the intercept term
# This is necessary when using the non-formula API (sm.OLS)
X_with_constant = sm.add_constant(df[['x1', 'x2']])

In [177]:
# 3. Fit the OLS model
# Method 1: Using the direct OLS class
model_direct = sm.OLS(df['y'], df[['x1', 'x2']])
results_direct = model_direct.fit()

In [178]:
# 4. Print the summary of the regression results
print("Results from direct OLS:")
print(results_direct.summary())

Results from direct OLS:
                                 OLS Regression Results                                
Dep. Variable:                      y   R-squared (uncentered):                   0.976
Model:                            OLS   Adj. R-squared (uncentered):              0.961
Method:                 Least Squares   F-statistic:                              61.85
Date:                Sat, 04 Oct 2025   Prob (F-statistic):                     0.00364
Time:                        14:27:05   Log-Likelihood:                         -17.671
No. Observations:                   5   AIC:                                      39.34
Df Residuals:                       3   BIC:                                      38.56
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                 coef    std err          t      P>|t|      [0.025      0.975]
----------------

  warn("omni_normtest is not valid with less than 8 observations; %i "


In [172]:
# Method 2: Using the formula API (often more convenient)
# Requires 'statsmodels.formula.api'
import statsmodels.formula.api as smf
model_formula = smf.ols("y ~ x1 + x2", data=df)
results_formula = model_formula.fit()

In [175]:
# 4. Print the summary of the regression results
print("Results from formula OLS:")
print(results_formula.summary())

Results from formula OLS:
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.911
Model:                            OLS   Adj. R-squared:                  0.821
Method:                 Least Squares   F-statistic:                     10.18
Date:                Sat, 04 Oct 2025   Prob (F-statistic):             0.0895
Time:                        14:25:52   Log-Likelihood:                -17.277
No. Observations:                   5   AIC:                             40.55
Df Residuals:                       2   BIC:                             39.38
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     15.9733     

  warn("omni_normtest is not valid with less than 8 observations; %i "
