In [28]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
import sys
sys.path.append('../') # Add root directory to path to import src
from src.feature_engineering import create_features

In [29]:
# Download historical data for a specific stock (e.g., VOO)
stock_df = yf.download('VOO', start='2014-01-01', end='2024-01-01')

  stock_df = yf.download('VOO', start='2014-01-01', end='2024-01-01')
[*********************100%***********************]  1 of 1 completed


In [30]:
featured_df = create_features(stock_df)

In [31]:
stock_df.tail()

Price,Close,High,Low,Open,Volume
Ticker,VOO,VOO,VOO,VOO,VOO
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2023-12-22,428.155884,429.680468,426.375551,428.323081,4035200
2023-12-26,429.936249,430.742817,428.441141,428.529693,3835800
2023-12-27,430.723114,430.841142,429.267389,429.739503,4271000
2023-12-28,430.791962,431.647698,430.487045,431.067371,4703700
2023-12-29,429.641144,431.155913,427.811642,430.693614,4374400


In [32]:
# Target: Will the price go up (1) or down (0) tomorrow?
featured_df['Target'] = np.where(featured_df['Close'].shift(-1) > featured_df['Close'], 1, 0)
# We can't use the last row since we don't know the future
featured_df = featured_df[:-1]

In [33]:
featured_df.tail()


Price,Close,High,Low,Open,Volume,ma20,ma50,volatility,rsi,macd,macd_signal,Target
Ticker,VOO,VOO,VOO,VOO,VOO,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2023-12-21,427.162445,427.516561,423.798483,426.031299,4621600,416.638507,400.807203,7.178486,69.639865,7.139153,6.743251,1
2023-12-22,428.155884,429.680468,426.375551,428.323081,4035200,417.555211,401.562774,7.42846,74.726664,7.141042,6.82281,1
2023-12-26,429.936249,430.742817,428.441141,428.529693,3835800,418.596684,402.395093,7.638817,76.240352,7.203166,6.898881,1
2023-12-27,430.723114,430.841142,429.267389,429.739503,4271000,419.657422,403.161844,7.781992,80.498156,7.232522,6.965609,1
2023-12-28,430.791962,431.647698,430.487045,431.067371,4703700,420.734827,403.929777,7.755977,78.585067,7.178591,7.008205,0


In [34]:
features = ['ma20', 'ma50', 'volatility', 'rsi', 'macd', 'macd_signal']
X = featured_df[features]
y = featured_df['Target']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
predictions = model.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_test, predictions):.2f}")

Model Accuracy: 0.52


In [36]:
# 1. Check the balance of the TRUE answers in your test set
print("Actual distribution in y_test:")
print(y_test.value_counts(normalize=True))

print("\n" + "="*40 + "\n")

# 2. Check what the model is ACTUALLY predicting
# We wrap `predictions` in a Pandas Series to use value_counts()
import pandas as pd
print("Model's predictions distribution:")
print(pd.Series(predictions).value_counts())

Actual distribution in y_test:
Target
1    0.536437
0    0.463563
Name: proportion, dtype: float64


Model's predictions distribution:
1    302
0    192
Name: count, dtype: int64


In [None]:
joblib.dump(model, '../models/stock_predictor.pkl')
print("Model saved to ../models/stock_predictor.pkl")