<a href="https://colab.research.google.com/github/Alphaomegainfinity/yahoo_stocks_market/blob/main/yahoo_stock.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
!pip install yfinance
!pip install ipympl
# !pip install pandas
!pip install bigquery



In [8]:
# Use the interactive (ipympl) matplotlib backend for the figures in this notebook.
%matplotlib widget
from google.colab import output
# NOTE(review): presumably required so Colab renders the ipympl widgets - confirm against Colab docs.
output.enable_custom_widget_manager()

# import dependencies
import datetime
import io
import os
import urllib.request

import numpy as np
import pandas as pd
import yfinance as yf
from tqdm import tqdm

# import sqlalchemy for database connection
from sqlalchemy import Column, DateTime, Float, MetaData, String, Table
from sqlalchemy import create_engine
from sqlalchemy import inspect
from sqlalchemy.orm import sessionmaker

#Config should contain database username as username and database password as password
# from config import password

from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from statsmodels.tsa.arima.model import ARIMA
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.callbacks import EarlyStopping

import matplotlib.pyplot as plt

In [9]:
# Scrape the list of world indices (one row per index, with a 'Symbol' column)
url = 'https://finance.yahoo.com/world-indices'

# NOTE(review): the recorded run of this cell ended in a ValueError; presumably
# the page served to the default pandas/urllib client contained no parsable
# table - confirm. Requesting the page with a browser-like User-Agent and
# parsing the downloaded HTML keeps the fetch and the parse steps separate.
request = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
with urllib.request.urlopen(request, timeout=30) as response:
    html = response.read().decode('utf-8', errors='replace')

tables = pd.read_html(io.StringIO(html))

# Later cells iterate over world_stocks['Symbol'], so fail here with a clear
# message if no table on the page carries that column.
symbol_tables = [table for table in tables if 'Symbol' in table.columns]
if not symbol_tables:
    raise ValueError(f"No table with a 'Symbol' column found at {url}")

world_stocks = symbol_tables[0]
world_stocks

ValueError: ignored

In [None]:
def fetch_data(ticker_symbol):
    """Download the price history of one ticker through yfinance.

    Args:
        ticker_symbol: Symbol passed to ``yf.Ticker`` (e.g. ``"^GSPC"``).

    Returns:
        The history as a DataFrame whose former index is a regular column and
        which carries an extra ``ticker`` column set to ``ticker_symbol``.
        ``None`` when the download raises or returns no rows, so callers can
        skip the symbol with a single ``is not None`` check.
    """
    try:
        today = datetime.date.today().strftime('%Y-%m-%d')
        data = yf.Ticker(ticker_symbol).history(start="1900-01-01", end=today)

        # An empty frame carries no prices; report it instead of passing an
        # empty block on to the concatenation step.
        if data is None or data.empty:
            print(f"No data returned for {ticker_symbol}")
            return None

        # Non-mutating reset: move the index into a column of a new frame.
        data = data.reset_index()
        data['ticker'] = ticker_symbol
        return data
    except Exception as e:
        # Best effort: one failing symbol must not abort the whole download loop.
        print(f"Error fetching data for {ticker_symbol}: {e}")
        return None

In [None]:
# Download every listed symbol (tqdm shows progress) and keep only the
# symbols for which fetch_data returned a frame.
fetched_frames = (
    fetch_data(index_symbol)
    for index_symbol in tqdm(world_stocks['Symbol'], desc="Fetching data")
)
all_data = [frame for frame in fetched_frames if frame is not None]

# Stack the per-symbol frames into one table with a fresh 0..n-1 index
master_data_origin = pd.concat(all_data, ignore_index=True)
master_data_origin

In [None]:
# Check each column for missing values and for values that are not numeric
def data_checking(master_data_check):
    """Build a per-column data-quality report.

    Args:
        master_data_check: DataFrame to inspect.

    Returns:
        DataFrame indexed by the input's column names with two columns:
        ``Missing Values`` (count of NaN/None entries) and ``Incorrect Types``
        (count of entries that are present but cannot be parsed as a number).
    """
    # Identify NaN or empty values
    missing_values = master_data_check.isna().sum()

    def _count_non_numeric(column):
        # Values that turn into NaN under coercion are either missing already
        # or non-numeric; only the latter belong in the "Incorrect Types" count,
        # otherwise every missing value is reported twice.
        coerced = pd.to_numeric(column, errors='coerce')
        return int((coerced.isna() & column.notna()).sum())

    # Identify incorrect value types
    incorrect_types = master_data_check.apply(_count_non_numeric)

    # Combine the results into a DataFrame
    cleaning_report = pd.DataFrame({'Missing Values': missing_values, 'Incorrect Types': incorrect_types})

    return cleaning_report

In [None]:
# Show the per-column quality report for the raw, concatenated download
data_checking(master_data_origin)

In [None]:
# Work on a copy so the cleaning steps below leave master_data_origin untouched
master_data = master_data_origin.copy()

In [None]:
# Drop off the Adj Close column when it is present. errors='ignore' keeps the
# cell from raising KeyError when the download has no such column or when the
# cell is run a second time.
master_data = master_data.drop(columns=['Adj Close'], errors='ignore')

# Correct the Date column data type.
# NOTE(review): assumes yfinance returns tz-aware timestamps at local exchange
# midnight - confirm. Converting such values straight to UTC moves midnight for
# exchanges east of UTC onto the previous calendar day, and the next cell keeps
# only the date. Removing the offset first preserves each row's local trading
# date; for values that are already naive the replace() is a no-op.
master_data['Date'] = pd.to_datetime(
    master_data['Date'].map(lambda stamp: stamp.replace(tzinfo=None)),
    utc=True,
)
master_data.head()

In [None]:
# Keep the modelling columns in a fixed order, reduce Date to a midnight
# timestamp (no time of day), and strip the '^' prefix from the ticker.
# - assign() builds a new frame, so nothing is written into a column subset of
#   the previous frame (avoids chained-assignment warnings).
# - regex=False makes '^' a literal character on every pandas version; as a
#   regular expression it would be the start-of-string anchor.
master_data = (
    master_data[['ticker', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume']]
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date'].dt.date),
        ticker=lambda frame: frame['ticker'].str.replace('^', '', regex=False).astype(str),
    )
)
master_data

In [None]:
# Inspect the dtype of every column after the conversions above
master_data.dtypes

In [None]:
# Re-run the quality report on the cleaned frame
data_checking(master_data)

In [None]:
# (Disabled) Add a "Movement" column holding the intraday change (Close - Open) of each trading day
# master_data ['Movement'] = master_data ['Close'] - master_data ['Open']

# master_data['Color'] = master_data['Movement'].apply(lambda x: 'green' if x >= 0 else 'red')

In [None]:
# Unique ticker symbols, in order of first appearance
stock_symbols = master_data['ticker'].unique().tolist()

# One frame per ticker, each re-indexed from 0
individual_stocks = {
    symbol: master_data.loc[master_data['ticker'] == symbol].reset_index(drop=True)
    for symbol in stock_symbols
}

individual_stocks['GSPC']

In [None]:
# List every ticker symbol available in individual_stocks
print (stock_symbols)

In [None]:
# Date-indexed copy of the GSPC rows, re-sampled onto the business-day
# calendar ('B'); days without a source row come back as all-NaN rows.
data = (
    individual_stocks['GSPC']
    .copy()
    .set_index('Date')
    .asfreq('B')
)

# Keep a row only if it has at least (number of columns - 5) non-missing
# values, i.e. drop rows with more than 5 missing values.
min_non_missing = len(data.columns) - 5
data = data.dropna(thresh=min_non_missing)

data

In [None]:

# Predict the close from the open/high/low columns of the same row
features = ['Open', 'High', 'Low']
target = 'Close'

X = data[features]
y = data[target]

# Split data into train and test sets chronologically: shuffle=False keeps the
# first 70% of the date-ordered rows for training and the last 30% for testing.
# A shuffled split would feed ARIMA a scrambled series, compare its forecast
# against unrelated dates, and let every model train on rows that lie in the
# future of its test rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

# Standardize features; the scaler is fitted on the training rows only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Splitter for time-ordered cross-validation (5 folds)
tscv = TimeSeriesSplit(n_splits=5)

# Untrained ARIMA(5, 1, 0) on the training target plus two tree ensembles
arima_model = ARIMA(y_train, order=(5, 1, 0))
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)

# The LSTM layer takes 3D input: insert a length-1 middle axis so each row
# becomes (1, n_features)
X_train_lstm = X_train_scaled[:, np.newaxis, :]
X_test_lstm = X_test_scaled[:, np.newaxis, :]

# 50-unit LSTM followed by a single-output dense layer
lstm_model = Sequential([
    LSTM(50, input_shape=X_train_lstm.shape[1:]),
    Dense(1),
])
lstm_model.compile(optimizer='RMSprop', loss='mse') # try different optimizer: adam, SGD or RMSprop

# Stop once val_loss has not improved for 10 epochs and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train models
arima_model_fit = arima_model.fit()
rf_model.fit(X_train_scaled, y_train)
gb_model.fit(X_train_scaled, y_train)

# Early stopping watches val_loss and restores the best weights, so validating
# on the test set would select the network using the very rows it is scored on
# afterwards. validation_split holds out the last 20% of the training samples
# instead, keeping the test set unseen until evaluation.
lstm_model.fit(
    X_train_lstm,
    y_train.to_numpy(),
    epochs=2000,
    batch_size=25,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Predictions: one value per test row from every model
arima_preds = arima_model_fit.forecast(steps=len(y_test))
rf_preds = rf_model.predict(X_test_scaled)
gb_preds = gb_model.predict(X_test_scaled)
# Flatten the (n, 1) network output to a 1D array
lstm_preds = lstm_model.predict(X_test_lstm).reshape(-1)

# Date index for the forecasted results. Each prediction belongs to the test
# row at the same position, so the test dates themselves are the correct
# labels. The previous custom range used weekmask='1001000', which marks only
# Monday and Thursday as business days and therefore produced dates unrelated
# to the test rows.
forecast_start_date = y_test.index[0]
forecast_periods = len(arima_preds)
forecast_index = y_test.index

In [None]:
# Evaluate models: MSE, MAE and RMSE of every model against the test target
model_predictions = {
    'ARIMA': arima_preds,
    'Random Forest': rf_preds,
    'Gradient Boosting': gb_preds,
    'LSTM': lstm_preds,
}

for position, (model_name, predictions) in enumerate(model_predictions.items()):
    # Blank line between models, none before the first one
    print(model_name if position == 0 else f"\n{model_name}")
    mse = mean_squared_error(y_test, predictions)
    print("MSE:", mse)
    print("MAE:", mean_absolute_error(y_test, predictions))
    print("RMSE:", np.sqrt(mse))

# Table of the forecasted results under the date index.
# The ARIMA forecast is passed as a plain array: a pandas Series would be
# re-aligned on its own index labels and show up as NaN wherever those labels
# differ from forecast_index, while the other columns are positional arrays.
forecast_df = pd.DataFrame(
    {
        'ARIMA': np.asarray(arima_preds),
        'RF': rf_preds,
        'GB': gb_preds,
        'LSTM': lstm_preds,
    },
    index=forecast_index,
)
forecast_df

In [None]:
# Overlay the actual test prices and every model's predictions on one axes
fig, ax = plt.subplots(figsize=(10, 6))

plotted_series = [
    (y_test.values, 'Actual', 'black'),
    (arima_preds, 'ARIMA', 'blue'),
    (rf_preds, 'Random Forest', 'red'),
    (gb_preds, 'Gradient Boosting', 'green'),
    (lstm_preds, 'LSTM', 'yellow'),
]
for series_values, series_label, series_color in plotted_series:
    ax.plot(y_test.index, series_values, label=series_label, color=series_color)

ax.legend()
ax.set_xlabel('Date')
ax.set_ylabel('Price')
ax.set_title('Stock Price Prediction')
plt.show()

In [None]:
# Persist the fitted GSPC models to Google Drive for future use
import joblib

# Mount Google Drive so that /content/drive/My Drive is writable
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): each model is written twice, once under an .h5 name and once
# under a .keras name. joblib.dump and the statsmodels save() write
# pickle-based files, so only the two lstm_model files are expected to be real
# HDF5 / Keras archives despite the shared extensions - confirm before renaming.

# ARIMA results (statsmodels save)
arima_model_fit.save('/content/drive/My Drive/arisma_GSPC_model.h5')
arima_model_fit.save('/content/drive/My Drive/arismaGSPC.keras')

# Random forest (joblib)
joblib.dump(rf_model,'/content/drive/My Drive/rf_GSPC_model.h5')
joblib.dump(rf_model,'/content/drive/My Drive/rfGSPC.keras')

# Gradient boosting (joblib)
joblib.dump(gb_model,'/content/drive/My Drive/gb_GSPC_model.h5')
joblib.dump(gb_model,'/content/drive/My Drive/gbGSPC.keras')

# LSTM network (Keras save, format chosen by the file extension)
lstm_model.save('/content/drive/My Drive/lstm_GSPC_model.h5')
lstm_model.save('/content/drive/My Drive/lstmGSPC.keras')

In [None]:
import google.cloud.bigquery as bigquery

# BigQuery client and references to the target dataset/table.
# NOTE(review): 'bq-dataset' and 'bq-table' look like placeholders, and the
# references are not used further down - confirm the intended names.
client = bigquery.Client()

dataset_ref = client.dataset('bq-dataset')
table_ref = dataset_ref.table('bq-table')

# Only `create_engine` is imported from sqlalchemy at the top of the notebook,
# so it is called directly; `sqlalchemy.create_engine` raised NameError.
# NOTE(review): the URL names no project or dataset, so where 'stocks' is
# created depends on the environment defaults - confirm.
engine = create_engine('bigquery://')

# A Table is registered on a MetaData collection, not on the engine; the
# engine is supplied only when the DDL is emitted.
metadata = MetaData()
schema = Table('stocks', metadata,
              Column('ticker', String, primary_key=True),
              Column('Date', DateTime, primary_key=True),
              Column('Open', Float),
              Column('High', Float),
              Column('Low', Float),
              Column('Close', Float),
              Column('Volume', Float))

# create the table; checkfirst=True makes re-running the cell a no-op when the
# table already exists
schema.create(engine, checkfirst=True)