# All Modules Functionality Workbook

This notebook demonstrates all steps of the workflow: fetching, transforming, training, inference, and simulation.

In [None]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules (e.g. the `scripts` package used below) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
# Check local directory is 05-deployment-and-automation
# (the `scripts` imports and the relative 'local_data/' paths used below are resolved from it)
!pwd

In [None]:
# workflow settings
# Each flag chooses between recomputing a stage and loading its persisted result from 'local_data/'.
FETCH_REPO = True  # True: repo.fetch() + repo.persist(); False: repo.load()
TRANSFORM_DATA = True  # True: transformed.transform() + persist(); False: transformed.load()
TRAIN_MODEL = True  # True: train_random_forest() + persist(); False: trained.load()

# 1) Fetching data from the APIs / loading it from local storage

In [None]:
from scripts.data_repo import DataRepository

# Holder for the raw datasets; later cells read repo.ticker_df, repo.indexes_df and repo.macro_df.
repo = DataRepository()

In [None]:
if not FETCH_REPO:
    # Reuse the datasets persisted by an earlier run
    repo.load(data_dir='local_data/')
else:
    # Pull all 3 datasets for all dates from the APIs ...
    repo.fetch()
    # ... and store them in a local dir for later runs
    repo.persist(data_dir='local_data/')

  
# DEBUG: Separate fetching of the datasets (YFinance-Tickers, YFinance-Indexes, FRED-Macro)
# repo.fetch_tickers()
# repo.fetch_indexes()
# repo.fetch_macro()

In [None]:
# NOT WORKING YET - need to move all data transformations to Transform

# # Test fetching data with a specific min_date
# min_date = '2024-06-01'
# print(f"\nFetching data from {min_date}:")
# repo.fetch(min_date=min_date)

# repo.ticker_df.Ticker.nunique()

In [None]:
# Summary of the Date column of the tickers frame: earliest value, latest value and
# number of non-null entries.
# The aggregations are passed as a list rather than a set literal: a set has no defined
# iteration order, so the row order of the result could differ between kernel sessions.
repo.ticker_df.Date.agg(['min', 'max', 'count'])

In [None]:
# Number of distinct tickers present in the tickers frame
repo.ticker_df['Ticker'].nunique()

In [None]:
import matplotlib.pyplot as plt

# For every Date, count the rows that have a non-null 'Open' value and plot the counts over time.
# Series.plot() returns the Axes it drew on, so the title and label are set on that Axes
# explicitly instead of going through the implicit pyplot "current figure" state.
ax = repo.ticker_df.groupby('Date')['Open'].count().plot()
ax.set_title('How quickly does ticker data appear in the dataframe?')
ax.set_ylabel("Rows with a non-null 'Open' value")
plt.show()


In [None]:
# (rows, columns) of the indexes frame
repo.indexes_df.shape

In [None]:
# Last row of the indexes frame
repo.indexes_df.tail(1)

In [None]:
# Last row of the macro frame
repo.macro_df.tail(1)

# 2) Transform all input fields (data_repo dfs) to one dataframe

In [None]:
from scripts.transform import TransformData

# The transformer is built on top of the repository populated in section 1
transformed = TransformData(repo=repo)

if TRANSFORM_DATA:
    # Recompute the combined frame and store it in the local dir
    transformed.transform()
    transformed.persist(data_dir='local_data/')
else:
    # Reuse the frame persisted by an earlier run
    transformed.load(data_dir='local_data/')

In [None]:
# Column names, dtypes and non-null counts of the transformed frame
transformed.transformed_df.info()

In [None]:
# Last two rows of the transformed frame
transformed.transformed_df.tail(2)

In [None]:
# Date values of the last 3 rows of the transformed frame
transformed.transformed_df['Date'].tail(3)

# 3) Train the best model 
* Model: Random Forest (max_depth=17, n_estimators=200)

In [None]:
from scripts.train import TrainModel
import warnings

# Silence every warning for the rest of the session
# (not recommended in production unless necessary)
warnings.filterwarnings("ignore")

trained = TrainModel(transformed=transformed)

# The prepared dataframes are needed on both paths (training and inference alike),
# so they are built once before branching.
trained.prepare_dataframe()

if TRAIN_MODEL:
    trained.train_random_forest()  # fit the model
    trained.persist(data_dir='local_data/')  # write the model to disk
else:
    trained.load(data_dir='local_data/')  # reuse the model saved by an earlier run


In [None]:
# resulting df: last two rows of the frame held by the trainer
trained.df_full.tail(2)

In [None]:
# Display the model object held by the trainer (trained above or loaded from 'local_data/')
trained.model

# 4) Inference

In [None]:
# Name passed to make_inference; later cells read the `prediction_name` and
# f'{prediction_name}_rank' columns of trained.df_full.
# NOTE(review): presumably make_inference is what adds those two columns - confirm in scripts/train.
prediction_name='pred_rf_best'
trained.make_inference(pred_name=prediction_name)

In [None]:
# Frame held by the trainer after the inference step above
trained.df_full

In [None]:
# Columns shown in the prediction tables of this section
COLUMNS = ['Adj Close', 'Ticker', 'Date', prediction_name, prediction_name + '_rank']

# Last 10 rows (ordered by Date) among the rows whose prediction rank equals 1
is_rank_one = trained.df_full[f'{prediction_name}_rank'] == 1
trained.df_full.loc[is_rank_one].sort_values(by="Date").tail(10)[COLUMNS]

In [None]:
# Last 10 rows (ordered by Date) among the rows whose prediction rank is 3 or better
is_top_three = trained.df_full[f'{prediction_name}_rank'] <= 3
print(trained.df_full.loc[is_top_three].sort_values(by="Date").tail(10)[COLUMNS])

In [None]:
# LAST DATE: tickers whose prediction rank is 3 or better on the most recent Date.
# Rows are selected with Date == max(Date) instead of taking the last three rows of a
# Date-sorted frame. That shortcut only returns last-date rows when the last date has
# exactly three qualifying rows; with fewer it pulls in rows from an earlier date, and
# with tied ranks it silently drops tickers.
rank_column = f'{prediction_name}_rank'
latest_date = trained.df_full['Date'].max()
is_latest_top_three = (trained.df_full['Date'] == latest_date) & (trained.df_full[rank_column] <= 3)
# Best rank first, so the list order is reproducible
tickers = trained.df_full.loc[is_latest_top_three].sort_values(by=rank_column)['Ticker'].to_list()
tickers


In [None]:
# Record the wall-clock time at which this cell (and hence the predictions above) was run
from datetime import datetime

# format() on a datetime applies the same strftime pattern
current_datetime = format(datetime.now(), "%Y-%m-%d %H:%M")
print(f"Current date and time: {current_datetime}")

In [None]:
# Recent predictions for the selected tickers, ordered by Date - is it the last day's jump?
is_selected_ticker = trained.df_full['Ticker'].isin(tickers)
print(trained.df_full.loc[is_selected_ticker].sort_values(by="Date").tail(10)[COLUMNS])

In [None]:
# several things on the predictions to choose from:
import pandas as pd
import plotly.express as px

# Working frame for the growth analysis: Date converted to datetime, rows ordered by
# ticker and then by date (the per-ticker percentage changes below rely on that order).
# `assign` returns a new frame, so trained.df_full itself is left untouched. Assigning
# the converted column through a plain reference to trained.df_full would modify the
# trainer's frame as a hidden side effect of running this cell.
df = (
    trained.df_full
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .sort_values(by=['Ticker', 'Date'])
)

# List of tickers for reference
# top 3 predictions defined previously

# Function to calculate growth percentage for a given interval
def calculate_growth(df, days):
    """Return `df` with an added per-ticker percentage-growth column.

    The new column is named f'Adj_Close_{days}d_growth' and holds, for every row, the
    percentage change of 'Adj Close' relative to the row `days` positions earlier within
    the same 'Ticker' group (NaN where no such earlier row exists). Rows are compared in
    their existing order, so the caller is expected to have sorted each ticker by date.

    Args:
        df: DataFrame with at least 'Ticker' and 'Adj Close' columns.
        days: Number of rows (periods) to look back within each ticker.

    Returns:
        A new DataFrame; the frame passed in is not modified.
    """
    growth = df.groupby('Ticker')['Adj Close'].pct_change(periods=days) * 100
    # assign() builds a new frame instead of writing the column into the caller's object
    return df.assign(**{f'Adj_Close_{days}d_growth': growth})

# Look-back windows (number of rows per ticker) for which a growth column is added
intervals = [1, 5, 30, 90, 225]
for interval in intervals:
    df = calculate_growth(df, interval)

# 1. Growth visualization: reshape to long format - one row per (Date, Ticker, interval)
growth_columns = [f'Adj_Close_{days}d_growth' for days in intervals]
growth_df = df.melt(
    id_vars=['Date', 'Ticker'],
    value_vars=growth_columns,
    var_name='Growth Interval',
    value_name='Growth %',
)
# Keep only the digits of names such as 'Adj_Close_30d_growth', stored as integers
growth_df['Growth Interval'] = growth_df['Growth Interval'].str.extract(r'(\d+)').astype(int)

# Bar chart of the non-null growth values per ticker, coloured by interval.
# NOTE(review): 'Growth Interval' is integer-typed at this point; plotly express presumably
# treats a numeric colour column as a continuous scale - confirm that the grouped layout
# and `category_orders` take effect as intended.
fig = px.bar(
    growth_df.dropna(),
    x='Ticker',
    y='Growth %',
    color='Growth Interval',
    barmode='group',
    title="Growth in Adj Close (%) over Different Time Intervals",
    category_orders={"Growth Interval": intervals},
)
fig.show()

# # 2. Predictions Graph (Last Month) for each Ticker
# # Filter for the last month's data
# last_month = df['Date'].max() - pd.DateOffset(days=30)
# last_month_df = df[df['Date'] >= last_month]

# # Line chart for predictions over the last month
# fig = px.line(last_month_df, x='Date', y='pred_rf_best', color='Ticker', 
#               title="Prediction Trends for the Last Month",
#               labels={'pred_rf_best': 'Prediction Value'})
# fig.show()

# # 3. 52-Week High, Low, and Current Price for Each Ticker
# from datetime import timedelta

# one_year_ago = df['Date'].max() - timedelta(days=365)
# summary = {}

# for ticker in tickers:
#     ticker_df = df[(df['Ticker'] == ticker) & (df['Date'] >= one_year_ago)]
#     current_price = df[df['Ticker'] == ticker].iloc[-1]['Adj Close']
#     min_price_52w = ticker_df['Adj Close'].min()
#     max_price_52w = ticker_df['Adj Close'].max()
    
#     summary[ticker] = {
#         'Current Price': current_price,
#         '52-Week High': max_price_52w,
#         '52-Week Low': min_price_52w,
#         'Position': f"{round((current_price - min_price_52w) / (max_price_52w - min_price_52w) * 100, 2)}%"
#     }

# print("52-Week Summary for Each Ticker:")
# for ticker, stats in summary.items():
#     print(f"\nTicker: {ticker}")
#     print(f"  Current Price: {stats['Current Price']}")
#     print(f"  52-Week High: {stats['52-Week High']}")
#     print(f"  52-Week Low: {stats['52-Week Low']}")
#     print(f"  Position within 52-Week Range: {stats['Position']}")

# # 4. Volatility (Standard Deviation) for 1 week and overall
# volatility = {}

# for ticker in tickers:
#     ticker_df = df[df['Ticker'] == ticker]
#     # 1-Week Volatility
#     one_week_df = ticker_df[ticker_df['Date'] >= ticker_df['Date'].max() - timedelta(days=7)]
#     volatility[ticker] = {
#         '1-Week Volatility': one_week_df['Adj Close'].std(),
#         'Overall Volatility': ticker_df['Adj Close'].std()
#     }

# print("\nVolatility Summary for Each Ticker:")
# for ticker, stats in volatility.items():
#     print(f"\nTicker: {ticker}")
#     print(f"  1-Week Volatility: {stats['1-Week Volatility']}")
#     print(f"  Overall Volatility: {stats['Overall Volatility']}")


In [None]:
# Long-format growth table built in the previous cell (Date, Ticker, Growth Interval, Growth %)
growth_df

In [None]:
# history of predictions for one stock
# print(trained.df_full[trained.df_full['Ticker']=='VZ'].sort_values(by="Date").tail(10)[COLUMNS])