# Forecast pipeline
In this script I will guide you through the execution of the ML forecast pipeline. I will add comments about the pipeline process, model training and the predictive process. Each stage of the process can be run independently, but we can review and compare our results for a more interactive experience.

## Preprocess
The preprocess script has its own Jupyter notebook where I made an exploratory analysis of the information. In the script version I only compile the preprocess functions that allow us to run the ML pipeline.


In [None]:
import logging
import os
import sys
import time
import numpy as np
import pandas as pd
from datetime import timedelta, datetime
from tqdm import tqdm

In [None]:
# Make the helper modules that live under ./preprocess importable.
# "__file__" is intentionally a string literal: realpath() resolves it against
# the current working directory, so its parent is the directory the notebook
# is being run from.
current_dir = os.path.split(os.path.realpath("__file__"))[0]
preprocess_dir = os.path.join(current_dir, "preprocess")
preprocess_dir = os.path.abspath(preprocess_dir)
sys.path.append(preprocess_dir)

In [None]:
# Display the resolved path that was appended to sys.path, as a sanity check.
preprocess_dir

In [None]:
# Import the preprocessing, series-completion and feature-engineering helpers from the modules in ./preprocess
from preprocess_script import load_raw_data, calculate_sample_size, preprocess_data
from complete_series import create_weekly_date_dataframe, expand_time_series, create_seasonal_controls, impute_default
from feature_engineering import discard_uncompleted_windows, moving_average_variables



In [None]:
# --- Settings for the "Complete Time Series" stage ---
# TEST switches every stage between the "*_test.csv" subset files (True)
# and the full data files (False).
TEST = True
# Name of the date column; shared by the Complete Series and the
# Feature Engineering stages (it was previously assigned twice).
date_column = "date"
aggregation_cols = ["year", "month", "week"]

# --- Settings for the "Feature Engineering" stage ---
# Lags and rolling-window sizes handed to moving_average_variables
# (presumably expressed in weeks, since the series is weekly - confirm).
lag_list = [2, 4, 6, 10]
rolling_list = [2, 4, 6]
# Window passed to discard_uncompleted_windows: the largest lag plus the
# largest rolling window, plus one.
evaluation_window = max(lag_list) + max(rolling_list) + 1


## Preprocess


In [None]:
# Raw commit-history export: <current_dir>/data/raw/commit_history_raw.csv
raw_data_path = os.path.join(
    current_dir,
    "data",
    "raw",
    "commit_history_raw.csv",
)

# Read it with the helper imported from preprocess_script.
df_raw = load_raw_data(raw_data_path)

In [None]:
# Sum commit_count for each (repo_author_single, year, week_number) group;
# the number of resulting groups is the population size used for sampling.
# NOTE(review): despite its name, total_repositories counts
# repo/year/week groups rather than distinct repositories - confirm intent.
commit_totals_per_group = df_raw.groupby(
    ["repo_author_single", "year", "week_number"]
)["commit_count"].sum()
total_repositories = len(commit_totals_per_group)
sample_size = calculate_sample_size(total_repositories)

# Run the preprocessing step on the raw frame with the computed sample size.
# Its return value is not stored here.
preprocess_data(df_raw, sample_size)


## Complete Time Series

In [None]:
# Expand the preprocessed commit history into a complete weekly series per
# repository and save the result.
# TEST selects the "_test" variant of both the input and the output file.
file_suffix = "_test" if TEST else ""
df = pd.read_csv(f"./data/preprocess/commit_history_subset{file_suffix}.csv")
df[date_column] = pd.to_datetime(df[date_column])

group_id = ["repo_name"]
# One row per repository (first non-null value of every other column).
df_index = df.groupby(group_id).first().reset_index()

# The weekly calendar spans the whole observed date range.
start_date = df[date_column].min()
end_date = df[date_column].max()

df_dates_week = create_weekly_date_dataframe(
    start_date, end_date, week_start="sunday"
)  # Choose between sunday or monday
df_expand = expand_time_series(df, date_column, df_index, df_dates_week)
df_expand = create_seasonal_controls(df_expand, date_column=date_column)
# Presumably fills the weeks that had no commits with 0 - confirm against
# impute_default.
df_all_preproc = impute_default(df_expand, ["commit_count"], 0)
df_all_preproc = df_all_preproc[
    ["repo_name", "year", "commit_count", date_column, "month", "week"]
]
df_all_preproc.to_csv(
    f"./data/preprocess/commit_series_expansion{file_suffix}.csv",
    index=False,
)


## Feature Engineering

In [None]:

# Build the moving-average features on top of the expanded weekly series.
source_csv = (
    "./data/preprocess/commit_series_expansion_test.csv"
    if TEST
    else "./data/preprocess/commit_series_expansion.csv"
)
df = pd.read_csv(source_csv)

# Two feature frames come back: one named for plain window means and one for
# exponentially weighted means (going by the variable names - confirm).
df_window_mean, df_window_ewm = moving_average_variables(
    df, date_column, lag_list, rolling_list
)

# Attach both feature sets to the base series, keyed by date and repository,
# then order each repository chronologically.
merge_keys = [date_column, "repo_name"]
df_out = (
    df.merge(df_window_mean, on=merge_keys, how="inner")
    .merge(df_window_ewm, on=merge_keys, how="inner")
    .sort_values(["repo_name", date_column], ascending=True)
)

# The CSV round-trip left the dates as text; parse them again.
df_out[date_column] = pd.to_datetime(df_out[date_column])

file_path = (
    "./data/preprocess/featureengineering_test.csv"
    if TEST
    else "./data/preprocess/featureengineering.csv"
)
df_out = discard_uncompleted_windows(df_out, evaluation_window, date_column, "W")
df_out.to_csv(file_path, index=False)


# Process
# Hyperparameter Optimization


In [None]:
# Make the modules that live under ./process importable.
# NOTE: the variable keeps the name `preprocess_dir`, but from here on it
# holds the path of the "process" directory.
current_dir = os.path.split(os.path.realpath("__file__"))[0]
preprocess_dir = os.path.join(current_dir, "process")
preprocess_dir = os.path.abspath(preprocess_dir)
sys.path.append(preprocess_dir)

In [None]:
# Display the path just appended to sys.path; despite the variable name it
# now points at the "process" directory.
preprocess_dir

In [None]:
from hyperparameter_optimization import preprocess_data, hyperparameter_optimization
from iterative_prediction import create_iterative_forecast


In [None]:
# From the previous script
lag_list = [2, 4, 6, 10]
rolling_list = [2, 4, 6]
date_column = "date"
evaluation_window = (
    max(lag_list) * 7 + max(rolling_list) * 7
)  # minimum data to run t
prediction_window = 7 * 12  # days_in_week*number_weeks
cut_date = pd.to_datetime("2021-12-26")



In [None]:

# Load the engineered features (test subset when TEST is set).
file_path = (
    "./data/preprocess/featureengineering_test.csv"
    if TEST
    else "./data/preprocess/featureengineering.csv"
)
df = pd.read_csv(file_path)

# Keep only the rows that have a target value.
df = df[df["commit_count"].notna()]

# Data preprocessing. This is the preprocess_data imported from
# hyperparameter_optimization, not the one from preprocess_script.
# (The original note here read "30 days"; its meaning is not visible in this
# notebook.)
df = preprocess_data(df, date_column)

# Tune the models on commit_count using the window settings defined above.
hyperparameter_optimization(
    df=df,
    target="commit_count",
    prediction_window=prediction_window,
    evaluation_window=evaluation_window,
    cut_date=cut_date,
)

#! NOTE: No data is saved; we only trained the models and saved them.

## Iterative prediction

In [None]:
# Display the current prediction_window. When the cells are run in order it
# still holds the value in days (7 * 12) set for the hyperparameter stage;
# the next cell redefines it in weeks.
prediction_window

In [None]:
# Run-mode switches for the iterative prediction stage.
TEST = True
parallel = False
prediction_contained = True

# From here on the windows are counted in weeks.
prediction_window = 12
lag_list = [2, 4, 6, 10]
rolling_list = [2, 4, 6]
evaluation_window = max(lag_list) + max(rolling_list) + 1
cut_date = pd.to_datetime("2021-12-26")

# We stand at t = cut_date.
# - prediction_contained=True: start forecasting prediction_window weeks
#   before cut_date, so the predictions run up to cut_date.
# - prediction_contained=False: start at cut_date and iterate into the future.
# In both cases at least evaluation_window weeks of history are needed before
# forecast_start; min_date marks the beginning of that history. The forecast
# then advances one week at a time, reusing earlier predictions.
forecast_start = (
    cut_date - timedelta(days=prediction_window * 7)
    if prediction_contained
    else cut_date
)
min_date = forecast_start - timedelta(days=evaluation_window * 7)

# Produce one prediction file per model name and log how long each run took.
# The input file does not depend on the model, so choose it once.
file_path = (
    "./data/preprocess/featureengineering_test.csv"
    if TEST
    else "./data/preprocess/featureengineering.csv"
)

for model in ["xgboost", "randomforest"]:
    start_time = time.time()

    df = pd.read_csv(file_path)
    df = df[~df["commit_count"].isna()]
    # Data Preprocessing
    df = preprocess_data(df, "date")
    # NOTE(review): this reload replaces the filtered / preprocessed frame
    # built above, so the forecast receives the raw CSV contents. Kept as in
    # the original - confirm which input the forecast is meant to get.
    df = pd.read_csv(file_path)

    # The original called `create_forcast`, a name that is never defined or
    # imported in this notebook (NameError). The forecasting helper imported
    # from iterative_prediction is `create_iterative_forecast`.
    # NOTE(review): `model` is only used in the output file name and is not
    # passed to the forecast call - confirm how the helper selects the model.
    df_predicted_all = create_iterative_forecast(
        df,
        forecast_start=forecast_start,
        parallel=parallel,
        evaluation_window=evaluation_window,
        min_date=min_date,
    )
    elapsed_time = round(time.time() - start_time)

    logging.info("Time taken for the operation: %s seconds", elapsed_time)
    # SAVE
    df_predicted_all.to_csv(f"./data/process/predictions_{model}.csv")

