# Forecast pipeline
In this script I will guide you through the execution of the ML forecast pipeline. I will add comments about the pipeline process, model training and the prediction process. Each stage of the process can be run independently, but we can review and compare our results for a more interactive experience.

## Data Selection
This notebook takes only a representative subsample of the entire GitHub data available to train the models. Given the resource and time constraints, we are only using 10% of the total information (2% when TEST = True).


In [None]:
from utils.utils import (
    interact_categorical_numerical,
    perform_standardization,
    prepare_data,
)

In [None]:
import logging
import os
import sys
import time
import numpy as np
import pandas as pd
from datetime import timedelta, datetime
from tqdm import tqdm
from tslearn.clustering import TimeSeriesKMeans
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
# Project directory. "__file__" is a string literal here (not the module
# attribute), so realpath() resolves it against the current working directory
# and dirname() yields that directory.
current_dir = os.path.dirname(os.path.realpath("__file__"))

# Make the modules under ./preprocess and ./process importable.
preprocess_dir = os.path.abspath(os.path.join(current_dir, "preprocess"))
process_dir = os.path.abspath(os.path.join(current_dir, "process"))
sys.path.extend([preprocess_dir, process_dir])

# When True, the cells below read and write the "*_test.csv" files.
TEST = True


### TEST option
If you only want to see how the data process is being executed, I recommend setting TEST = True. The pipeline will only run with a subsample of the information and will execute every function much faster. If you are running the project directly from the repository you will only have access to this subsample of data, so this notebook will only run with TEST = True.
Read the preprocess.ipynb notebook to learn how the subset was generated and how to pass all the available information to create the production version models of this project.

In [None]:
# Display the resolved project directory as the cell output.
current_dir

In [None]:
# Import the preprocessing, series-completion and feature-engineering helpers
from preprocess_script import  calculate_sample_size, preprocess_data
from complete_series import create_weekly_date_dataframe, expand_time_series, create_seasonal_controls, impute_default
from feature_engineering import discard_uncompleted_windows, moving_average_variables



In [None]:
# Settings shared by the "Complete Series" and "Feature Engineering" stages.
date_column = "date"
aggregation_cols = ["year", "month", "week"]

# Lags and rolling-window sizes passed to the moving-average feature builder.
lag_list = [2, 4, 6, 10]
rolling_list = [2, 4, 6]

# Largest lag plus largest rolling window, plus one.
evaluation_window = 1 + max(rolling_list) + max(lag_list)


## Preprocess
### Preprocess Data
The preprocess script has its own Jupyter notebook where I made an exploratory analysis of the information. In the script version I only compile the preprocess functions that allow us to run the ML pipeline. In this preprocess a subsample of data is created and the observations are filtered by detecting outliers in commits and dates.

In [None]:
# Raw commit history lives under <project>/data/raw
raw_data_dir = os.path.join(current_dir, "data", "raw")
raw_data_path = os.path.join(raw_data_dir, "commit_history_raw.csv")

# Load it into a DataFrame
df_raw = pd.read_csv(raw_data_path)

In [None]:
# Size of the population used for the sample-size calculation.
# NOTE(review): despite the name, this counts distinct
# (repo_author_single, year, week_number) groups, not repositories — confirm
# this is what calculate_sample_size expects.
group_keys = ["repo_author_single", "year", "week_number"]
commits_per_group = df_raw.groupby(group_keys)["commit_count"].sum()
total_repositories = len(commits_per_group)
sample_size = calculate_sample_size(total_repositories)

# Preprocess the raw data (sample_size is not passed to this call).
df_bounded = preprocess_data(df_raw)



In [None]:
# Persist the full preprocessed subset.
output_path = os.path.join(current_dir, "data/preprocess/commit_history_subset.csv")
df_bounded.to_csv(output_path, index=False)

# Smaller sample for TEST runs: rows of the first 150 distinct repositories.
testing_sample = df_bounded.repo_name.unique()[:150]
df_testing = df_bounded[df_bounded.repo_name.isin(testing_sample)]

# logging renders records as `msg % args`, so every extra argument needs a
# matching %s placeholder; without it the value is never shown and a
# formatting error is reported when the record is emitted.
logging.info("\n-Dates:")
logging.info("Max commits per day: %s", df_testing.commit_count.max())
logging.info("Max date: %s", df_testing.date.max())
logging.info("Min date: %s", df_testing.date.min())
logging.info("Shape: %s", df_testing.shape)

testing_path = os.path.join(
    current_dir, "data/preprocess/commit_history_subset_test.csv"
)
# NOTE(review): unlike the full subset above, this file is written WITH the
# DataFrame index (no index=False), so it gains an extra unnamed column when
# read back. Left unchanged to keep the on-disk format — confirm intent.
df_testing.to_csv(testing_path)


### Complete Time Series
Raw data is not in a valid time-series format: we only report the amount of commits for the weeks in which commits were reported. However, we require that every repository reports at least 0 commits for all the weeks in the evaluation period. In other words, we need to impute 0 for all the non-reported weeks in the GH BigQuery database.

In [None]:
# Input: the preprocessed subset that matches the TEST switch.
subset_path = (
    "./data/preprocess/commit_history_subset_test.csv"
    if TEST
    else "./data/preprocess/commit_history_subset.csv"
)
df = pd.read_csv(subset_path)

group_id = ["repo_name"]

df_all = pd.DataFrame()
df[date_column] = pd.to_datetime(df[date_column])
# One row per repository (the first row of each group).
df_index = df.groupby(group_id).first().reset_index()

# The weekly calendar spans the full observed date range.
start_date, end_date = df[date_column].min(), df[date_column].max()

# week_start: choose between "sunday" or "monday"
df_dates_week = create_weekly_date_dataframe(start_date, end_date, week_start="sunday")

# Expand every repository over the weekly calendar, add the seasonal control
# columns, and fill the missing commit counts with 0.
df_expand = expand_time_series(df, date_column, df_index, df_dates_week)
df_expand = create_seasonal_controls(df_expand, date_column="date")
df_all_preproc = impute_default(df_expand, ["commit_count"], 0)

# Keep only the columns written to the expansion file.
output_columns = ["repo_name", "year", "commit_count", "date", "month", "week"]
df_all_preproc = df_all_preproc[output_columns]

expansion_path = (
    "./data/preprocess/commit_series_expansion_test.csv"
    if TEST
    else "./data/preprocess/commit_series_expansion.csv"
)
df_all_preproc.to_csv(expansion_path, index=False)


### Feature Engineering
As discussed in this script, this project only creates lag-dependent variables and moving averages. By providing a list of lags and evaluation windows, this block of code creates a new variable for each combination of the aforementioned list elements. These are the variables with the most predictive power in time series, and we can add control variables at the end of the ensemble models (when training the elasticNet model).

In [None]:

# Input: the expanded weekly series that matches the TEST switch.
expansion_path = (
    "./data/preprocess/commit_series_expansion_test.csv"
    if TEST
    else "./data/preprocess/commit_series_expansion.csv"
)
df = pd.read_csv(expansion_path)

# Two feature tables are returned: rolling-mean and exponentially-weighted.
df_window_mean, df_window_ewm = moving_average_variables(
    df, date_column, lag_list, rolling_list
)

# Attach both feature tables to the base series and order by repository/date.
merge_keys = [date_column, "repo_name"]
df_out = (
    df.merge(df_window_mean, on=merge_keys, how="inner")
    .merge(df_window_ewm, on=merge_keys, how="inner")
    .sort_values(["repo_name", date_column], ascending=True)
)
df_out[date_column] = pd.to_datetime(df_out[date_column])

# Drop the rows whose windows are not complete ("W" = weekly frequency).
df_out = discard_uncompleted_windows(df_out, evaluation_window, date_column, "W")

file_path = (
    "./data/preprocess/featureengineering_test.csv"
    if TEST
    else "./data/preprocess/featureengineering.csv"
)
df_out.to_csv(file_path, index=False)


## Process
### Hyperparameter Optimization
Now that the data is preprocessed we can use the new features 
and additional control variables to train three different models. Each model is trained with the same features.


In [None]:
from hyperparameter_optimization import  hyperparameter_optimization
from iterative_prediction import create_iterative_forecast


In [None]:
# From the previous script
lag_list = [2, 4, 6, 10]
rolling_list = [2, 4, 6]
date_column = "date"
evaluation_window = (
    max(lag_list) * 7 + max(rolling_list) * 7
)  # minimum data to run t
prediction_window = 7 * 12  # days_in_week*number_weeks
cut_date = pd.to_datetime("2021-12-26")



In [None]:

# Feature file that matches the TEST switch.
if TEST:
    file_path = "./data/preprocess/featureengineering_test.csv"
else:
    file_path = "./data/preprocess/featureengineering.csv"

df = pd.read_csv(file_path)
# Keep only the rows that have a target value.
df = df[~df["commit_count"].isna()]

# Data Preprocessing
df = prepare_data(df, date_column)  # 30 days

# The optimization is run from inside ./process. Restore the working directory
# even if it raises, otherwise every later cell that uses "./data/..." paths
# would resolve them against ./process.
os.chdir(current_dir)
os.chdir("./process")
try:
    hyperparameter_optimization(
        df=df,
        target="commit_count",
        prediction_window=prediction_window,
        evaluation_window=evaluation_window,
        cut_date=cut_date,
        date_column="date",
        xgboost=False,
        lgbm=False,
        randomforest=True,
    )
finally:
    os.chdir(current_dir)


#! NOTE: No data is saved here; this step only trains the models and saves them.

### Iterative prediction
After the models have been trained, we repeat the prediction process but this time using the best hyperparameters for each model. These predictions can be used to study and analyse each model separately (read the Regression Anlaysis.ipynb file to look at the results). Furthermore, each model prediction will become a new feature for the head model of the ensemble model, meaning that the elasticnet model will weight all the results and create a metaprediction with the feedback of previously trained models.

In [None]:
from iterative_prediction import create_iterative_forecast
retrain_models = True

# Feature columns fed to the models: one column per (kind, lag, roll)
# combination. All "ewm" columns come first, then all "rolling" columns;
# within each kind the lag varies slowest and the rolling window fastest.
model_features = [
    f"sum_repo_name_on_commit_count_with_roll{roll}_lag{lag}_{kind}"
    for kind in ("ewm", "rolling")
    for lag in (2, 4, 6, 10)
    for roll in (2, 4, 6)
]

In [None]:
parallel = False
prediction_window = 12  # weekly level; overridden below when prediction_contained
prediction_contained = True
retrain_models = True

lag_list = [2, 4, 6, 10]
rolling_list = [2, 4, 6]
cut_date = pd.to_datetime("2021-12-26")
evaluation_window = max(lag_list) + max(rolling_list) + 1
# ? Note (author): we stand at t='2021-12-26', so the iteration has to start
# early enough for the predictions to reach '2021-12-26'. At least
# evaluation_window observations are needed before forecast_start in order to
# create its predictions. The code advances one week at a time and
# recalculates the predictions using the previous predictions.

# Feature file that matches the TEST switch. It is used both to derive
# min_date below and as the model input inside the loop, so both always read
# the same file (the retrain branch used to hard-code the "_test" file).
if TEST:
    file_path = "./data/preprocess/featureengineering_test.csv"
else:
    file_path = "./data/preprocess/featureengineering.csv"

if prediction_contained:
    if retrain_models:
        # Back-test over two years, starting from the earliest available date.
        df_test = pd.read_csv(file_path)
        df_test["date"] = pd.to_datetime(df_test["date"])
        min_date = df_test.date.min()
        prediction_window = 104  # two years
        forecast_start = cut_date - timedelta(days=(prediction_window * 7))

    else:
        prediction_window = 12  # Three months
        forecast_start = cut_date - timedelta(days=(prediction_window * 7))
        min_date = forecast_start - timedelta(days=(53 * 7))

else:
    # Start from the last day and iterate over the future
    forecast_start = cut_date
    min_date = forecast_start - timedelta(days=(53 * 7))

for model in ["lgbm", "randomforest"]:
    start_time = time.time()

    df = pd.read_csv(file_path)
    df = df[~df["commit_count"].isna()]
    # Data Preprocessing
    df = prepare_data(df, "date")  # 30 days
    # NOTE(review): this re-read throws away the NaN filter and the
    # prepare_data() result computed just above, so the forecast receives the
    # unprocessed CSV. Kept as-is because the intended input cannot be
    # determined from this notebook — confirm and drop one of the two.
    df = pd.read_csv(file_path)

    # The forecast is run from inside ./process; always step back out, even on
    # failure, so the relative paths of the next iteration still resolve.
    os.chdir("./process")
    try:
        df_predicted_all = create_iterative_forecast(
            df,
            model=model,
            forecast_start=forecast_start,
            lag_list=lag_list,
            prediction_window=prediction_window,
            rolling_list=rolling_list,
            parallel=parallel,
            evaluation_window=evaluation_window,
            min_date=min_date,
            model_features=model_features,
        )
    finally:
        # Revert back to the original working directory
        os.chdir("..")
    end_time = time.time()
    elapsed_time = round(end_time - start_time)

    logging.info(f"Time taken for the operation: {elapsed_time} seconds")
    log_string = f"""
    MODEL: {model}
    PROCESSING TIME:  {elapsed_time}

    PARALLEL: {parallel}

    CORES: 32
    SUBPROCESS CORES: 6
    """
    # The context manager closes the log file even if the write fails.
    with open(f"logs_{model}_November.txt", "w") as text_file:
        text_file.write(log_string)

# df_predicted_all is saved by the create_iterative_forecast function at "../data/process/predictions_{model}.csv"

### Ensemble model
Finally, all the model predictions are joined into a single DataFrame that feeds the elasticNet model. Additionally, new variables are created to leverage the results of previous models:
* **Forecast Horizon**: Indicator variables that tell the elastic net how many weeks ahead of the last reported week the next wave of predictions is.
* **TimeSeries Cluster**:  A cluster algorithm that separates different trends of commit count series. Each repository is assigned to one cluster out of 8 different clusters. For example, cluster_0 represents all the repositories that reported 0 in most of their commit history.
* **Month Control variables**:  These dummies assign 1 or 0 depending on the month of the weekly reported commits. There are 11 dummies, each one representing a different month.

In [None]:
# Run switches for the ensemble stage.
retrain_models = True
prediction_contained = True

# Dates and column names.
date_column = "date"
cut_date = pd.to_datetime("2021-12-26")
prediction_start = pd.to_datetime("2022-09-01")

# Window sizes.
prediction_window = 12
lag_list = [2, 4, 6, 10]
rolling_list = [2, 4, 6]
evaluation_window = 1 + max(rolling_list) + max(lag_list)

In [None]:
from forecast_ensemble import  prepare_all_models, create_month_dummy, test_uniqueness_branch_date, create_cluster, cluster_indicator, create_forecast_horizon, train_elasticnet_ensemble_model


In [None]:
# Work from the project directory so the relative data paths below resolve.
os.chdir(current_dir)

if prediction_contained:
    # The prediction window ends at cut_date.
    prediction_start = cut_date - timedelta(days=(prediction_window * 7))
    prediction_end = cut_date
else:
    # Start from the last day and iterate over the future
    prediction_start = cut_date
    prediction_end = cut_date + timedelta(
        days=(prediction_window * 7)
    )  # Prediction of the future
# Same formula in both modes: evaluation_window weeks before the first prediction.
min_date = prediction_start - timedelta(days=(evaluation_window * 7))

if TEST:
    all_entries_path = "./data/preprocess/featureengineering_test.csv"
else:
    all_entries_path = "./data/preprocess/featureengineering.csv"

df_all = pd.read_csv(all_entries_path)

# The ensemble helpers are run from inside ./process. Restore the working
# directory even if one of them raises, so the notebook is not left there.
os.chdir("./process")
try:
    df_forecast, df_models = prepare_all_models(retrain_models, cut_date=cut_date)
    df_forecast_month = create_month_dummy(df_models, retrain_models=retrain_models)
    test_uniqueness_branch_date(df_models)
    df_forecast_horizon = create_forecast_horizon(df_models.copy())

    # One target row per (Repository, Date).
    df_target = df_forecast[["Repository", "Date", "Commit Real"]].copy()
    df_target = (
        df_target.groupby(["Repository", "Date"])["Commit Real"].first().reset_index()
    )

    # Cluster the commit series and build per-repository cluster indicators.
    predicted_labels = create_cluster(
        df=df_all, n_cluster=8, retrain_models=retrain_models, target="commit_count"
    )
    df_clusters = cluster_indicator(df_all, predicted_labels, index_col="repo_name")

    df_lr = df_models.merge(df_clusters, on="Repository", how="left")
    test_uniqueness_branch_date(df_lr)
    test_uniqueness_branch_date(df_forecast_month)

    # Assemble the ensemble input: model predictions + clusters, month
    # dummies, forecast horizon, and finally the target.
    join_keys = ["Repository", "Date"]
    df_model_input = df_lr.merge(df_forecast_month, on=join_keys, how="inner")
    df_model_input = df_model_input.merge(
        df_forecast_horizon, on=join_keys, how="inner"
    )
    df_model_input = df_target.merge(df_model_input, on=join_keys, how="inner")
    test_uniqueness_branch_date(df_model_input)

    model, df_results = train_elasticnet_ensemble_model(
        df_model_input,
        target="Commit Real",
        retrain_models=retrain_models,
        load_model=True,
    )
finally:
    os.chdir(current_dir)

df_results["model_family"] = "elasticnet"
date_prediction_cut = pd.to_datetime("2023-08-01")
df_forecast = pd.concat(
    [df_forecast, df_results], ignore_index=True
)  # Check columns

# NOTE(review): path_out is defined but never used; the file is actually
# written to "final_predictions.csv" in the project directory. The write
# location is left unchanged because downstream readers are not visible from
# here — confirm which location is intended.
path_out = "../data/process/final_predictions.csv"
df_forecast.to_csv("final_predictions.csv", index=False)

