# In [1]:
import joblib
import pandas as pd
import numpy as np
from datetime import timedelta
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import randint as sp_randint
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error

import warnings
warnings.filterwarnings("ignore")

def previous_day(date, dayOfWeek):
    """
    Return the most recent date, on or before ``date``, that falls on ``dayOfWeek``.

    Parameters:
        date: Object exposing ``weekday()`` (Monday == 0) and supporting
            subtraction of a ``timedelta``, e.g. ``datetime.date``.
        dayOfWeek (str): English weekday name, matched case-insensitively.

    Returns:
        ``date`` moved back by 0-6 days; ``date`` itself is returned when it
        already falls on the requested weekday.

    Raises:
        KeyError: If ``dayOfWeek`` is not a full English weekday name.
    """
    weekday_names = (
        'monday',
        'tuesday',
        'wednesday',
        'thursday',
        'friday',
        'saturday',
        'sunday',
    )
    weekday_index = {name: position for position, name in enumerate(weekday_names)}

    current = date.weekday()
    target = weekday_index[dayOfWeek.lower()]

    # Python's modulo is always non-negative for a positive divisor, so this
    # yields the number of days to step back (0 when already on the target).
    days_back = (current - target) % 7

    return date - timedelta(days=days_back)

def timestamp_to_date(date_col):
    """
    Convert a Unix timestamp expressed in milliseconds to a calendar date.

    Parameters:
        date_col (int | float): Milliseconds since the Unix epoch. Despite the
            name, this must be a single numeric value, not a whole column
            (presumably the function is applied element-wise by the caller).

    Returns:
        datetime.date: The date of that instant in the machine's local
        timezone (``datetime.fromtimestamp`` is called without ``tz``).
    """
    # Only ``timedelta`` is imported from ``datetime`` at module level, so
    # bring the ``datetime`` class into scope here to avoid a NameError.
    from datetime import datetime

    # ``fromtimestamp`` expects seconds; the input is in milliseconds.
    return datetime.fromtimestamp(date_col / 1000).date()

def read_csv(path):
    """
    Load a CSV into a pandas DataFrame using pandas' default parsing options.

    Parameters:
        path: Anything accepted by ``pd.read_csv`` as its first argument
            (file path, URL, or file-like object).

    Returns:
        pd.DataFrame: The parsed contents of the CSV.
    """
    frame = pd.read_csv(path)
    return frame


def expand_by_week(df, column_start, column_end, col_index_out):
    """
    Expand each input row into one output row per calendar day in its date range.

    Note: despite the function name, the expansion frequency is daily
    (``freq='D'``); every date from the start date to the end date, inclusive,
    produces one output row.

    Parameters:
        df (pd.DataFrame): Input DataFrame. Must contain ``column_start``,
            ``column_end``, ``'outlet_code'`` and ``'item_department'``.
        column_start (str): Column name for the start date.
        column_end (str): Column name for the end date.
        col_index_out (str): Name for the expanded date column in the output.

    Returns:
        pd.DataFrame: Columns ``[col_index_out, 'outlet_code',
        'item_department']`` with a fresh 0..n-1 index. Rows whose start date
        is after their end date contribute no output rows. An input with no
        rows yields an empty DataFrame with those same columns.
    """
    # pd.concat raises ValueError on an empty list, so short-circuit when
    # there is nothing to expand and still return the expected columns.
    if len(df) == 0:
        return pd.DataFrame({
            col_index_out: pd.Series(dtype='datetime64[ns]'),
            'outlet_code': pd.Series(dtype=object),
            'item_department': pd.Series(dtype=object),
        })

    # Iterate the four needed columns in lockstep; this avoids the per-row
    # Series construction that DataFrame.iterrows performs.
    row_values = zip(
        df[column_start],
        df[column_end],
        df['outlet_code'],
        df['item_department'],
    )

    # One small frame per input row; scalar outlet/department values are
    # broadcast across that row's generated dates.
    expanded_rows_list = [
        pd.DataFrame({
            col_index_out: pd.date_range(start=start_date, end=end_date, freq='D'),
            'outlet_code': outlet_code,
            'item_department': item_department,
        })
        for start_date, end_date, outlet_code, item_department in row_values
    ]

    # Concatenate all expanded frames in one step.
    return pd.concat(expanded_rows_list, ignore_index=True)