In [40]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error
import warnings
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
#from pmdarima import auto_arima
# Ignore SettingWithCopyWarning
warnings.filterwarnings("ignore")
# Assuming you have a hierarchical DataFrame named 'sales_data' with columns 'Foodcourt', 'Restaurant', 'Item'
df = pd.read_excel("meals_std_2.xlsx")
df = df.reset_index(drop = True)
df = df[["foodcourt","restaurant","restaurantmenuitem","date","total_count"]]
len(df)

In [None]:
df = df[df["total_count"] != 0]
len(df)

51851

In [None]:
# pip install xgboost
# pip install openpyxl
# #pip install openpyxl
#python -m pip install prophet



In [None]:
# grouping the FRI (Food court- Restaurant- Item Combination) to create a input
def group_fri(df):
    grouped_df = df.groupby(['foodcourt', 'restaurant', 'restaurantmenuitem', 'date']).agg({'total_count': 'sum'}).reset_index()
    return grouped_df

# Example usage:
# Assuming df is your DataFrame
df = group_fri(df)
len(df)
#df.to_csv("group.csv")

51851

In [None]:
def df_summary(df, train_end, num_months):
    # Convert end_date to datetime object
    end_date = datetime.strptime(train_end, '%Y-%m-%d')
    
    # Calculate start date based on end date and number of months
    start_date = end_date - timedelta(days=30*num_months) + timedelta(days=1)  # Adjusting start_date based on num_months
    
    # Filter dataframe based on date range
    filtered_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]
    filtered_df["period"] = num_months
    filtered_df['fri'] = filtered_df['foodcourt'] + '_' + filtered_df['restaurant'] + '_' + filtered_df['restaurantmenuitem']
    # Group by period and sum total counts
    grouped_df = filtered_df.groupby('period').agg({'total_count': 'sum','foodcourt': 'nunique','restaurant': 'nunique','restaurantmenuitem': 'nunique','fri': 'nunique'}).reset_index()
    grouped_df = grouped_df[["period","foodcourt","restaurant","restaurantmenuitem","fri","total_count"]]
    return grouped_df

## Output

train_end = '2024-02-11'
num_months =  6 # Change this to 1, 2, or 6 as needed
#summary_df = df_summary(df, train_end, num_months)

summary_df_1 = df_summary(df, train_end, 1)
summary_df_2 = df_summary(df, train_end, 2)
summary_df_6 = df_summary(df, train_end, 6)
summary_df =  pd.concat([summary_df_1,summary_df_2,summary_df_6])
summary_df

Unnamed: 0,period,foodcourt,restaurant,restaurantmenuitem,fri,total_count
0,1,85,194,953,965,311937
0,2,85,194,953,965,565466
0,6,85,194,953,965,690718


In [None]:
def rank_dataframe2(df, train_date, num_months):
    # Convert date to datetime object
    end_date = datetime.strptime(train_date, '%Y-%m-%d')
    
    # Calculate start date based on end date and number of months
    start_date = end_date - timedelta(days=30*num_months) + timedelta(days=1)  # Adjusting start_date based on num_months
    
    # Filter dataframe based on date range
    filtered_df = df[(df['date'] >= start_date) & (df['date'] <= date)]
    grouped_df = filtered_df.groupby(['foodcourt', 'restaurant', 'restaurantmenuitem']).agg({'total_count': 'mean'}).reset_index()
    # Rename 'total_count' column to 'average_count'
    grouped_df.rename(columns={'total_count': 'average_sales'}, inplace=True)
    # Sort by average sales and assign ranks
    grouped_df['rank'] = grouped_df['average_sales'].rank(ascending=False).astype(int)
        
    # Create sales_segment based on total_count ranges
    def sales_segment(row):
        count = row['average_sales']
        if count > 100:
            return '>100'
        elif 75 <= count <= 100:
            return '75-100'
        elif 50 <= count < 75:
            return '50-75'
        elif 25 <= count < 50:
            return '25-50'
        elif 10 <= count < 25:
            return '10-25'
        else:
            return '<10'
    # Create rank_segment based on rank ranges
    def rank_segment(row):
        count = row['rank']
        if count <= 100:
            return '1-100'
        elif 100 < count <= 200:
            return '100-200'
        elif 200 < count <= 300:
            return '200-300'
        elif 300 < count <= 400:
            return '300-400'
        elif 400 < count <= 500:
            return '400-500'
        else:
            return '>500'
    
    grouped_df['sales_segment'] = grouped_df.apply(sales_segment, axis=1)
    grouped_df['rank_segment'] = grouped_df.apply(rank_segment, axis=1)
    
    return grouped_df

date = '2024-02-11'
num_months = 1  # Change this to 1, 2, or 6 as needed
rank_dataframe = rank_dataframe2(df, date, num_months)
print(len(rank_dataframe))
rank_dataframe.sort_values("average_sales",ascending=False).head(5)

965


Unnamed: 0,foodcourt,restaurant,restaurantmenuitem,average_sales,rank,sales_segment,rank_segment
629,63c1178d2381e7001bf38f10,63c3cd051a491b0017062b2b,649fbf3a061c49001bb67a08,517.666667,1,>100,1-100
635,63c1178d2381e7001bf38f10,63c3cd051a491b0017062b2b,649fbf3a061c49001bb67a1d,447.75,2,>100,1-100
631,63c1178d2381e7001bf38f10,63c3cd051a491b0017062b2b,649fbf3a061c49001bb67a0f,422.0,3,>100,1-100
627,63c1178d2381e7001bf38f10,63c3cd051a491b0017062b2b,649fbf3a061c49001bb67a06,358.666667,4,>100,1-100
577,63787a9e7c7d392939c21787,6476fa8c4535460018ab604e,647b154c9a688000173314ca,329.0,5,>100,1-100


In [None]:
def rank_pivot(df):
    # Pivot the DataFrame on 'rank_segment' and 'sales_segment' and take counts
    pivot_df = df.pivot_table(index='rank_segment', columns='sales_segment', values='rank', aggfunc='count', fill_value=0)
    
    # Reset the index of the pivoted DataFrame
    pivot_df = pivot_df.reset_index()
    
    # Reorder the columns as per the specified order
    pivot_df = pivot_df[["rank_segment", ">100", "75-100", "50-75", "25-50", "10-25", "<10"]]
    
    return pivot_df
pivot_summary = rank_pivot(rank_dataframe)
pivot_summary

sales_segment,rank_segment,>100,75-100,50-75,25-50,10-25,<10
0,1-100,55,39,6,0,0,0
1,100-200,0,0,59,41,0,0
2,200-300,0,0,0,100,0,0
3,300-400,0,0,0,22,78,0
4,400-500,0,0,0,0,100,0
5,>500,0,0,0,0,115,350


In [None]:
rank_dataframe.to_csv("rank_df.csv")


In [None]:
def active_sku(df, train_date, num_days):
    # Convert date to datetime object
    end_date = datetime.strptime(train_date, '%Y-%m-%d')
    
    # Calculate start date based on end date and number of days
    start_date = end_date - timedelta(days=num_days) + timedelta(days=1)  
    
    # Filter dataframe based on date range
    filtered_df = df[(df['date'] >= start_date) & (df['date'] <= end_date)]
    grouped_df = filtered_df.groupby(['foodcourt', 'restaurant', 'restaurantmenuitem']).agg({'total_count': 'sum'}).reset_index()
    active_data = grouped_df[grouped_df["total_count"] >=20 ]
    active_data['sku_concat'] = active_data['foodcourt'] + '-' + active_data['restaurant'] + '-' + active_data['restaurantmenuitem']
    return active_data

train_date = '2024-02-11'
num_days = 14
active_dataframe = active_sku(df, train_date, num_days)
#sku_list = df2["sku_concat"].unique()

In [None]:
#Filters the dataframe on the basis of the active sku
def filter_by_sku_list(df, active_dataframe):
    sku_list = active_dataframe["sku_concat"].unique()
    df['sku_concat'] = df['foodcourt'] + '-' + df['restaurant'] + '-' + df['restaurantmenuitem']
    filtered_data = df[df['sku_concat'].isin(sku_list)]
    filtered_data = filtered_data.drop(['sku_concat'],axis = 1)
    return filtered_data
base_data = filter_by_sku_list(df, active_dataframe)
len(base_data)

51851

In [None]:
min(base_data.date)

Timestamp('2023-12-01 00:00:00')

In [None]:
base_data.head()
rank_dataframe_b = rank_dataframe2(base_data, date, num_months)
pivot_summary_b = rank_pivot(rank_dataframe_b)
pivot_summary_b

sales_segment,rank_segment,>100,75-100,50-75,25-50,10-25,<10
0,1-100,55,39,6,0,0,0
1,100-200,0,0,59,41,0,0
2,200-300,0,0,0,100,0,0
3,300-400,0,0,0,22,78,0
4,400-500,0,0,0,0,100,0
5,>500,0,0,0,0,115,350


In [None]:
## Functions
# add missing date for each date range combination
def add_missing_dates(start_date, end_date, df):
    # Create a date range between start_date and end_date
    date_range = pd.date_range(start=start_date, end=end_date, freq='D')
    
    # Create a DataFrame from the date range
    date_df = pd.DataFrame({'date': date_range})
    
    # Merge item_data with date_df to ensure all data is present
    merged_df = pd.merge(date_df, df, on='date', how='left')
    merged_df['foodcourt'].fillna(method='ffill', inplace=True)
    merged_df['restaurant'].fillna(method='ffill', inplace=True)
    merged_df['restaurantmenuitem'].fillna(method='ffill', inplace=True)
    merged_df['foodcourt'].fillna(method='bfill', inplace=True)
    merged_df['restaurant'].fillna(method='bfill', inplace=True)
    merged_df['restaurantmenuitem'].fillna(method='bfill', inplace=True)
    merged_df['total_count'].fillna(0, inplace=True)
    
    # Fill missing values with 0s
    #merged_df.fillna(0, inplace=True)
    
    return merged_df

## MAPE function creation
data = {
    'actual': [10, 20, 29, 0, 0],
    'predicted': [12, 18, 28, 0, 88]
}

# Create DataFrame
data = pd.DataFrame(data)

def acc_calculation(df,actual='actual',predicted='predicted'):
    # Add 'error' column to DataFrame
    df['error'] = abs(df[actual] - df[predicted]) / df[actual]
    
    # Handle special cases where actual is zero
    df.loc[(df[actual] == 0) & (df[predicted] == 0), 'error'] = 0
    df.loc[(df[actual] == 0) & (df[predicted] != 0), 'error'] = 1
    df['error'] = df['error'].clip(upper=1)

    df['MAPE'] = df['error'].mean()    
    df['Accuracy'] = 1- df['MAPE'] 
    return df
data = acc_calculation(data)
data

#Model

### XGB Model

def run_xgboost(item_data, train_date, test_date):
    # Order the data based on date
    item_data = item_data.sort_values(by='date')
    
    # Split the data into train and test based on train_date and test_date
    train_data = item_data[item_data['date'] <= train_date]
    test_data = item_data[(item_data['date'] > train_date) & (item_data['date'] <= test_date)]
    
    # Check if the test dataset is empty
    if test_data.empty:
        raise ValueError("Test dataset is empty. Please check your input data.")
    
    # Extract target variable
    y_train = train_data['total_count']
    y_test = test_data['total_count']
    
    # Train the XGBoost model
    model = xgb.XGBRegressor()
    X_train = train_data['date'].dt.dayofweek.values.reshape(-1, 1)  # Use day of week as feature
    model.fit(X_train, y_train)
    
    # Generate forecasts for the test data
    X_test = test_data['date'].dt.dayofweek.values.reshape(-1, 1)  # Use day of week as feature
    forecast = model.predict(X_test)
    forecast = np.floor(forecast)
    forecast[test_data['date'].dt.dayofweek.isin([5, 6])] = 0
    
    # Calculate R2 score
    r2 = r2_score(y_test, forecast)
    
    # Calculate MAPE
    mape = mean_absolute_percentage_error(y_test, forecast)
    accuracy_df = pd.DataFrame({
        'actual': y_test,
        'predicted': forecast
    })
    accuracy_df = acc_calculation(accuracy_df, actual='actual', predicted='predicted')
    accuracy = accuracy_df["Accuracy"]
    
    # Create output DataFrame
    output_df = pd.DataFrame({
        'date': test_data['date'],
        'foodcourt': test_data['foodcourt'],
        'restaurant': test_data['restaurant'],
        'restaurantmenuitem': test_data['restaurantmenuitem'],
        'actual': y_test,
        'forecast_xgb_norm': forecast,
        'r2': r2,
        'mape': mape,
        'model': "xgboost-norm",
        "accuracy_xgb_norm":accuracy
    })
    
    return output_df


def run_xgboost_lag(item_data, train_date, test_date):
    # Order the data based on date
    item_data = item_data.sort_values(by='date')
    
    # Create day_of_week feature
    item_data['day_of_week'] = item_data['date'].dt.dayofweek
    
    # Add lag_7 feature
    item_data['lag_7'] = item_data['total_count'].shift(7)
    #item_data  = item_data.fillna(0)
    item_data.reset_index(drop=True, inplace=True)
    # Split the data into train and test based on train_date and test_date
    train_data = item_data[item_data['date'] <= train_date]
    test_data = item_data[(item_data['date'] > train_date) & (item_data['date'] <= test_date)]
    
    # Check if the test dataset is empty
    if test_data.empty:
        raise ValueError("Test dataset is empty. Please check your input data.")
    
    # Extract features and target variables
    X_train, y_train = train_data.drop(columns=['total_count',"foodcourt","restaurant","restaurantmenuitem","date"]), train_data['total_count']
    X_test, y_test = test_data.drop(columns=['total_count',"foodcourt","restaurant","restaurantmenuitem","date"]), test_data['total_count']

    # Train the XGBoost model
    model = xgb.XGBRegressor()  # Set enable_categorical to True
    model.fit(X_train, y_train)
    
    # Generate forecasts for the test data
    forecast = model.predict(X_test)
    forecast = np.floor(forecast)
    forecast[test_data['date'].dt.dayofweek.isin([5, 6])] = 0
    
    # Calculate R2 score
    r2 = r2_score(test_data['total_count'], forecast)
    
    # Calculate MAPE
    mape = mean_absolute_percentage_error(test_data['total_count'], forecast)
    
    accuracy_df = pd.DataFrame({
        'actual': y_test,
        'predicted': forecast
    })
    accuracy_df = acc_calculation(accuracy_df, actual='actual', predicted='predicted')
    accuracy = accuracy_df["Accuracy"]
    
    # Create output DataFrame
    output_df = pd.DataFrame({
        'foodcourt': test_data['foodcourt'],
        'restaurant': test_data['restaurant'],
        'restaurantmenuitem': test_data['restaurantmenuitem'],
        'date': test_data['date'],
        'actual': y_test,
        'forecast_xgb_lag': forecast,
        'r2': r2,
        'mape': mape,
        "accuracy_xgb_lag":accuracy,
        'model': "xgboost-lag",
    })
    
    return output_df



def run_rf_lag(item_data, train_date, test_date):
    # Order the data based on date
    item_data = item_data.sort_values(by='date')
    
    # Create day_of_week feature
    item_data['day_of_week'] = item_data['date'].dt.dayofweek
    
    # Add lag_7 feature
    item_data['lag_7'] = item_data['total_count'].shift(7)
    item_data = item_data.fillna(0)
    item_data.reset_index(drop=True, inplace=True)
    
    # Split the data into train and test based on train_date and test_date
    train_data = item_data[item_data['date'] <= train_date]
    test_data = item_data[(item_data['date'] > train_date) & (item_data['date'] <= test_date)]
    
    # Check if the test dataset is empty
    if test_data.empty:
        raise ValueError("Test dataset is empty. Please check your input data.")
    
    # Extract features and target variables
    X_train, y_train = train_data.drop(columns=['total_count', "foodcourt", "restaurant", "restaurantmenuitem", "date"]), train_data['total_count']
    X_test, y_test = test_data.drop(columns=['total_count', "foodcourt", "restaurant", "restaurantmenuitem", "date"]), test_data['total_count']

    # Train the RandomForest model
    model = RandomForestRegressor()
    model.fit(X_train, y_train)
    
    # Generate forecasts for the test data
    forecast = model.predict(X_test)
    forecast = np.floor(forecast)
    forecast[test_data['date'].dt.dayofweek.isin([5, 6])] = 0
    
    # Calculate R2 score
    r2 = r2_score(test_data['total_count'], forecast)
    
    # Calculate MAPE
    mape = mean_absolute_percentage_error(test_data['total_count'], forecast)
    
    accuracy_df = pd.DataFrame({
        'actual': y_test,
        'predicted': forecast
    })
    accuracy_df = acc_calculation(accuracy_df, actual='actual', predicted='predicted')
    accuracy = accuracy_df["Accuracy"]
    
    # Create output DataFrame
    output_df = pd.DataFrame({
        'foodcourt': test_data['foodcourt'],
        'restaurant': test_data['restaurant'],
        'restaurantmenuitem': test_data['restaurantmenuitem'],
        'date': test_data['date'],
        'actual': y_test,
        'forecast_rf_lag': forecast,
        'r2': r2,
        'mape': mape,
        "accuracy_rf_lag": accuracy,
        'model': "randomforest-lag",
    })
    
    return output_df

# LightGBM


def run_lgb_lag(item_data, train_date, test_date):
    # Order the data based on date
    item_data = item_data.sort_values(by='date')
    
    # Create day_of_week feature
    item_data['day_of_week'] = item_data['date'].dt.dayofweek
    
    # Add lag_7 feature
    item_data['lag_7'] = item_data['total_count'].shift(7)
    item_data = item_data.fillna(0)
    item_data.reset_index(drop=True, inplace=True)
    
    # Split the data into train and test based on train_date and test_date
    train_data = item_data[item_data['date'] <= train_date]
    test_data = item_data[(item_data['date'] > train_date) & (item_data['date'] <= test_date)]
    
    # Check if the test dataset is empty
    if test_data.empty:
        raise ValueError("Test dataset is empty. Please check your input data.")
    
    # Extract features and target variables
    X_train, y_train = train_data.drop(columns=['total_count', "foodcourt", "restaurant", "restaurantmenuitem", "date"]), train_data['total_count']
    X_test, y_test = test_data.drop(columns=['total_count', "foodcourt", "restaurant", "restaurantmenuitem", "date"]), test_data['total_count']

    # Train the RandomForest model
    model = lgb.LGBMRegressor()
    model.fit(X_train, y_train)
    
    # Generate forecasts for the test data
    forecast = model.predict(X_test)
    forecast = np.floor(forecast)
    forecast[test_data['date'].dt.dayofweek.isin([5, 6])] = 0
    
    # Calculate R2 score
    r2 = r2_score(test_data['total_count'], forecast)
    
    # Calculate MAPE
    mape = mean_absolute_percentage_error(test_data['total_count'], forecast)
    
    accuracy_df = pd.DataFrame({
        'actual': y_test,
        'predicted': forecast
    })
    accuracy_df = acc_calculation(accuracy_df, actual='actual', predicted='predicted')
    accuracy = accuracy_df["Accuracy"]
    
    # Create output DataFrame
    output_df = pd.DataFrame({
        'foodcourt': test_data['foodcourt'],
        'restaurant': test_data['restaurant'],
        'restaurantmenuitem': test_data['restaurantmenuitem'],
        'date': test_data['date'],
        'actual': y_test,
        'forecast_rf_lag': forecast,
        'r2': r2,
        'mape': mape,
        "accuracy_lgb_lag": accuracy,
        'model': "lgb_lag",
    })
    
    return output_df




from prophet import Prophet


def run_prophet_lag2(item_data, train_date, test_date):
    # Order the data based on date
    item_data = item_data.sort_values(by='date')
    
    # Create day_of_week feature
    item_data['day_of_week'] = item_data['date'].dt.dayofweek
    
    # Add lag_7 feature
    item_data['lag_7'] = item_data['total_count'].shift(7)
    #item_data  = item_data.fillna(0)
    item_data.reset_index(drop=True, inplace=True)
    # Split the data into train and test based on train_date and test_date
    train_data = item_data[item_data['date'] <= train_date]
    test_data = item_data[(item_data['date'] > train_date) & (item_data['date'] <= test_date)]
    
    # Check if the test dataset is empty
    if test_data.empty:
        raise ValueError("Test dataset is empty. Please check your input data.")
    
    # Prepare DataFrame for Prophet
    prophet_train_data = train_data[['date', 'total_count']].rename(columns={'date': 'ds', 'total_count': 'y'})
    
    # Fit Prophet model
    model = Prophet()
    model.fit(prophet_train_data)
    
    # Make future DataFrame for Prophet
    future = model.make_future_dataframe(periods=len(test_data))
    
    # Add lag_7 feature
    #future['lag_7'] = item_data['total_count'].shift(7)[-len(test_data):].values
    
    # Predict with Prophet
    forecast = model.predict(future)
    
    # Extract forecasted values for the test period
    forecast_test = forecast.iloc[-len(test_data):]
    forecast_test['yhat'] = forecast_test['yhat'].apply(lambda x: max(0, int(np.floor(x))))
    forecast_test.loc[forecast_test['ds'].dt.dayofweek.isin([5, 6]), 'yhat'] = 0
    
    # Create output DataFrame
    output_df = pd.DataFrame({
        'foodcourt': test_data['foodcourt'],
        'restaurant': test_data['restaurant'],
        'restaurantmenuitem': test_data['restaurantmenuitem'],
        'date': test_data['date'],
        'forecast': forecast_test['yhat'],
        'r2': r2_score(test_data['total_count'], forecast_test['yhat']),
        #'mape': mean_absolute_percentage_error(test_data['total_count'], forecast_test['yhat']),
        'actual': test_data['total_count']
    })
    #output_df = acc_calculation(output_df, actual='total_count', predicted='forecast')
    output_df = acc_calculation(output_df, actual='actual', predicted='forecast')
    output_df = output_df.rename(columns={'Accuracy': 'accuracy_prophet'})
    return output_df
import tqdm as notebook_tqdm
import pandas as pd
import numpy as np
#from fbprophet import Prophet
from sklearn.metrics import r2_score

def run_prophet_lag3(item_data, train_date, test_date):
    # Order the data based on date
    item_data = item_data.sort_values(by='date')
    
    # Create day_of_week feature
    item_data['day_of_week'] = item_data['date'].dt.dayofweek
    
    # Add lag_7 feature
    item_data['lag_7'] = item_data['total_count'].shift(7)
    item_data['lag_7'] = item_data['lag_7'].fillna(0)  # Fill missing values with zeros
    item_data.reset_index(drop=True, inplace=True)
    
    # Split the data into train and test based on train_date and test_date
    train_data = item_data[item_data['date'] <= train_date]
    test_data = item_data[(item_data['date'] > train_date) & (item_data['date'] <= test_date)]
    
    # Check if the test dataset is empty
    if test_data.empty:
        raise ValueError("Test dataset is empty. Please check your input data.")
    
    # Prepare DataFrame for Prophet
    prophet_train_data = train_data[['date', 'total_count', 'day_of_week', 'lag_7']].rename(columns={'date': 'ds', 'total_count': 'y'})
    
    # Fit Prophet model
    model = Prophet()
    model.add_regressor('day_of_week')
    model.add_regressor('lag_7')
    model.fit(prophet_train_data)
    
    # Make future DataFrame for Prophet
    future = model.make_future_dataframe(periods=len(test_data))
    future['day_of_week'] = future['ds'].dt.dayofweek
    future['lag_7'] = item_data['total_count'].shift(7)[-len(future):].fillna(0).values
    
    # Predict with Prophet
    forecast = model.predict(future)
    
    # Extract forecasted values for the test period
    forecast_test = forecast.iloc[-len(test_data):]
    forecast_test['yhat'] = forecast_test['yhat'].apply(lambda x: max(0, int(np.floor(x))))
    forecast_test.loc[forecast_test['ds'].dt.dayofweek.isin([5, 6]), 'yhat'] = 0
    
    # Create output DataFrame
    output_df = pd.DataFrame({
        'foodcourt': test_data['foodcourt'],
        'restaurant': test_data['restaurant'],
        'restaurantmenuitem': test_data['restaurantmenuitem'],
        'date': test_data['date'],
        'forecast': forecast_test['yhat'],
        'r2': r2_score(test_data['total_count'], forecast_test['yhat']),
        'actual': test_data['total_count']
    })
    #output_df = acc_calculation(output_df, actual='total_count', predicted='forecast')
    output_df = acc_calculation(output_df, actual='actual', predicted='forecast')
    output_df = output_df.rename(columns={'Accuracy': 'accuracy_prophet_multi'})
    return output_df

def time_series_forecasting_auto_arima(item_data, train_date, test_date, max_p=7, max_q=7):
    # Order the data based on date
    item_data = item_data.sort_values(by='date')
    
    # Create lag features
    item_data['lag_7'] = item_data['total_count'].shift(7)
    item_data['day_of_week'] = item_data['date'].dt.dayofweek
    
    # Drop rows with NaN values created by lag
    item_data = item_data.dropna()
    
    # Split the data into train and test based on train_date and test_date
    train_data = item_data[item_data['date'] <= train_date]
    test_data = item_data[(item_data['date'] > train_date) & (item_data['date'] <= test_date)]
    
    # Check if the test dataset is empty
    if test_data.empty:
        raise ValueError("Test dataset is empty. Please check your input data.")
    
    # Prepare the exogenous variables (lag_7 and day_of_week)
    exog_train = train_data[['lag_7', 'day_of_week']]
    exog_test = test_data[['lag_7', 'day_of_week']]
    
    # Fit auto ARIMA model
    arima_model = auto_arima(train_data['total_count'], seasonal=False, max_p=7, max_q=7, stepwise=False,
                             exogenous=exog_train, trace=True, error_action='ignore', suppress_warnings=True)
    
    # Forecast for the test period
    forecast, conf_int = arima_model.predict(n_periods=len(test_data), exogenous=exog_test, return_conf_int=True)
    
    # Set predictions for Saturday (5) and Sunday (6) to zero
    forecast = pd.Series(forecast, index=test_data.index)
    forecast[test_data['date'].dt.dayofweek.isin([5, 6])] = 0
    
    # Calculate R2 score
    #r2 = r2_score(test_data['total_count'], forecast)
    
    # Calculate MAPE
    #mape = mean_absolute_percentage_error(test_data['total_count'], forecast)
    
    # Create accuracy DataFrame and calculate accuracy
    accuracy_df = pd.DataFrame({
        'actual': test_data['total_count'],
        'predicted': forecast
    })
    accuracy_df = acc_calculation(accuracy_df, actual='actual', predicted='predicted')
    accuracy = accuracy_df["Accuracy"]
    
    # Create output DataFrame
    output_df = pd.DataFrame({
        'date': test_data['date'],
        'foodcourt': test_data['foodcourt'],
        'restaurant': test_data['restaurant'],
        'restaurantmenuitem': test_data['restaurantmenuitem'],
        'actual': test_data['total_count'],
        'forecast_arima': forecast,
        #'r2': r2,
        #'mape': mape,
        'model': "auto-arima",
        "accuracy_arima": accuracy
    })
    
    return output_df




In [None]:
train_start= '2024-01-15'
train_end= '2024-02-25'
test_end= '2024-03-03'

In [None]:
base_data

Unnamed: 0,foodcourt,restaurant,restaurantmenuitem,date,total_count
0,5b964b50a6fb5422abb2fda7,5b964a709153de1766fcf94c,65774ecf4fb51d001be77c5b,2023-12-12,140
1,5b964b50a6fb5422abb2fda7,5b964a709153de1766fcf94c,65774ecf4fb51d001be77c5b,2023-12-13,258
2,5b964b50a6fb5422abb2fda7,5b964a709153de1766fcf94c,65774ecf4fb51d001be77c5b,2023-12-14,174
3,5b964b50a6fb5422abb2fda7,5b964a709153de1766fcf94c,65774ecf4fb51d001be77c5b,2023-12-15,85
4,5b964b50a6fb5422abb2fda7,5b964a709153de1766fcf94c,65774ecf4fb51d001be77c5b,2023-12-16,36
...,...,...,...,...,...
51846,650eaa5d1b1432001b409943,650ead650b6ef7001bcbdca6,6517e4e43a3ee4001bbe23b9,2024-03-21,7
51847,650eaa5d1b1432001b409943,650ead650b6ef7001bcbdca6,6517e4e43a3ee4001bbe23b9,2024-03-25,5
51848,650eaa5d1b1432001b409943,650ead650b6ef7001bcbdca6,6517e4e43a3ee4001bbe23b9,2024-03-26,8
51849,650eaa5d1b1432001b409943,650ead650b6ef7001bcbdca6,6517e4e43a3ee4001bbe23b9,2024-03-27,10


In [None]:
# Creating the base data frame
data = []
forecast_xgb_norm = []
forecast_xgb_lag  = []
forecast_rf_lag  = []
forecast_lgb_lag  = []
forecast_prophet  = []
forecast_prophet_multi  = []
#forecast_arima  = []
sales_data = base_data.copy()
sales_data = sales_data[(sales_data["date"] >= train_start) & (sales_data["date"] <= test_end)]
# sales_data['weekday'] = sales_data['date'].dt.weekday
unique_foodcourts = sales_data['foodcourt'].unique()
for foodcourt in unique_foodcourts:
    # Subset data for the current food court
    foodcourt_data = sales_data[sales_data['foodcourt'] == foodcourt]
    
    # Unique restaurants within the current food court
    unique_restaurants_foodcourt = foodcourt_data['restaurant'].unique()
    
    # Middle loop for restaurants
    for restaurant in unique_restaurants_foodcourt:
        # Subset data for the current restaurant within the current food court
        restaurant_data = foodcourt_data[foodcourt_data['restaurant'] == restaurant]
        
        # Unique items for the current restaurant
        unique_items_restaurant = restaurant_data['restaurantmenuitem'].unique()
        
        # Inner loop for items within the current restaurant
        for item in unique_items_restaurant:
            # Subset data for the current item within the current restaurant and food court
            item_data = restaurant_data[restaurant_data['restaurantmenuitem'] == item]
            
            # Fill missing dates for the current item
            item_data2 = add_missing_dates(train_start, test_end, item_data)
            data.append(item_data2)
            
            #XGB with day of week
            forecast_output_xgb = run_xgboost(item_data2, train_end,test_end)
            forecast_xgb_norm.append(forecast_output_xgb)
                        
            #XGB with lag
            forecast_output_xgb_lag = run_xgboost_lag(item_data2, train_end,test_end)
            forecast_xgb_lag.append(forecast_output_xgb_lag)

            #RF with lag
            forecast_output_rf_lag = run_rf_lag(item_data2, train_end,test_end)
            forecast_rf_lag.append(forecast_output_rf_lag)

            #LGBM 
            # forecast_output_lgb_lag = run_lgb_lag(item_data2, train_end,test_end)
            # forecast_lgb_lag.append(forecast_output_lgb_lag)

            #Prophet
            forecast_output_prophet= run_prophet_lag2(item_data2, train_end,test_end)
            forecast_prophet.append(forecast_output_prophet)
            # forecast_prophet.append(forecast_output_prophet)

            forecast_output_prophet= run_prophet_lag3(item_data2, train_end,test_end)
            forecast_prophet_multi.append(forecast_output_prophet)
            # forecast_prophet.append(forecast_output_prophet)

            #forecast_output_arima= time_series_forecasting_auto_arima(item_data2, train_end,test_end)
            #forecast_arima.append(forecast_output_arima)


            #time_series_forecasting_auto_arima

# Concatenate all forecast results into a single DataFrame
data = pd.concat(data, ignore_index=True)
forecast_xgb_norm = pd.concat(forecast_xgb_norm, ignore_index=True)
forecast_xgb_lag = pd.concat(forecast_xgb_lag, ignore_index=True)
forecast_rf_lag = pd.concat(forecast_rf_lag, ignore_index=True)
#forecast_lgb_lag = pd.concat(forecast_lgb_lag, ignore_index=True)
forecast_prophet = pd.concat(forecast_prophet, ignore_index=True)
forecast_prophet_multi = pd.concat(forecast_prophet_multi, ignore_index=True)
#forecast_arima = pd.concat(forecast_arima, ignore_index=True)

06:27:14 - cmdstanpy - INFO - Chain [1] start processing
06:27:14 - cmdstanpy - INFO - Chain [1] done processing
06:27:14 - cmdstanpy - INFO - Chain [1] start processing
06:27:14 - cmdstanpy - INFO - Chain [1] done processing
06:27:14 - cmdstanpy - INFO - Chain [1] start processing
06:27:14 - cmdstanpy - INFO - Chain [1] done processing
06:27:14 - cmdstanpy - INFO - Chain [1] start processing
06:27:14 - cmdstanpy - INFO - Chain [1] done processing
06:27:15 - cmdstanpy - INFO - Chain [1] start processing
06:27:15 - cmdstanpy - INFO - Chain [1] done processing
06:27:15 - cmdstanpy - INFO - Chain [1] start processing
06:27:15 - cmdstanpy - INFO - Chain [1] done processing
06:27:15 - cmdstanpy - INFO - Chain [1] start processing
06:27:15 - cmdstanpy - INFO - Chain [1] done processing
06:27:15 - cmdstanpy - INFO - Chain [1] start processing
06:27:15 - cmdstanpy - INFO - Chain [1] done processing
06:27:16 - cmdstanpy - INFO - Chain [1] start processing
06:27:16 - cmdstanpy - INFO - Chain [1]

In [None]:
forecast_xgb_norm.to_csv("forecast_xgb_norm.csv")
forecast_xgb_lag.to_csv("forecast_xgb_lag.csv")
forecast_prophet.to_csv("forecast_prophet.csv")
forecast_prophet_multi.to_csv("forecast_prophet_multi.csv")
forecast_rf_lag.to_csv("forecast_rf_lag.csv")
#forecast_lgb_lag.to_csv("forecast_lgb_lag.csv")
#forecast_arima.to_csv("forecast_arima.csv")

In [None]:
len(forecast_rf_lag)/7

965.0

In [None]:
# deep_ar_multi = pd.read_csv("DeepAR_multi.csv")
# deep_ar_multi = deep_ar_multi[["foodcourt","restaurant","restaurantmenuitem","Accuracy_DeepAR_Multi"]].reset_index(drop= True).drop_duplicates()
# deep_ar_multi.head()

In [None]:
multi_variate = pd.read_csv("results_26_3.csv")
multi_variate = multi_variate[["foodcourtid","restaurant","menuitemid","accuracy_lr","accuracy_xgb_multi","accuracy_rf_lag"]].drop_duplicates()
multi_variate = multi_variate.rename(columns={"foodcourtid":"foodcourt","menuitemid":"restaurantmenuitem","accuracy_rf_lag":"accuracy_rf_multi"})
print(len(multi_variate))
multi_variate.head()

965


Unnamed: 0,foodcourt,restaurant,restaurantmenuitem,accuracy_lr,accuracy_xgb_multi,accuracy_rf_multi
0,5b964b50a6fb5422abb2fda7,5c406ac021301e2fa8093d09,646b51e03d50300018af7f17,0.065637,0.432432,0.428571
7,5b964b50a6fb5422abb2fda7,5c406ac021301e2fa8093d09,646b51e03d50300018af7f21,0.099125,0.440233,0.006397
14,5b964b50a6fb5422abb2fda7,5c406ac021301e2fa8093d09,646b51e03d50300018af7f1e,0.265942,0.217391,0.192547
21,5b964b50a6fb5422abb2fda7,5b964a709153de1766fcf94c,65774ecf4fb51d001be77c77,0.102484,0.10559,0.065217
28,5b964b50a6fb5422abb2fda7,5b964a709153de1766fcf94c,65774ecf4fb51d001be77c5b,0.857143,0.571429,0.542857


In [None]:
deep_ar_uni = pd.read_csv("deepar_uni_26_03.csv")
deep_ar_uni = deep_ar_uni[["item_id","Accuracy"]].drop_duplicates()
deep_ar_uni = deep_ar_uni.rename(columns={"Accuracy":"accuracy_deepar_uni"})
deep_ar_multi = pd.read_csv("deepar_multi_26_03.csv")
deep_ar_multi = deep_ar_multi[["item_id","Accuracy"]].drop_duplicates()
deep_ar_multi = deep_ar_multi.rename(columns={"Accuracy":"accuracy_deepar_multi"})
deep_ar = pd.merge(deep_ar_uni, deep_ar_multi, on=['item_id'], how='left')
print(len(deep_ar))

965


In [None]:
deep_ar[['foodcourt', 'restaurant', 'restaurantmenuitem']] = deep_ar['item_id'].str.split('_', expand=True)
deep_ar = deep_ar[['foodcourt', 'restaurant', 'restaurantmenuitem','accuracy_deepar_uni','accuracy_deepar_multi']]
print(len(deep_ar))
deep_ar.head()

965


Unnamed: 0,foodcourt,restaurant,restaurantmenuitem,accuracy_deepar_uni,accuracy_deepar_multi
0,5b964b50a6fb5422abb2fda7,5b964a709153de1766fcf94c,65774ecf4fb51d001be77c5b,0.304762,0.428571
1,5b964b50a6fb5422abb2fda7,5b964a709153de1766fcf94c,65774ecf4fb51d001be77c77,0.156758,0.0
2,5b964b50a6fb5422abb2fda7,5c406ac021301e2fa8093d09,646b51e03d50300018af7f17,0.145128,0.054054
3,5b964b50a6fb5422abb2fda7,5c406ac021301e2fa8093d09,646b51e03d50300018af7f1e,0.274327,0.083851
4,5b964b50a6fb5422abb2fda7,5c406ac021301e2fa8093d09,646b51e03d50300018af7f21,0.081633,0.01895


In [None]:
xgb_norm = forecast_xgb_norm[["foodcourt","restaurant","restaurantmenuitem","accuracy_xgb_norm"]].reset_index(drop= True).drop_duplicates()
xgb_lag = forecast_xgb_lag[["foodcourt","restaurant","restaurantmenuitem","accuracy_xgb_lag"]].reset_index(drop= True).drop_duplicates()
rf_lag = forecast_rf_lag[["foodcourt","restaurant","restaurantmenuitem","accuracy_rf_lag"]].reset_index(drop= True).drop_duplicates()
#lgb_lag = forecast_lgb_lag[["foodcourt","restaurant","restaurantmenuitem","accuracy_lgb_lag"]].reset_index(drop= True).drop_duplicates()
prophet = forecast_prophet[["foodcourt","restaurant","restaurantmenuitem","accuracy_prophet"]].reset_index(drop= True).drop_duplicates()
prophet_multi = forecast_prophet_multi[["foodcourt","restaurant","restaurantmenuitem","accuracy_prophet_multi"]].reset_index(drop= True).drop_duplicates()
#arima = forecast_arima[["foodcourt","restaurant","restaurantmenuitem","accuracy_arima"]].reset_index(drop= True).drop_duplicates()
#xgb_multi = pd.read_csv("xgb_summary.csv")
##lr = pd.read_csv("lr_summary.csv")
# deep_ar = pd.read_csv("deep_ar_summary.csv")
# deep_ar_multi = pd.read_csv("deep_ar_summary.csv")
output = pd.merge(multi_variate, deep_ar, on=['foodcourt', 'restaurant', 'restaurantmenuitem'], how='left')
output = pd.merge(output, xgb_norm, on=['foodcourt', 'restaurant', 'restaurantmenuitem'], how='left')
output = pd.merge(output, rf_lag, on=['foodcourt', 'restaurant', 'restaurantmenuitem'], how='left')
output = pd.merge(output, prophet, on=['foodcourt', 'restaurant', 'restaurantmenuitem'], how='left')
output = pd.merge(output, xgb_lag, on=['foodcourt', 'restaurant', 'restaurantmenuitem'], how='left')
#output = pd.merge(output, lr, on=['foodcourt', 'restaurant', 'restaurantmenuitem'], how='left')
output = pd.merge(output, prophet_multi, on=['foodcourt', 'restaurant', 'restaurantmenuitem'], how='left')
#output = pd.merge(output, deep_ar, on=['foodcourt', 'restaurant', 'restaurantmenuitem'], how='left')
#output = pd.merge(output, multi_variate, on=['foodcourt', 'restaurant', 'restaurantmenuitem'], how='left')
#output = pd.merge(output, deep_ar_multi, on=['foodcourt', 'restaurant', 'restaurantmenuitem'], how='left')
output['max_accuracy'] = output[['accuracy_xgb_lag','accuracy_rf_lag', 'accuracy_xgb_norm','accuracy_prophet',"accuracy_prophet_multi","accuracy_deepar_multi","accuracy_deepar_uni","accuracy_lr","accuracy_xgb_multi","accuracy_rf_multi"]].max(axis=1)
output['best_model'] = output[['accuracy_xgb_lag', 'accuracy_rf_lag','accuracy_xgb_norm','accuracy_prophet',"accuracy_prophet_multi","accuracy_deepar_multi","accuracy_deepar_uni","accuracy_lr","accuracy_xgb_multi","accuracy_rf_multi"]].idxmax(axis=1)

In [None]:
output.to_excel("ROCV5_Out.xlsx")

In [None]:
def segment_max_accuracy(accuracy):
    if accuracy >= 0.90:
        return '>90'
    elif .80 <= accuracy < .90:
        return '80-90'
    elif .70 <= accuracy < .80:
        return '70-80'
    elif .50 <= accuracy < .70:
        return '50-70'
    else:
        return '<50'

def add_max_accuracy_segment(df):
    df['max_accuracy_segment'] = df['max_accuracy'].apply(segment_max_accuracy)
    return df

# Apply the function to your DataFrame
output = add_max_accuracy_segment(output)
rank_dataframe_segment =  rank_dataframe_b[["foodcourt","restaurant","restaurantmenuitem","sales_segment","rank_segment"]]
output = pd.merge(output, rank_dataframe_segment, on=['foodcourt', 'restaurant', 'restaurantmenuitem'], how='left')
output.to_csv("output_summary.csv")

In [None]:
pivot_table_df = output.pivot_table(index='max_accuracy_segment',  aggfunc='size', fill_value=0)
desired_order_rows = ['>90','80-90','70-80','50-70','<50' ]
pivot_table_df = pivot_table_df.reindex(index=desired_order_rows)
pivot_table_df

max_accuracy_segment
>90      346
80-90    272
70-80    192
50-70    124
<50       31
dtype: int64

In [None]:
pivot_table_df = output.pivot_table(index='sales_segment', columns='max_accuracy_segment', aggfunc='size', fill_value=0)
desired_order_rows = ['>100', '75-100','50-75','25-50',  '10-25',  '<10' ]
desired_order_columns = ['>90','80-90','70-80','50-70','<50' ]
pivot_table_df = pivot_table_df.reindex(index=desired_order_rows, columns=desired_order_columns)

pivot_table_df
#pivot_table_df["sales_segment","max_accuracy_segment",">100","75-100","50-75","25-50","10-25",">100"]

max_accuracy_segment,>90,80-90,70-80,50-70,<50
sales_segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
>100,41,10,2,1,1
75-100,36,3,0,0,0
50-75,48,7,5,2,3
25-50,73,44,24,14,8
10-25,94,106,55,29,9
<10,54,102,106,78,10


In [None]:
pivot_table_df = output.pivot_table(index='rank_segment', columns='max_accuracy_segment', aggfunc='size', fill_value=0)
desired_order_rows = ['1-100', '100-200','200-300','300-400',  '400-500',  '>500' ]
desired_order_columns = ['>90','80-90','70-80','50-70','<50' ]
pivot_table_df = pivot_table_df.reindex(index=desired_order_rows, columns=desired_order_columns)
pivot_table_df

max_accuracy_segment,>90,80-90,70-80,50-70,<50
rank_segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1-100,83,13,2,1,1
100-200,59,17,16,3,5
200-300,48,29,10,10,3
300-400,40,30,15,11,4
400-500,31,33,21,11,4
>500,85,150,128,88,14


In [None]:
base_data2 = base_data[(base_data['date']>=train_start) & (base_data['date']<=train_end) ]
base_data2 = base_data2.groupby(['foodcourt','restaurant','restaurantmenuitem']).agg({'total_count': 'sum'}).reset_index()
output = pd.merge(output, base_data2, on=['foodcourt', 'restaurant', 'restaurantmenuitem'], how='left')
output.to_csv("output_with_sales.csv")

In [None]:
pivot_table_df = output.pivot_table(index='max_accuracy_segment', values=['total_count', 'foodcourt'], aggfunc={'total_count': 'sum', 'foodcourt': 'count'})
desired_order_rows = ['>90','80-90','70-80','50-70','<50' ]
pivot_table_df = pivot_table_df.reindex(index=desired_order_rows)
pivot_table_df

Unnamed: 0_level_0,foodcourt,total_count
max_accuracy_segment,Unnamed: 1_level_1,Unnamed: 2_level_1
>90,346,244121
80-90,272,126278
70-80,192,56612
50-70,124,32459
<50,31,18324


In [None]:
output

Unnamed: 0,foodcourt,restaurant,restaurantmenuitem,accuracy_lr,accuracy_xgb_multi,accuracy_rf_multi,accuracy_deepar_uni,accuracy_deepar_multi,accuracy_xgb_norm,accuracy_rf_lag,accuracy_prophet,accuracy_xgb_lag,accuracy_prophet_multi,max_accuracy,best_model,max_accuracy_segment,sales_segment,rank_segment,total_count
0,5b964b50a6fb5422abb2fda7,5c406ac021301e2fa8093d09,646b51e03d50300018af7f17,0.065637,0.432432,0.428571,0.145128,0.054054,0.187486,0.216216,0.182603,0.386100,0.362934,0.432432,accuracy_xgb_multi,<50,50-75,100-200,1369
1,5b964b50a6fb5422abb2fda7,5c406ac021301e2fa8093d09,646b51e03d50300018af7f21,0.099125,0.440233,0.006397,0.081633,0.018950,0.078717,0.101606,0.096210,0.119534,0.374636,0.440233,accuracy_xgb_multi,<50,>100,1-100,5103
2,5b964b50a6fb5422abb2fda7,5c406ac021301e2fa8093d09,646b51e03d50300018af7f1e,0.265942,0.217391,0.192547,0.274327,0.083851,0.138199,0.043478,0.523810,0.198758,0.428571,0.523810,accuracy_prophet,50-70,10-25,300-400,411
3,5b964b50a6fb5422abb2fda7,5b964a709153de1766fcf94c,65774ecf4fb51d001be77c77,0.102484,0.105590,0.065217,0.156758,0.000000,0.231588,0.163857,0.235729,0.104555,0.304348,0.304348,accuracy_prophet_multi,<50,50-75,100-200,1938
4,5b964b50a6fb5422abb2fda7,5b964a709153de1766fcf94c,65774ecf4fb51d001be77c5b,0.857143,0.571429,0.542857,0.304762,0.428571,0.552381,0.495238,0.285714,0.619048,0.428571,0.857143,accuracy_lr,80-90,>100,1-100,1586
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,64e33cca77a4b10017691075,6397f84ed6a6982bfd8544f7,652cde2ddb4ec900177a5e15,0.821429,0.857143,0.964286,0.875000,0.892857,0.678571,0.660714,0.678571,0.785714,0.660714,0.964286,accuracy_rf_multi,>90,<10,>500,66
961,650eaa5d1b1432001b409943,650ead650b6ef7001bcbdca6,6517e4e43a3ee4001bbe23b9,0.610544,0.779592,0.732993,0.753401,0.772449,0.754592,0.602211,0.772449,0.506803,0.763095,0.779592,accuracy_xgb_multi,70-80,<10,>500,149
962,650eaa5d1b1432001b409943,650ead650b6ef7001bcbdca6,6517e4e43a3ee4001bbe23b5,0.697363,0.692088,0.617802,0.748791,0.763626,0.691538,0.624286,0.785934,0.548462,0.841209,0.841209,accuracy_prophet_multi,80-90,<10,>500,251
963,650eaa5d1b1432001b409943,650ead650b6ef7001bcbdca6,6517e4e43a3ee4001bbe23ad,0.645910,0.643575,0.872641,0.813459,0.881931,0.831989,0.852069,0.768201,0.793458,0.772475,0.881931,accuracy_deepar_multi,80-90,25-50,100-200,1239


In [None]:
pivot_table_df = output.pivot_table(index='best_model', values=['total_count', 'foodcourt'],columns='max_accuracy_segment', aggfunc={'total_count': 'sum', 'foodcourt': 'count'})
desired_order_rows = ['>90','80-90','70-80','50-70','<50' ]
pivot_table_df = pivot_table_df.reindex( )
pivot_table_df

Unnamed: 0_level_0,foodcourt,foodcourt,foodcourt,foodcourt,foodcourt,total_count,total_count,total_count,total_count,total_count
max_accuracy_segment,50-70,70-80,80-90,<50,>90,50-70,70-80,80-90,<50,>90
best_model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
accuracy_deepar_multi,15.0,30.0,48.0,6.0,115.0,2617.0,9485.0,17215.0,1350.0,46850.0
accuracy_deepar_uni,14.0,18.0,38.0,,27.0,5448.0,4468.0,15124.0,,34455.0
accuracy_lr,12.0,11.0,11.0,3.0,8.0,1879.0,2881.0,3375.0,1300.0,2468.0
accuracy_prophet,6.0,10.0,24.0,1.0,21.0,2755.0,3923.0,18913.0,323.0,23065.0
accuracy_prophet_multi,7.0,17.0,20.0,3.0,25.0,1451.0,7186.0,9375.0,3043.0,33827.0
accuracy_rf_lag,11.0,17.0,21.0,,26.0,2123.0,4793.0,9318.0,,19236.0
accuracy_rf_multi,14.0,25.0,35.0,1.0,20.0,3382.0,6130.0,18873.0,50.0,16184.0
accuracy_xgb_lag,13.0,24.0,24.0,12.0,40.0,4936.0,5726.0,5924.0,4330.0,21981.0
accuracy_xgb_multi,16.0,15.0,19.0,4.0,38.0,5197.0,5564.0,7646.0,7241.0,21846.0
accuracy_xgb_norm,16.0,25.0,32.0,1.0,26.0,2671.0,6456.0,20515.0,687.0,24209.0


In [None]:
pivot_table_df = output.pivot_table(index='best_model', values=['foodcourt'],aggfunc={ 'foodcourt': 'count'})
#desired_order_rows = ['>90','80-90','70-80','50-70','<50' ]
pivot_table_df = pivot_table_df.reindex( )
pivot_table_df

Unnamed: 0_level_0,foodcourt
best_model,Unnamed: 1_level_1
accuracy_deepar_multi,214
accuracy_deepar_uni,97
accuracy_lr,45
accuracy_prophet,62
accuracy_prophet_multi,72
accuracy_rf_lag,75
accuracy_rf_multi,95
accuracy_xgb_lag,113
accuracy_xgb_multi,92
accuracy_xgb_norm,100


In [None]:
pivot_table_df = output.pivot_table(index='best_model', values=[ 'foodcourt'],columns='max_accuracy_segment', aggfunc={ 'foodcourt': 'count'})
# desired_order_rows = ['>90','80-90','70-80','50-70','<50' ]
pivot_table_df = pivot_table_df.reindex()
pivot_table_df

Unnamed: 0_level_0,foodcourt,foodcourt,foodcourt,foodcourt,foodcourt
max_accuracy_segment,50-70,70-80,80-90,<50,>90
best_model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
accuracy_deepar_multi,15.0,30.0,48.0,6.0,115.0
accuracy_deepar_uni,14.0,18.0,38.0,,27.0
accuracy_lr,12.0,11.0,11.0,3.0,8.0
accuracy_prophet,6.0,10.0,24.0,1.0,21.0
accuracy_prophet_multi,7.0,17.0,20.0,3.0,25.0
accuracy_rf_lag,11.0,17.0,21.0,,26.0
accuracy_rf_multi,14.0,25.0,35.0,1.0,20.0
accuracy_xgb_lag,13.0,24.0,24.0,12.0,40.0
accuracy_xgb_multi,16.0,15.0,19.0,4.0,38.0
accuracy_xgb_norm,16.0,25.0,32.0,1.0,26.0


In [None]:
meals= pd.read_excel("meals_std.xlsx")

In [None]:
meals = meals[["restaurantmenuitem","menuitemname"]].drop_duplicates(subset=['restaurantmenuitem'], keep='first')
meals

Unnamed: 0,restaurantmenuitem,menuitemname
0,648f1dbbe42a8f00174bfa3b,Veg Dinner Meal
1,646e15e20ecce00017753b9c,Aloo Paratha With Pickle & Butter
2,641c2ecbe3642c0017428534,Chicken Tikka Biryani
3,641e96130360260017dc33b1,Mini Chicken Thali
4,65644d7aa1cdf9001b15d6bd,Non Veg Combo
...,...,...
111883,64c89865eb8572001b22d414,Plain Biryani
111897,64f0422fdd06d3001ba97736,Plain Paratha With Paneer Preparation And salad
111921,64ca86f16fbdb000114fba13,Banjara Chicken Combo
111966,6513aaef06ef8d001bd1b40f,Thalipith with Curd


In [None]:
output = pd.merge(output, meals, on=[ 'restaurantmenuitem'], how='left')

In [None]:
len(output)

965

In [None]:
output.to_csv("output_with_itemname.csv")