In [10]:
import pandas as pd
import re

# Load the CSV file
catering_details_df = pd.read_csv('tbl_catering_details.csv')

# Function to extract numeric values from the QUANTITY column
def extract_quantity(value):
    if pd.isnull(value) or value.lower() == 'none':
        return None
    value = value.lower()
    # Remove any non-numeric and non-decimal characters
    numeric_value = re.findall(r'[\d.]+', value)
    if numeric_value:
        return float(numeric_value[0])
    return None

# Apply the extraction function to the QUANTITY column
catering_details_df['CLEANED_QUANTITY'] = catering_details_df['QUANTITY'].apply(extract_quantity)

# Check the results and see how many values were successfully extracted
cleaned_quantities_head = catering_details_df[['QUANTITY', 'CLEANED_QUANTITY']].head(20)
missing_cleaned_quantities = catering_details_df['CLEANED_QUANTITY'].isnull().sum()

print(cleaned_quantities_head)
print(f"Number of missing cleaned quantities: {missing_cleaned_quantities}")


    QUANTITY  CLEANED_QUANTITY
0       1 tr               1.0
1    11/2 tr              11.0
2   100 cups             100.0
3   1,1/2 tr               1.0
4   2 1/2 tr               2.0
5     100 ps             100.0
6       1 tr               1.0
7       2 tr               2.0
8        100             100.0
9         40              40.0
10    1 tray               1.0
11      1 tr               1.0
12        30              30.0
13  1/2 Tray               1.0
14  3/4 Tray               3.0
15  1/2 Tray               1.0
16    1 Tray               1.0
17        30              30.0
18  3/4 Tray               3.0
19    1 Tray               1.0
Number of missing cleaned quantities: 5


In [11]:
import pandas as pd

# Load the CSV files
tbl_catering = pd.read_csv('tbl_catering.csv')
tbl_catering_details = pd.read_csv('tbl_catering_details.csv')
tbl_catering_products = pd.read_csv('tbl_catering_products.csv')

# # Ensure the QUANTITY column is clean (using the earlier code)
# # Example: catering_details_df['CLEANED_QUANTITY'] = ...

# # Merge tbl_catering with tbl_catering_details
# merged_df = pd.merge(tbl_catering, tbl_catering_details, left_on='ID', right_on='CATERING_ID', how='left')
# final_df = pd.merge(merged_df, tbl_catering_products, left_on='ITEM_DESC', right_on='PRODUCT_NAME', how='left')

# # Drop any unnecessary columns
# final_df = final_df.drop(columns=['ID', 'CATERING_ID', 'PRODUCT_ID'])

# # Display the first few rows of the final merged dataframe
# print(final_df.head())





# Data Cleaning
tbl_catering.replace('\\N', pd.NA, inplace=True)
tbl_catering_details.replace('\\N', pd.NA, inplace=True)
tbl_catering_products.replace('\\N', pd.NA, inplace=True)

# # Convert dates to datetime
# tbl_catering['DELIVERY_DATE'] = pd.to_datetime(tbl_catering['DELIVERY_DATE'])
# tbl_catering['DELIVERY_TIME'] = pd.to_datetime(tbl_catering['DELIVERY_TIME'], format='%H:%M:%S').dt.time

# Fill missing values in numerical columns with 0
num_cols = tbl_catering.select_dtypes(include='number').columns
tbl_catering[num_cols] = tbl_catering[num_cols].fillna(0)

# Data Merging
merged_df = pd.merge(tbl_catering, tbl_catering_details, left_on='ID', right_on='CATERING_ID', how='left')
final_df = pd.merge(merged_df, tbl_catering_products, left_on='ITEM_DESC', right_on='PRODUCT_NAME', how='left')

# Display the first few rows of the final dataframe
final_df.tail()


Unnamed: 0,ID_x,DELIVERY_DATE,DELIVERY_TIME,DELIVERY_MODE,DELIVERY_STATUS,INVOICE_AMOUNT,TAXES,CATERING_COST,SHOW_PRICE,DISCOUNT,...,CATEGORY_NAME,CATEGORY_ID,PRODUCT_NAME,UNIT,PRICE_y,IS_DAIRY_FREE,IS_GLUTEN_FREE,IS_NUT_FREE,IS_VEGAN,EXCLUDE_ITEM
13079,2796,4/7/2024,18:00:00,30,,1485.0,0.0,1485.0,8,0.0,...,,,,,,,,,,
13080,2796,4/7/2024,18:00:00,30,,1485.0,0.0,1485.0,8,0.0,...,,,,,,,,,,
13081,2796,4/7/2024,18:00:00,30,,1485.0,0.0,1485.0,8,0.0,...,,,,,,,,,,
13082,2796,4/7/2024,18:00:00,30,,1485.0,0.0,1485.0,8,0.0,...,,,,,,,,,,
13083,2797,7/31/2024,14:30:00,32,,0.0,0.0,150.0,9,0.0,...,,,,,,,,,,


In [12]:
# Assuming we want to work with the 'QUANTITY' column, let's clean it first
import re

# Function to extract numeric values from the QUANTITY column
def extract_quantity(value):
    if pd.isnull(value) or value.lower() == 'none':
        return None
    value = value.lower()
    numeric_value = re.findall(r'[\d.]+', value)
    if numeric_value:
        return float(numeric_value[0])
    return None

# Clean the QUANTITY column
final_df['CLEANED_QUANTITY'] = final_df['QUANTITY'].apply(extract_quantity)

# Calculate the total quantity of each dish per event
final_df['TOTAL_DISH_QUANTITY'] = final_df.groupby('ID_x')['CLEANED_QUANTITY'].transform('sum')

# Calculate the average quantity of each dish per guest
final_df['AVG_QUANTITY_PER_GUEST'] = final_df['CLEANED_QUANTITY'] / final_df['GUEST_COUNT']

# Calculate the total cost per event
final_df['TOTAL_COST'] = final_df.groupby('ID_x')['PRICE_x'].transform('sum')

# Normalize the dish quantities by guest count
final_df['NORMALIZED_QUANTITY'] = final_df['CLEANED_QUANTITY'] / final_df['GUEST_COUNT']

# Prepare the data for modeling
model_data = final_df[['GUEST_COUNT', 'PRODUCT_NAME', 'AVG_QUANTITY_PER_GUEST', 'TOTAL_COST', 'NORMALIZED_QUANTITY']]

# One-hot encode the categorical variables (e.g., PRODUCT_NAME)
model_data = pd.get_dummies(model_data, columns=['PRODUCT_NAME'], drop_first=True)

# Display the first few rows of the prepared data



In [13]:
import pandas as pd

# Assuming final_df is the merged dataframe from the previous steps

# Convert the 'QUANTITY' and 'GUEST_COUNT' columns to numeric, forcing errors to NaN and then filling with 0
final_df['QUANTITY'] = pd.to_numeric(final_df['QUANTITY'], errors='coerce').fillna(0)
final_df['GUEST_COUNT'] = pd.to_numeric(final_df['GUEST_COUNT'], errors='coerce').fillna(0)

# Step 1: Create aggregate features

# Calculate the total quantity of each dish per event
final_df['TOTAL_DISH_QUANTITY'] = final_df.groupby('ID_x')['QUANTITY'].transform('sum')

# Calculate the average quantity of each dish per guest
final_df['AVG_QUANTITY_PER_GUEST'] = final_df['QUANTITY'] / final_df['GUEST_COUNT']

# Calculate the total cost per event using 'PRICE_x'
final_df['TOTAL_COST'] = final_df.groupby('ID_x')['PRICE_x'].transform('sum')


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Select features and target variable
features = final_df[['GUEST_COUNT', 'CATEGORY_NAME', 'SPICE_LEVEL', 'PRICE_x', 'TOTAL_COST', 'AVG_QUANTITY_PER_GUEST']]
target = final_df['QUANTITY']

# Convert categorical features to dummy variables
features = pd.get_dummies(features)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test, y_test)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [18]:
# Check for missing values
missing_values = features.isnull().sum()
print(missing_values)


GUEST_COUNT                         0
PRICE_x                             0
TOTAL_COST                          0
AVG_QUANTITY_PER_GUEST              0
CATEGORY_NAME_APPETIZER - NONVEG    0
                                   ..
SPICE_LEVEL_SPICY                   0
SPICE_LEVEL_SWEET                   0
SPICE_LEVEL_V MILD                  0
SPICE_LEVEL_VERY  MILD              0
SPICE_LEVEL_VERY MILD               0
Length: 86, dtype: int64


In [19]:
from sklearn.impute import SimpleImputer

# Impute missing values for numerical features
numerical_imputer = SimpleImputer(strategy='mean')
features[['GUEST_COUNT', 'PRICE_x', 'TOTAL_COST', 'AVG_QUANTITY_PER_GUEST']] = numerical_imputer.fit_transform(features[['GUEST_COUNT', 'PRICE_x', 'TOTAL_COST', 'AVG_QUANTITY_PER_GUEST']])

# Impute missing values for categorical features
categorical_imputer = SimpleImputer(strategy='most_frequent')
features[['CATEGORY_NAME', 'SPICE_LEVEL']] = categorical_imputer.fit_transform(features[['CATEGORY_NAME', 'SPICE_LEVEL']])


KeyError: "None of [Index(['CATEGORY_NAME', 'SPICE_LEVEL'], dtype='object')] are in the [columns]"

In [20]:
# Print available columns to verify
print(features.columns)

# Assuming you find the correct column names, update them in the imputation step
corrected_numerical_columns = ['Corrected_GUEST_COUNT', 'Corrected_PRICE_x', 'Corrected_TOTAL_COST', 'Corrected_AVG_QUANTITY_PER_GUEST']
corrected_categorical_columns = ['Corrected_CATEGORY_NAME', 'Corrected_SPICE_LEVEL']

# Impute missing values for numerical features
numerical_imputer = SimpleImputer(strategy='mean')
features[corrected_numerical_columns] = numerical_imputer.fit_transform(features[corrected_numerical_columns])

# Impute missing values for categorical features
categorical_imputer = SimpleImputer(strategy='most_frequent')
features[corrected_categorical_columns] = categorical_imputer.fit_transform(features[corrected_categorical_columns])


Index(['GUEST_COUNT', 'PRICE_x', 'TOTAL_COST', 'AVG_QUANTITY_PER_GUEST',
       'CATEGORY_NAME_APPETIZER - NONVEG', 'CATEGORY_NAME_APPETIZER - VEG',
       'CATEGORY_NAME_CHAAT ITEM', 'CATEGORY_NAME_CUSTOM DESSERTS',
       'CATEGORY_NAME_ENTREE - CHICKEN', 'CATEGORY_NAME_ENTREE - VEG',
       'CATEGORY_NAME_NOODLES', 'CATEGORY_NAME_SIDES',
       'CATEGORY_NAME_TANDOOR - NONVEG', 'SPICE_LEVEL_0', 'SPICE_LEVEL_1',
       'SPICE_LEVEL_1 TR', 'SPICE_LEVEL_1/2 TR',
       'SPICE_LEVEL_6 MILD AND 6 MED', 'SPICE_LEVEL_BABY SPICE',
       'SPICE_LEVEL_COMP', 'SPICE_LEVEL_ED', 'SPICE_LEVEL_EXTRA MILD',
       'SPICE_LEVEL_GARLIC', 'SPICE_LEVEL_HIGH', 'SPICE_LEVEL_HOT',
       'SPICE_LEVEL_LESS SWEET', 'SPICE_LEVEL_LIGHT SUGAR',
       'SPICE_LEVEL_LIGHT SUGAR ADDED', 'SPICE_LEVEL_LITE SUGAR',
       'SPICE_LEVEL_LITE SUGAR ADDED', 'SPICE_LEVEL_LOOK AT COMMENTS',
       'SPICE_LEVEL_LOW', 'SPICE_LEVEL_M', 'SPICE_LEVEL_M ED',
       'SPICE_LEVEL_MEC', 'SPICE_LEVEL_MED', 'SPICE_LEVEL_MED - HOT',

KeyError: "None of [Index(['Corrected_GUEST_COUNT', 'Corrected_PRICE_x', 'Corrected_TOTAL_COST',\n       'Corrected_AVG_QUANTITY_PER_GUEST'],\n      dtype='object')] are in the [columns]"

In [21]:
# Alternatively, drop rows with any missing values
features.dropna(inplace=True)


In [22]:
# Split the data again if you've made changes to 'features'
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize the model
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test, y_test)

print(f"Mean Absolute Error: {mae}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")


Mean Absolute Error: 0.7238292319449753
Mean Squared Error: 44.29776853489109
R-squared: 0.9952262522494487


In [23]:
import pickle
from sklearn.ensemble import RandomForestRegressor

# Assuming you have trained your model as `model`
# For example:
# model = RandomForestRegressor(n_estimators=100, random_state=42)
# model.fit(X_train, y_train)

# Save the model to a file
with open('trained_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

print("Model saved successfully!")

Model saved successfully!
