# Loading and Cleaning Data

In [8]:
import pandas as pd 
import numpy as np

In [9]:
# Read the merged bills file and discard every row that has a missing value.
# Takes ~30 seconds to run
df = pd.read_csv("./data/merged.csv").dropna()

# Formatting Data

In [10]:
def get_spec(df, city, concept):
    """Return the rows of ``df`` whose city and concept both match.

    Args:
        df: DataFrame with ``city`` and ``concept`` columns.
        city: Value compared for equality against the ``city`` column.
        concept: Value compared for equality against the ``concept`` column.

    Returns:
        The matching rows, in their original order and keeping their
        original index labels. ``df`` itself is not modified.
    """
    in_city = df['city'] == city
    has_concept = df['concept'] == concept
    return df.loc[in_city & has_concept]

def time_format(data):
    """Add integer hour / minute / second columns parsed from ``bill_paid_at_local``.

    Each value of ``data['bill_paid_at_local']`` must be a string of the form
    ``"<date> HH:MM:SS"``: the text after the first space is split on ``:``
    and its first three fields are converted with ``int``.

    Args:
        data: DataFrame with a string ``bill_paid_at_local`` column.

    Returns:
        The same ``data`` object, modified in place with the new columns
        ``bill_hour``, ``bill_minute`` and ``bill_second``.
    """
    hours, minutes, seconds = [], [], []
    for stamp in data['bill_paid_at_local']:
        # "<date> HH:MM:SS" -> ["HH", "MM", "SS"]
        clock_fields = stamp.split(" ")[1].split(":")
        hours.append(int(clock_fields[0]))
        minutes.append(int(clock_fields[1]))
        seconds.append(int(clock_fields[2]))

    data['bill_hour'] = hours
    data['bill_minute'] = minutes
    data['bill_second'] = seconds
    return data

def bills_per_restaurant(data):
    """Summarise the bills of each venue into a single row.

    The previous implementation built the per-venue counts and then copied
    ``city``, ``concept`` etc. with ``new_data[col] = data[col]``. That
    assignment aligns on the index, so row *i* of the result received the
    values of whichever bill had index label *i* rather than values belonging
    to the venue in that row. Every column is now aggregated per venue.

    Args:
        data: Bill-level DataFrame with the columns ``venue_xref_id``,
            ``payment_count``, ``city``, ``concept``, ``bill_hour``,
            ``bill_day_of_week`` and ``bill_date``.

    Returns:
        A new DataFrame with one row per ``venue_xref_id`` (sorted by venue
        id) and the columns, in order: ``venue_xref_id``, ``payment_count``
        (number of non-null ``payment_count`` entries for the venue),
        ``city``, ``concept``, ``bill_hour``, ``bill_day_of_week`` and
        ``bill_date``. The last five hold the first non-null value seen for
        that venue.
    """
    # NOTE(review): bill_hour / bill_day_of_week / bill_date vary per bill, so
    # "first" is only a representative value for the venue - confirm this is
    # what downstream code expects.
    per_venue = data.groupby('venue_xref_id').agg(
        payment_count=('payment_count', 'count'),
        city=('city', 'first'),
        concept=('concept', 'first'),
        bill_hour=('bill_hour', 'first'),
        bill_day_of_week=('bill_day_of_week', 'first'),
        bill_date=('bill_date', 'first'),
    )

    # Move venue_xref_id from the index back into a regular leading column.
    return per_venue.reset_index()
def locateSum(startDate, endDate, df):
    """Select ``bill_total_net`` for rows whose business date lies in a range.

    Despite the name, nothing is summed here: the individual values are
    returned and any totalling is left to the caller.

    Args:
        startDate: Lower bound (inclusive) compared against ``business_date``.
        endDate: Upper bound (inclusive) compared against ``business_date``.
        df: DataFrame with ``business_date`` and ``bill_total_net`` columns.

    Returns:
        The ``bill_total_net`` Series restricted to the matching rows.
    """
    business_dates = df["business_date"]
    on_or_after_start = business_dates >= startDate
    on_or_before_end = business_dates <= endDate
    return df.loc[on_or_after_start & on_or_before_end, "bill_total_net"]

def get_months(df):
    """Collect the ``bill_total_net`` values of every calendar month of 2024.

    Fixes relative to the earlier version:
      * February now ends on the 29th (2024 is a leap year), August on the
        31st and November on the 30th; the old end dates silently dropped
        the last day of those months.
      * The per-month results are returned. Previously they were computed
        and then discarded, so the function returned ``None``.

    Args:
        df: DataFrame with a ``business_date`` column in ``YYYY-MM-DD`` form
            and a ``bill_total_net`` column. ``business_date`` is converted
            to datetime in place.

    Returns:
        dict mapping a month label (``"jan2024"`` ... ``"dec2024"``, in
        calendar order) to the Series that ``locateSum`` returns for that
        month.
    """
    df['business_date'] = pd.to_datetime(df['business_date'], format='%Y-%m-%d')

    # (label, first day, last day) - both bounds are inclusive in locateSum.
    month_ranges = [
        ("jan2024", '2024-01-01', '2024-01-31'),
        ("feb2024", '2024-02-01', '2024-02-29'),
        ("mar2024", '2024-03-01', '2024-03-31'),
        ("apr2024", '2024-04-01', '2024-04-30'),
        ("may2024", '2024-05-01', '2024-05-31'),
        ("jun2024", '2024-06-01', '2024-06-30'),
        ("jul2024", '2024-07-01', '2024-07-31'),
        ("aug2024", '2024-08-01', '2024-08-31'),
        ("sep2024", '2024-09-01', '2024-09-30'),
        ("oct2024", '2024-10-01', '2024-10-31'),
        ("nov2024", '2024-11-01', '2024-11-30'),
        ("dec2024", '2024-12-01', '2024-12-31'),
    ]

    return {
        label: locateSum(first_day, last_day, df)
        for label, first_day, last_day in month_ranges
    }

def get_weeks(df):
    """Turn ``df['bill_day']`` into an ordered categorical, Sunday through Saturday.

    The column is replaced in place and nothing is returned. Values that are
    not one of the seven listed day names become missing.

    Args:
        df: DataFrame with a ``bill_day`` column of day names.
    """
    weekday_names = [
        'Sunday', 'Monday', 'Tuesday', 'Wednesday',
        'Thursday', 'Friday', 'Saturday',
    ]
    df['bill_day'] = pd.Categorical(
        df['bill_day'],
        categories=weekday_names,
        ordered=True,
    )

def preprocess_data(df):
    """Count bills per (venue, city, concept, weekday, business date, hour).

    Note that ``df`` is modified in place: ``business_date`` and
    ``bill_paid_at_local`` are overwritten with their datetime versions and
    the columns ``bill_hour``, ``bill_date`` and ``bill_day_of_week`` are
    added.

    Args:
        df: Bill-level DataFrame with ``venue_xref_id``, ``city``,
            ``concept``, ``business_date`` and ``bill_paid_at_local``.

    Returns:
        A new DataFrame with one row per distinct combination of
        ``venue_xref_id``, ``city``, ``concept``, ``bill_day_of_week``,
        ``bill_date`` and ``bill_hour``, plus a ``count`` column holding the
        number of input rows in that combination.
    """
    # Parse the two timestamp columns, replacing the originals.
    df['business_date'] = pd.to_datetime(df['business_date'])
    df['bill_paid_at_local'] = pd.to_datetime(df['bill_paid_at_local'])

    # Derived time features: the hour comes from the payment timestamp,
    # the date and weekday (Monday == 0) from the business date.
    paid_at = df['bill_paid_at_local']
    df['bill_hour'] = paid_at.dt.hour
    df['bill_date'] = pd.to_datetime(df['business_date'])
    df['bill_day_of_week'] = df['bill_date'].dt.weekday

    # Size of every group becomes the `count` column of the result.
    group_keys = [
        'venue_xref_id', 'city', 'concept',
        'bill_day_of_week', 'bill_date', 'bill_hour',
    ]
    bills_per_group = df.groupby(group_keys).size()
    return bills_per_group.reset_index(name='count')

# Apply the function to create final_df
# NOTE(review): `df` is rebound from the raw bill-level frame to the aggregated
# counts returned by preprocess_data. The aggregated frame has no
# `business_date` / `bill_paid_at_local` columns, so this cell cannot be re-run
# without first re-running the load cell.
df = preprocess_data(df)
print(df)

# Quick look at the distribution of the per-group bill counts.
print(df['count'].min(), df['count'].max())
print(df['count'].mean())
print(df['count'].median())
# Disabled experiment: bucket the counts into five labelled bins.
# NOTE(review): it refers to a `transaction_count` column, but the aggregated
# frame produced above only has `count`.
# bins = np.linspace(df['count'].min(), df['transaction_count'].max(), num=6)
# df['transaction_category'] = pd.cut(df['transaction_count'], bins, labels=["Very Low", "Low", "Medium", "High", "Very High"])


# Disabled exploration: restrict the data to one city / concept and count bills.
# city = "Toronto" # Change this line
# concept = "BAR" # Change this line

# print("Number of Unique Restaurants", len(df['venue_xref_id'].unique()))
# df = df.loc[df['city'] == city]
# df = df.loc[df['concept'] == concept]
# print("Number of Bills Paid in this City, in this Concept: ", len(df))

# NOTE(review): the line below uses call syntax `df.loc(...)`; boolean row
# selection needs square brackets, `df.loc[...]`.
# print("Number of Bills for a Specific Restaurant", len(df.loc(df['venue_xref_id'] == df['venue_xref_id'].unique()[0])))

                                            venue_xref_id    city  \
0       0002a1cf14e9c1acaa8255fd6777d916d3aec6bc1f3c8a...  Ottawa   
1       0002a1cf14e9c1acaa8255fd6777d916d3aec6bc1f3c8a...  Ottawa   
2       0002a1cf14e9c1acaa8255fd6777d916d3aec6bc1f3c8a...  Ottawa   
3       0002a1cf14e9c1acaa8255fd6777d916d3aec6bc1f3c8a...  Ottawa   
4       0002a1cf14e9c1acaa8255fd6777d916d3aec6bc1f3c8a...  Ottawa   
...                                                   ...     ...   
706054  ffe3cf683c3b3d6f7c1ae13deb93a0bdb9f87ffb3b658b...  Irving   
706055  ffe3cf683c3b3d6f7c1ae13deb93a0bdb9f87ffb3b658b...  Irving   
706056  ffe3cf683c3b3d6f7c1ae13deb93a0bdb9f87ffb3b658b...  Irving   
706057  ffe3cf683c3b3d6f7c1ae13deb93a0bdb9f87ffb3b658b...  Irving   
706058  ffe3cf683c3b3d6f7c1ae13deb93a0bdb9f87ffb3b658b...  Irving   

              concept  bill_day_of_week  bill_date  bill_hour  count  
0         FAST_CASUAL                 0 2024-10-21         10      1  
1         FAST_CASUAL        

In [11]:
# Formatting the data

# Training the Model

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [13]:
# One-hot encode the categorical column(s); `df` is replaced by the encoded frame.
categorical_features = ['concept']
df = pd.get_dummies(df, columns=categorical_features)

# Target variable: the number of bills in each group.
y = df['count']

# Features: everything except the target, the venue identifier, the date and the city.
X = df.drop(
    columns=['count', 'venue_xref_id', 'bill_date', 'city'],
)

print(X)
print(y)

        bill_day_of_week  bill_hour  concept_BAKERY  concept_BAR  \
0                      0         10           False        False   
1                      1         11           False        False   
2                      1         12           False        False   
3                      1         13           False        False   
4                      1         14           False        False   
...                  ...        ...             ...          ...   
706054                 6         13           False        False   
706055                 6         14           False        False   
706056                 6         11           False        False   
706057                 6         13           False        False   
706058                 6         14           False        False   

        concept_BREWERY  concept_BUFFET  concept_CAFE  \
0                 False           False         False   
1                 False           False         False   
2           

In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree random forest and fit it on the training portion
# (fit returns the estimator itself, so the two steps can be chained).
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows.
y_pred = model.predict(X_test)

# Error metrics on the held-out rows; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


Mean Absolute Error (MAE): 7.527451508956656
Root Mean Squared Error (RMSE): 11.738527204079094


In [None]:
# # Define a tolerance level (e.g., within 10% of actual value)
# tolerance = 0.10  

# # Convert actual and predicted values to NumPy arrays
# y_true = np.array(y_test)
# y_pred = model.predict(X_test)

# # Calculate absolute percentage error
# error_percentage = np.abs(y_pred - y_true) / y_true

# # Count correct predictions within tolerance
# correct_predictions = np.sum(error_percentage <= tolerance)

# # Calculate "accuracy"
# accuracy = correct_predictions / len(y_true)

# print(f"Regression Accuracy (within ±{tolerance*100}% tolerance): {accuracy:.2%}")

NameError: name 'test_data' is not defined

# Saving the Model

In [16]:
import joblib

# Save the trained model to a file
# The path is relative, so model.pkl lands in the notebook's working directory.
# NOTE(review): joblib files are pickle-based; only load model.pkl from a
# source you trust.
joblib.dump(model, 'model.pkl')

['model.pkl']

: 

: 

: 