In [175]:
import pandas as pd

df = pd.read_csv("archive/Balaji Fast Food Sales.csv")

df.head()

Unnamed: 0,order_id,date,item_name,item_type,item_price,quantity,transaction_amount,transaction_type,received_by,time_of_sale
0,1,07-03-2022,Aalopuri,Fastfood,20,13,260,,Mr.,Night
1,2,8/23/2022,Vadapav,Fastfood,20,15,300,Cash,Mr.,Afternoon
2,3,11/20/2022,Vadapav,Fastfood,20,1,20,Cash,Mr.,Afternoon
3,4,02-03-2023,Sugarcane juice,Beverages,25,6,150,Online,Mr.,Night
4,5,10-02-2022,Sugarcane juice,Beverages,25,8,200,Online,Mr.,Evening


In [176]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   order_id            1000 non-null   int64 
 1   date                1000 non-null   object
 2   item_name           1000 non-null   object
 3   item_type           1000 non-null   object
 4   item_price          1000 non-null   int64 
 5   quantity            1000 non-null   int64 
 6   transaction_amount  1000 non-null   int64 
 7   transaction_type    893 non-null    object
 8   received_by         1000 non-null   object
 9   time_of_sale        1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 78.3+ KB


In [177]:
df.isnull().sum()

order_id                0
date                    0
item_name               0
item_type               0
item_price              0
quantity                0
transaction_amount      0
transaction_type      107
received_by             0
time_of_sale            0
dtype: int64

In [178]:
df['transaction_type'] = df['transaction_type'].fillna("Credit Card")
df.isnull().sum()

order_id              0
date                  0
item_name             0
item_type             0
item_price            0
quantity              0
transaction_amount    0
transaction_type      0
received_by           0
time_of_sale          0
dtype: int64

In [179]:
df.duplicated().sum()

0

In [180]:
df['received_by'] = df['received_by'].replace(['Mr.','Mrs.'],['Male','Female'])

In [181]:
df['date'] = df['date'].str.replace('/','-')
df['date']

0      07-03-2022
1       8-23-2022
2      11-20-2022
3      02-03-2023
4      10-02-2022
          ...    
995     3-19-2023
996     9-20-2022
997     1-26-2023
998     8-27-2022
999     5-29-2022
Name: date, Length: 1000, dtype: object

In [182]:
df['date'] = pd.to_datetime(df['date'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   order_id            1000 non-null   int64         
 1   date                1000 non-null   datetime64[ns]
 2   item_name           1000 non-null   object        
 3   item_type           1000 non-null   object        
 4   item_price          1000 non-null   int64         
 5   quantity            1000 non-null   int64         
 6   transaction_amount  1000 non-null   int64         
 7   transaction_type    1000 non-null   object        
 8   received_by         1000 non-null   object        
 9   time_of_sale        1000 non-null   object        
dtypes: datetime64[ns](1), int64(4), object(5)
memory usage: 78.3+ KB


In [183]:
unique_values = df.select_dtypes(include = 'object').apply(lambda x: x.unique())
for column, values in unique_values.items():
    print(column, values)

item_name ['Aalopuri' 'Vadapav' 'Sugarcane juice' 'Panipuri' 'Frankie' 'Sandwich'
 'Cold coffee']
item_type ['Fastfood' 'Beverages']
transaction_type ['Credit Card' 'Cash' 'Online']
received_by ['Male' 'Female']
time_of_sale ['Night' 'Afternoon' 'Evening' 'Morning' 'Midnight']


In [185]:
# Feature Engineering - Extract time-based features
# Convert date column to datetime format
df['date'] = pd.to_datetime(df['date'], format='mixed')  # Using 'mixed' format to handle different date formats

# Now we can extract time-based features
df['day_of_week'] = df['date'].dt.dayofweek
df['month'] = df['date'].dt.month
df['day_of_year'] = df['date'].dt.dayofyear
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
# Create features and target
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Select features with additional meaningful columns
# Create time mapping
time_mapping = {
    'Morning': 0,
    'Afternoon': 1,
    'Evening': 2,
    'Night': 3
}

# Convert time_of_sale to numeric values
df['time_numeric'] = df['time_of_sale'].map(time_mapping)

# Now select features
features = ['day_of_week', 'month', 'day_of_year', 'time_numeric', 'is_weekend']
X = df[features]
y = df['transaction_amount']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=features)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Create and train the model with optimized parameters
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)
rf_model.fit(X_train, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R2 Score: {r2}')

# Perform cross-validation
cv_scores = cross_val_score(rf_model, X_scaled, y, cv=5)
print(f'Cross-validation scores: {cv_scores}')
print(f'Average CV score: {cv_scores.mean()}')

def predict_staffing_needs(date, time_of_day):
    """
    Predict staffing needs based on expected sales
    date: datetime object
    time_of_day: 'Morning', 'Afternoon', 'Evening', or 'Night'
    """
    # Create feature vector
    features = pd.DataFrame({
        'day_of_week': [date.weekday()],  # Changed from day_of_week to weekday()
        'month': [date.month],
        'day_of_year': [date.timetuple().tm_yday],  # Changed from day_of_year to timetuple().tm_yday
        'time_numeric': [time_mapping[time_of_day]],
        'is_weekend': [1 if date.weekday() in [5, 6] else 0]  # Changed to use weekday()
    })
    
    # Scale the features
    features_scaled = scaler.transform(features)
    
    # Predict sales
    predicted_sales = rf_model.predict(features_scaled)[0]
    
    # Staffing logic based on sales ranges
    if predicted_sales < 100:
        return 1, predicted_sales
    elif predicted_sales < 250:
        return 2, predicted_sales
    elif predicted_sales < 400:
        return 3, predicted_sales
    elif predicted_sales < 600:
        return 4, predicted_sales
    else:
        return 5, predicted_sales

# Example usage
from datetime import datetime
test_date = datetime(2024, 3, 15)
staff_needed, predicted_sales = predict_staffing_needs(test_date, 'Evening')
print(f'\nPredicted sales: ${predicted_sales:.2f}')
print(f'Recommended staff: {staff_needed}')

# Feature importance
feature_importance = pd.DataFrame({
    'feature': features,
    'importance': rf_model.feature_importances_
})
print("\nFeature Importance:")
print(feature_importance.sort_values('importance', ascending=False))

Mean Squared Error: 47317.93910348191
R2 Score: -0.10125138637824915
Cross-validation scores: [-0.05239363 -0.17404175 -0.03758828 -0.04087787  0.00551595]
Average CV score: -0.05987711638923863

Predicted sales: $350.57
Recommended staff: 3

Feature Importance:
        feature  importance
2   day_of_year    0.599841
3  time_numeric    0.183740
0   day_of_week    0.160551
1         month    0.044223
4    is_weekend    0.011645




In [187]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import optuna

def create_advanced_features(df):
    # Time-based features
    df['hour_of_day'] = pd.Categorical(df['time_of_sale'], 
                                     categories=['Morning', 'Afternoon', 'Evening', 'Night'],
                                     ordered=True).codes
    
    # Enhanced time-based features
    df['day_of_week'] = df['date'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['month'] = df['date'].dt.month
    df['quarter'] = df['date'].dt.quarter
    df['day_of_year'] = df['date'].dt.dayofyear
    df['week_of_year'] = df['date'].dt.isocalendar().week
    
    # Sales patterns
    daily_stats = df.groupby(['date', 'time_of_sale'])['transaction_amount'].agg([
        'mean', 'count', 'std', 'min', 'max'
    ]).reset_index()
    df = df.merge(daily_stats, on=['date', 'time_of_sale'], suffixes=('', '_daily'))
    
    # Rolling statistics
    df = df.sort_values('date')
    for window in [7, 14, 30]:
        df[f'rolling_{window}d_mean'] = df.groupby('time_of_sale')['transaction_amount'].transform(
            lambda x: x.rolling(window, min_periods=1).mean())
        df[f'rolling_{window}d_std'] = df.groupby('time_of_sale')['transaction_amount'].transform(
            lambda x: x.rolling(window, min_periods=1).std())
    
    return df

def create_ensemble_model(params):
    rf = RandomForestRegressor(
        n_estimators=params['rf_n_estimators'],
        max_depth=params['rf_max_depth'],
        min_samples_split=params['rf_min_samples_split'],
        min_samples_leaf=params['rf_min_samples_leaf'],
        random_state=42,
        n_jobs=-1
    )
    
    gb = GradientBoostingRegressor(
        n_estimators=params['gb_n_estimators'],
        learning_rate=params['gb_learning_rate'],
        max_depth=params['gb_max_depth'],
        subsample=params['gb_subsample'],
        random_state=42
    )
    
    return VotingRegressor([
        ('rf', rf),
        ('gb', gb)
    ])

def objective(trial):
    params = {
        'rf_n_estimators': trial.suggest_int('rf_n_estimators', 1000, 3000),
        'rf_max_depth': trial.suggest_int('rf_max_depth', 15, 40),
        'rf_min_samples_split': trial.suggest_int('rf_min_samples_split', 2, 15),
        'rf_min_samples_leaf': trial.suggest_int('rf_min_samples_leaf', 1, 8),
        
        'gb_n_estimators': trial.suggest_int('gb_n_estimators', 200, 1000),
        'gb_learning_rate': trial.suggest_float('gb_learning_rate', 0.001, 0.1, log=True),
        'gb_max_depth': trial.suggest_int('gb_max_depth', 3, 12),
        'gb_subsample': trial.suggest_float('gb_subsample', 0.6, 1.0)
    }
    
    ensemble = create_ensemble_model(params)
    cv_scores = cross_val_score(ensemble, X_train, y_train, cv=5, scoring='r2', n_jobs=-1)
    return cv_scores.mean()

def predict_staffing_needs(model, scaler, date, time_of_day, features_list):
    """
    Predict staffing needs based on expected sales
    """
    # Create feature vector for prediction
    prediction_df = pd.DataFrame({
        'date': [date],
        'time_of_sale': [time_of_day]
    })
    
    # Create features for prediction
    prediction_df = create_advanced_features(prediction_df)
    X_pred = prediction_df[features_list].fillna(0)  # Fill NA with 0 for prediction
    
    # Scale features
    X_pred_scaled = scaler.transform(X_pred)
    
    # Predict sales
    predicted_sales = model.predict(X_pred_scaled)[0]
    
    # Determine staffing needs based on predicted sales
    if predicted_sales < 100:
        return 1, predicted_sales
    elif predicted_sales < 250:
        return 2, predicted_sales
    elif predicted_sales < 400:
        return 3, predicted_sales
    elif predicted_sales < 600:
        return 4, predicted_sales
    else:
        return 5, predicted_sales

# Main execution

if __name__ == "__main__":
    # Read and prepare data
    df = pd.read_csv("archive/Balaji Fast Food Sales.csv")
    
    # First standardize the date format by replacing '/' with '-'
    df['date'] = df['date'].str.replace('/', '-')
    
    # Then convert to datetime with 'mixed' format
    df['date'] = pd.to_datetime(df['date'], format='mixed')
    
    # Create advanced features
    df = create_advanced_features(df)
    
    # Select features
    features = [
        'day_of_week', 'month', 'day_of_year', 'hour_of_day', 'is_weekend',
        'quarter', 'week_of_year', 'mean', 'count', 'std', 'min', 'max',
        'rolling_7d_mean', 'rolling_7d_std',
        'rolling_14d_mean', 'rolling_14d_std',
        'rolling_30d_mean', 'rolling_30d_std'
    ]
    
    # Prepare data
    X = df[features].fillna(method='ffill').fillna(method='bfill')
    y = df['transaction_amount']
    
    # Scale features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled = pd.DataFrame(X_scaled, columns=features)
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X_scaled, y, test_size=0.2, random_state=42
    )
    
    # Optimize hyperparameters
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=50)
    
    # Train final model with best parameters
    best_params = study.best_params
    final_ensemble = create_ensemble_model(best_params)
    final_ensemble.fit(X_train, y_train)
    
    # Evaluate final model
    y_pred = final_ensemble.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f'Final Mean Squared Error: {mse}')
    print(f'Final R2 Score: {r2}')
    
    # Feature importance analysis
    feature_importance = pd.DataFrame({
        'feature': features,
        'importance': final_ensemble.named_estimators_['rf'].feature_importances_
    })
    print("\nFeature Importance:")
    print(feature_importance.sort_values('importance', ascending=False))
    
    # Example prediction
    future_date = pd.Timestamp('2024-03-15')
    staff_needed, predicted_sales = predict_staffing_needs(
        final_ensemble, scaler, future_date, 'Evening', features
    )
    print(f'\nPrediction for {future_date.date()} Evening:')
    print(f'Predicted sales: ${predicted_sales:.2f}')
    print(f'Recommended staff: {staff_needed}')

  X = df[features].fillna(method='ffill').fillna(method='bfill')
[I 2024-12-03 13:13:50,318] A new study created in memory with name: no-name-a33863c3-db54-47b4-9303-c4aae5d85915
[I 2024-12-03 13:13:58,819] Trial 0 finished with value: 0.6362110228482536 and parameters: {'rf_n_estimators': 2615, 'rf_max_depth': 31, 'rf_min_samples_split': 15, 'rf_min_samples_leaf': 3, 'gb_n_estimators': 901, 'gb_learning_rate': 0.013912488716267216, 'gb_max_depth': 10, 'gb_subsample': 0.6102881114021207}. Best is trial 0 with value: 0.6362110228482536.
[I 2024-12-03 13:14:04,275] Trial 1 finished with value: 0.6400894381956237 and parameters: {'rf_n_estimators': 2338, 'rf_max_depth': 20, 'rf_min_samples_split': 8, 'rf_min_samples_leaf': 6, 'gb_n_estimators': 490, 'gb_learning_rate': 0.01738396557303579, 'gb_max_depth': 10, 'gb_subsample': 0.8464399182397875}. Best is trial 1 with value: 0.6400894381956237.
[I 2024-12-03 13:14:08,456] Trial 2 finished with value: 0.6357455953387199 and parameters: {'rf_