In [None]:
pip install streamlit pandas prophet plotly

In [2]:
import pandas as pd
from prophet import Prophet
import plotly.graph_objs as go
from plotly.offline import plot
import plotly.tools as tls
import os
import pickle
import numpy as np
from scipy.stats import ttest_ind
import streamlit as st

# Load the data
raw_df = pd.read_csv(r"c:\Users\avrahamma\Documents\School\AI_for_social_good\paris_bread_sales - Sheet2.csv")

# Prepare the data for Prophet
df = raw_df[["date", "filled_sold_bread", "day", "temperature_2m_mean", "precipitation_sum_mm"]].copy()  # Use .copy() to avoid SettingWithCopyWarning
df.columns = ["ds", "y", "day", "temperature_2m_mean", "precipitation_sum_mm"]  # Rename the columns to match Prophet's requirements

# Convert the date to a datetime object
df.loc[:, "ds"] = pd.to_datetime(df["ds"], format="%d/%m/%Y")

# Convert the sales to a numeric object
df.loc[:, "y"] = pd.to_numeric(df["y"])

# One-hot encode the "day" column
day_dummies = pd.get_dummies(df["day"], prefix="day")
df = pd.concat([df, day_dummies], axis=1)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from scipy.stats import ttest_ind

# feeling the data

# calculate and print the average of bread sales for each day of the week
# Ensure the "y" column contains only numeric data
df["y"] = pd.to_numeric(df["y"], errors='coerce')

# Calculate and print the average of bread sales for each day of the week
day_avg = df.groupby("day")["y"].mean().reset_index()
day_avg.columns = ["day", "y"]
print("Sales average per day of the week:", day_avg)

# understanding the relationship between the sales and the temperature
# Ensure the "temperature_2m_mean" column contains only numeric data
df["temperature_2m_mean"] = pd.to_numeric(df["temperature_2m_mean"], errors='coerce')

# Calculate and print the correlation between the sales and the temperature
temperature_corr = df["y"].corr(df["temperature_2m_mean"])
print("Correlation between temperature and sales:", temperature_corr)

# Ensure the "precipitation_sum_mm" column contains only numeric data
df["precipitation_sum_mm"] = pd.to_numeric(df["precipitation_sum_mm"], errors='coerce')

# Calculate and print the correlation between the sales and the precipitation
precipitation_corr = df["y"].corr(df["precipitation_sum_mm"])
print("Correlation between precipitation and sales:", precipitation_corr)

# check how sales are affected by extreme cold days
# Step 1: Define extreme cold days (bottom 5% of temperature)
cold_threshold = df['temperature_2m_mean'].quantile(0.05)  # Get the 5th percentile value
extreme_cold_days = df[df['temperature_2m_mean'] <= cold_threshold]  # Filter for extreme cold days
normal_days = df[df['temperature_2m_mean'] > cold_threshold]  # Filter for the rest of the days

# Step 2: Calculate average sales for both groups
avg_sales_cold = extreme_cold_days['y'].mean()
avg_sales_normal = normal_days['y'].mean()

print(f"Average sales on extreme cold days: {avg_sales_cold}")
print(f"Average sales on normal days: {avg_sales_normal}")

# Step 3: Compare the averages with a t-test (Optional)
# Null hypothesis: No difference in sales between extreme cold and normal days
t_stat, p_value = ttest_ind(extreme_cold_days['y'], normal_days['y'], equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Interpretation
if p_value < 0.05:
    print("There is a statistically significant difference in sales on extreme cold days.")
else:
    print("There is no statistically significant difference in sales on extreme cold days.")

# check how sales are affected by extreme rainy days
# Step 1: Define extreme rainy days (top 5% of precipitation)
rainy_threshold = df['precipitation_sum_mm'].quantile(0.99)  # Get the 95th percentile value

extreme_rainy_days = df[df['precipitation_sum_mm'] >= rainy_threshold]  # Filter for extreme rainy days
normal_days = df[df['precipitation_sum_mm'] < rainy_threshold]  # Filter for the rest of the days

# Step 2: Calculate average sales for both groups
avg_sales_rainy = extreme_rainy_days['y'].mean()
avg_sales_normal = normal_days['y'].mean()

print(f"Average sales on extreme rainy days: {avg_sales_rainy}")
print(f"Average sales on normal days: {avg_sales_normal}")

# Step 3: Compare the averages with a t-test (Optional)
# Null hypothesis: No difference in sales between extreme rainy and normal days
t_stat, p_value = ttest_ind(extreme_rainy_days['y'], normal_days['y'], equal_var=False)

print(f"T-statistic: {t_stat}, P-value: {p_value}")

# Interpretation
if p_value < 0.05:
    print("There is a statistically significant difference in sales on extreme rainy days.")
else:
    print("There is no statistically significant difference in sales on extreme rainy days.")

# export the data as csv
# df.to_csv(r"c:\Users\avrahamma\Documents\School\AI_for_social_good\paris_bread_sales_prophet.csv", index=False)

Sales average per day of the week:          day           y
0     friday  526.467391
1     monday  539.384615
2   saturday  687.934066
3     sunday  916.626374
4   thursday  503.098901
5    tuesday  475.120879
6  wednesday  461.428571
Correlation between temperature and sales: 0.4983014746258398
Correlation between precipitation and sales: -0.06072179712236452
Average sales on extreme cold days: 355.6296296296296
Average sales on normal days: 597.2831423895253
T-statistic: nan, P-value: nan
There is no statistically significant difference in sales on extreme cold days.
Average sales on extreme rainy days: 387.6666666666667
Average sales on normal days: 587.9984251968503
T-statistic: nan, P-value: nan
There is no statistically significant difference in sales on extreme rainy days.


In [4]:
# First, identify the last date with actual sales data
last_actual_date = df[df['y'].notna()]['ds'].max()
print(f"Last date with actual sales data: {last_actual_date}")

# Split the data for training - use only data up to the last actual date
train_df = df[df['ds'] <= last_actual_date].copy()

# Initialize and train the model on training data only
model = Prophet()
# Add the regressors
for col in day_dummies.columns:
    model.add_regressor(col)
model.add_regressor('temperature_2m_mean')
model.add_regressor('precipitation_sum_mm')

# Add custom seasonality
model.add_seasonality(name='weekly', period=7, fourier_order=3)
model.add_seasonality(name='yearly', period=365.25, fourier_order=10)

# Train the model
model.fit(df)

# Save the model
with open('prophet_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Create future dataframe that goes exactly one year ahead from the last actual date
future = model.make_future_dataframe(periods=365)
prediction_end_date = last_actual_date + pd.DateOffset(days=365)
future = future[future['ds'] <= prediction_end_date]

# Add the regressors to the future DataFrame
# Add day dummies to the future dataframe
future = pd.concat([future, day_dummies.reindex(future.index, fill_value=0)], axis=1)

# Add temperature data to the future dataframe
if 'temperature_2m_mean' in df.columns:
    future['temperature_2m_mean'] = df['temperature_2m_mean'].reindex(future.index, fill_value=df['temperature_2m_mean'].mean())
else:
    raise ValueError("Temperature data is missing for future predictions. Please provide the temperature data.")

# Add precipitation data to the future dataframe
if 'precipitation_sum_mm' in df.columns:
    future['precipitation_sum_mm'] = df['precipitation_sum_mm'].reindex(future.index, fill_value=df['precipitation_sum_mm'].mean())
else:
    raise ValueError("Precipitation data is missing for future predictions. Please provide the precipitation data.")

# Make predictions
forecast = model.predict(future)

Last date with actual sales data: 2022-09-30 00:00:00


12:39:38 - cmdstanpy - INFO - Chain [1] start processing
12:39:39 - cmdstanpy - INFO - Chain [1] done processing


In [5]:
# Plot the forecast
fig = model.plot(forecast)

# Convert the Matplotlib figure to a Plotly figure
plotly_fig = tls.mpl_to_plotly(fig)

# Display the Plotly figure
plot(plotly_fig)

print(forecast.head())
print(forecast.tail())

# Save the forecast to a CSV file
# forecast_path = r"c:\Users\avrahamma\Documents\School\AI_for_social_good\prophet_forecast.csv"
# forecast.to_csv(forecast_path, index=False)


Dang! That path collection is out of this world. I totally don't know what to do with it yet! Plotly can only import path collections linked to 'data' coordinates



          ds       trend  yhat_lower  yhat_upper  trend_lower  trend_upper  \
0 2021-01-01  557.058870  226.530040  582.662648   557.058870   557.058870   
1 2021-01-02  557.196089  379.915668  725.570451   557.196089   557.196089   
2 2021-01-03  557.333309  611.877114  954.203684   557.333309   557.333309   
3 2021-01-04  557.470528  223.294497  566.576466   557.470528   557.470528   
4 2021-01-05  557.607747  147.558526  493.754539   557.607747   557.607747   

   additive_terms  additive_terms_lower  additive_terms_upper  day_friday  \
0     -155.313577           -155.313577           -155.313577   20.986016   
1       -3.236207             -3.236207             -3.236207    0.000000   
2      221.299412            221.299412            221.299412    0.000000   
3     -164.640155           -164.640155           -164.640155    0.000000   
4     -237.577698           -237.577698           -237.577698    0.000000   

   ...      weekly  weekly_lower  weekly_upper      yearly  yearly_l

In [7]:
import streamlit as st
import pandas as pd
from prophet import Prophet
import datetime
import plotly.graph_objects as go

def load_model():
    # Load the trained Prophet model
    model = Prophet()
    # Add your regressors here as in your notebook
    model.add_regressor('temperature_2m_mean')
    model.add_regressor('precipitation_sum_mm')
    for day in ['day_friday', 'day_monday', 'day_saturday', 
                'day_sunday', 'day_thursday', 'day_tuesday', 'day_wednesday']:
        model.add_regressor(day)
    return model

def create_future_df(date, temperature, precipitation):
    # Create a DataFrame for prediction
    future = pd.DataFrame({
        'ds': [date],
        'temperature_2m_mean': [temperature],
        'precipitation_sum_mm': [precipitation]
    })
    
    # Add day dummies
    day_name = date.strftime('%A').lower()
    for day in ['friday', 'monday', 'saturday', 'sunday', 'thursday', 'tuesday', 'wednesday']:
        future[f'day_{day}'] = 1 if day == day_name else 0
    
    return future

def main():
    st.title("🥖 Bread Sales Predictor")
    
    # Sidebar for inputs
    st.sidebar.header("Input Parameters")
    
    # Date selector (defaulting to tomorrow)
    tomorrow = datetime.date.today() + datetime.timedelta(days=1)
    date = st.sidebar.date_input("Select Date", value=tomorrow)
    
    # Temperature input
    temperature = st.sidebar.slider(
        "Temperature (°C)",
        min_value=-10.0,
        max_value=40.0,
        value=20.0,
        step=0.5
    )
    
    # Precipitation input
    precipitation = st.sidebar.slider(
        "Precipitation (mm)",
        min_value=0.0,
        max_value=50.0,
        value=0.0,
        step=0.5
    )
    
    # Load model
    try:
        model = load_model()
        
        # Create future DataFrame
        future = create_future_df(pd.to_datetime(date), temperature, precipitation)
        
        # Make prediction
        forecast = model.predict(future)
        
        # Main content
        st.header("Prediction Results")
        
        # Display the predicted amount with confidence interval
        col1, col2, col3 = st.columns(3)
        
        with col1:
            st.metric(
                label="Predicted Bread Loaves",
                value=f"{int(forecast['yhat'].iloc[0])}",
                delta=None
            )
        
        with col2:
            st.metric(
                label="Lower Bound",
                value=f"{int(forecast['yhat_lower'].iloc[0])}",
                delta=None
            )
            
        with col3:
            st.metric(
                label="Upper Bound",
                value=f"{int(forecast['yhat_upper'].iloc[0])}",
                delta=None
            )
        
        # Display additional information
        st.subheader("Prediction Details")
        info_col1, info_col2 = st.columns(2)
        
        with info_col1:
            st.info(f"Date: {date.strftime('%A, %B %d, %Y')}")
            st.info(f"Temperature: {temperature}°C")
            
        with info_col2:
            st.info(f"Precipitation: {precipitation}mm")
            confidence_range = f"±{int((forecast['yhat_upper'].iloc[0] - forecast['yhat_lower'].iloc[0])/2)} loaves"
            st.info(f"Confidence Range: {confidence_range}")
        
        # Add historical context
        st.subheader("Historical Context")
        st.write(f"This prediction is based on historical sales data and takes into account:")
        st.write("- Day of the week patterns")
        st.write("- Temperature effects")
        st.write("- Precipitation impacts")
        st.write("- Seasonal trends")
        
    except Exception as e:
        st.error(f"An error occurred: {str(e)}")
        st.write("Please ensure the model file is properly loaded and all required dependencies are installed.")

if __name__ == "__main__":
    main()