In [None]:
!pip install streamlit
!npm install -g localtunnel
!pip install mlflow

Collecting streamlit
  Downloading streamlit-1.48.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.48.1-py3-none-any.whl (9.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m98.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.48.1
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K
added 22 packages in 3s
[1G[0K⠴[1G[0K
[1G[0K⠴[1G[0K3 packages are looking for funding
[1G[0K⠴[1G

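This notebook uses the final optimized XGBoost model (`final_optimized_xgboost_model.pkl`) and the `df_complete_one` dataframe (`df_complete_one.parquet`) produced in this Colab notebook: https://colab.research.google.com/drive/1cM_mIpB2YB-QRHYSLmMLMczeb-DHinBF?usp=sharing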

In [None]:
%%writefile app.py

Writing app.py


In [None]:
import streamlit as st
import pickle
import pandas as pd
import numpy as np
import os
import gc
import requests
import time
import threading
from google.colab import files
import sys

# --- 0. INSTALL REQUIRED PACKAGES ---
# Import pyngrok; if the import fails, install the package on the fly and retry.
try:
    from pyngrok import ngrok, conf
except ImportError:
    print("pyngrok not found. Installing now...")
    # IPython shell escape: this line is only valid when run inside a notebook cell.
    !pip install pyngrok --quiet
    # Second attempt, now that the package has been installed.
    from pyngrok import ngrok, conf

# --- ngrok auth token ---
# SECURITY: never paste the token into the notebook. It is read from the
# NGROK_AUTH_TOKEN environment variable; when that is unset and the code runs
# inside a Jupyter/Colab kernel, the user is prompted for it without echo.
# An empty string means "no token"; the tunnel step reports that case itself.
# NOTE(review): a token was previously hardcoded here and must be treated as
# leaked — revoke it in the ngrok dashboard.
NGROK_AUTH_TOKEN = os.environ.get("NGROK_AUTH_TOKEN", "").strip()
if not NGROK_AUTH_TOKEN and "ipykernel" in sys.modules:
    import getpass  # local import: only needed for the interactive fallback

    NGROK_AUTH_TOKEN = getpass.getpass("Enter your ngrok auth token: ").strip()

# --- 1. UPLOAD THE NECESSARY FILES ---
def _first_uploaded_name(uploaded, description):
    """Return the name of the first file in a `files.upload()` result.

    Args:
        uploaded: mapping of filename -> content as returned by `files.upload()`.
        description: short label of the expected file, used in the error message.

    Raises:
        RuntimeError: if nothing was uploaded (e.g. the dialog was cancelled),
            instead of the bare IndexError an empty result used to produce.
    """
    if not uploaded:
        raise RuntimeError(
            f"No {description} file was uploaded. Re-run this cell and select a file."
        )
    return next(iter(uploaded))


print("Please select the dataframe 'df_complete_one.parquet' file to upload:")
uploaded_df = files.upload()
df_filename = _first_uploaded_name(uploaded_df, "dataframe")


print("\nPlease select the optimized model 'final_optimized_xgboost_model.pkl' file to upload:")
uploaded_model = files.upload()
model_filename = _first_uploaded_name(uploaded_model, "model")

# --- 2. THE STREAMLIT APP CODE ---

# Source of the Streamlit app, rendered as an f-string so the uploaded file
# names can be baked in. Literal braces meant for the generated app are doubled.
app_code = f"""
import streamlit as st
import pickle
import pandas as pd
import numpy as np
import os
import gc
import datetime
import sys

# --- File Paths ---
# Filled in by the notebook when this file is generated. The names are inserted
# as Python literals, so names containing quotes or backslashes stay valid.
model_filename = {model_filename!r}
data_filename = {df_filename!r}

# --- Load the Model and Data ---
@st.cache_data
def load_data(data_path):
    \"\"\"Loads the main dataframe from a parquet file.

    Returns the dataframe with its 'date' column converted to datetime, or None
    (after showing an error in the app) when the file is missing or unreadable.
    \"\"\"
    if not os.path.exists(data_path):
        st.error(f"Data file '{{data_path}}' not found. Please upload it.")
        return None
    try:
        df = pd.read_parquet(data_path)
        df['date'] = pd.to_datetime(df['date'])
        return df
    except Exception as e:
        st.error(f"Error loading data: {{e}}")
        return None

@st.cache_resource
def load_model(model_path):
    \"\"\"Loads the trained model from a pickle file.

    Returns the unpickled model, or None (after showing an error in the app)
    when the file is missing or cannot be loaded.
    \"\"\"
    if not os.path.exists(model_path):
        st.error(f"Model file '{{model_path}}' not found. Please upload it.")
        return None
    try:
        # SECURITY: unpickling executes arbitrary code from the file.
        # Only load model files that come from a trusted source.
        with open(model_path, 'rb') as file:
            model = pickle.load(file)
        return model
    except Exception as e:
        st.error(f"Error loading model: {{e}}")
        return None

# --- Feature Engineering & Preprocessing ---
def preprocess_input_data(store_id, item_id, forecast_dates, df_complete):
    \"\"\"
    Finds the rows of the pre-engineered dataframe for one store and item over
    the requested dates and prepares them for prediction.

    Returns a DataFrame holding exactly MODEL_FEATURES, sorted by and indexed on
    'date' so that every prediction can be paired with the day it belongs to.
    Returns None (after showing a warning or error in the app) when no row
    matches or the features cannot be selected.
    \"\"\"
    # Define the exact features the model was trained on
    MODEL_FEATURES = [
        'unit_sales_rolling_mean_7', 'unit_sales_rolling_mean_14', 'z_score',
        'unit_sales_rolling_std_7', 'unit_sales_rolling_mean_30', 'unit_sales_lag_1',
        'is_weekend', 'unit_sales_rolling_std_14', 'unit_sales_rolling_std_30',
        'day_of_week', 'cluster_17', 'month', 'item_nbr', 'unit_sales_lag_14',
        'store_type_D', 'locale_National', 'year', 'class', 'unit_sales_lag_7',
        'transactions'
    ]

    # Convert the input dates to datetime64[ns] to match the DataFrame
    input_dates = pd.to_datetime(forecast_dates)

    # Filter the DataFrame to find the matching rows for the entire forecast period.
    # Sorting and indexing by date keeps predictions in chronological order and
    # lets the caller recover which dates were actually found.
    input_rows = df_complete[
        (df_complete['store_nbr'] == store_id) &
        (df_complete['item_nbr'] == item_id) &
        (df_complete['date'].isin(input_dates))
    ].sort_values('date').set_index('date')

    # Check if a row was found
    if input_rows.empty:
        st.warning("No data found for the selected store, item, and date combination.")
        return None

    # Add any missing features with a default value of 0 to match the model's expectations
    for feature in MODEL_FEATURES:
        if feature not in input_rows.columns:
            input_rows[feature] = 0

    # Explicitly select only the features the model expects
    try:
        features = input_rows[MODEL_FEATURES]
    except KeyError as e:
        st.error(f"Error selecting features: {{e}}. There might be a mismatch between the data and the model's expected features.")
        return None

    return features

# --- Prediction Function ---
def predict(model, input_data):
    \"\"\"Makes a prediction using the loaded model.

    Returns the model output for input_data, or a single 0.0 when input_data is
    None or empty.
    \"\"\"
    if input_data is not None and not input_data.empty:
        prediction = model.predict(input_data)
        return prediction
    return np.array([0.0])

# --- Main App Function ---
def main():
    \"\"\"Renders the forecasting UI and runs a prediction on demand.\"\"\"
    st.title("Corporación Favorita Sales Forecasting")
    st.markdown("---")

    # Load data and model
    st.info("Loading model and data...")
    df_complete = load_data(data_filename)
    model = load_model(model_filename)

    # Stop the app if files are missing
    if df_complete is None or model is None:
        return

    st.success("Files loaded successfully!")

    # --- UI components for inputs ---
    st.subheader("Select Store and Item")
    st.markdown("---")

    # Get unique stores and items from the loaded data for the selectbox options
    unique_stores = sorted(df_complete['store_nbr'].unique().tolist())
    unique_items = sorted(df_complete['item_nbr'].unique().tolist())

    # Safely get the default index for store and item
    default_store = 24
    default_item = 1047679
    default_store_index = unique_stores.index(default_store) if default_store in unique_stores else 0
    default_item_index = unique_items.index(default_item) if default_item in unique_items else 0

    # Create the select boxes to allow the user to pick a store and item
    store_id = st.selectbox("Select a Store", unique_stores, index=default_store_index)
    item_id = st.selectbox("Select an Item", unique_items, index=default_item_index)

    st.write(f"**Selected Store:** {{store_id}}")
    st.write(f"**Selected Item:** {{item_id}}")

    # Define date range based on the dataset
    min_date = df_complete['date'].min().date()
    max_date = df_complete['date'].max().date()

    forecast_date = st.date_input("Forecast Date", value=max_date, min_value=min_date, max_value=max_date)

    st.markdown("---")
    st.subheader("Forecast mode")
    forecast_mode = st.radio(" ", ("Single day", "Next N days"), key="forecast_mode")

    n_days = 1
    if forecast_mode == "Next N days":
        n_days = st.slider("N days", 1, 30, 7)

    st.markdown("---")

    # Run prediction when button is clicked
    if st.button("Get Forecast"):
        with st.spinner("Generating forecast..."):

            if forecast_mode == "Single day":
                forecast_dates = [forecast_date]
            else:
                forecast_dates = [forecast_date + datetime.timedelta(days=i) for i in range(1, n_days + 1)]

            input_data = preprocess_input_data(store_id, item_id, forecast_dates, df_complete)

            if input_data is not None:
                prediction = predict(model, input_data)

                # Pair every prediction with the date of the row that produced it.
                # Only dates present in the dataset yield a row, so the requested
                # list can be longer than the predictions; building the table
                # from the requested dates would then fail on mismatched lengths.
                matched_dates = [ts.date() for ts in input_data.index]
                if len(matched_dates) < len(forecast_dates):
                    st.warning(
                        f"Only {{len(matched_dates)}} of the {{len(forecast_dates)}} "
                        "requested days exist in the dataset; showing those."
                    )

                # Create a DataFrame for the forecast
                forecast_df = pd.DataFrame({{
                    'date': matched_dates,
                    'prediction': prediction
                }})

                # --- PLOTTING ---
                # Get historical data for plotting
                historical_data = df_complete[
                    (df_complete['store_nbr'] == store_id) &
                    (df_complete['item_nbr'] == item_id) &
                    (df_complete['date'] < pd.to_datetime(forecast_df['date'].min()))
                ][['date', 'unit_sales']].copy()

                # Prepare the final DataFrame for plotting
                plot_data = pd.DataFrame(columns=['date', 'Actual (history)', 'Forecast'])
                plot_data['date'] = pd.to_datetime(historical_data['date'].tolist() + forecast_df['date'].tolist())
                plot_data['Actual (history)'] = historical_data['unit_sales'].tolist() + [np.nan] * len(forecast_df)
                plot_data['Forecast'] = [np.nan] * len(historical_data) + forecast_df['prediction'].tolist()

                # Set a wider range for the plot data to see some history
                plot_data_start_date = plot_data['date'].max() - pd.Timedelta(days=180)
                plot_data_filtered = plot_data[plot_data['date'] >= plot_data_start_date]

                st.subheader("Forecast Results")
                if forecast_mode == "Single day":
                    st.write(f"Predicted sales for {{forecast_dates[0]}}: **{{prediction[0]:.2f}}**")
                else:
                    st.write(f"Predicted {{len(matched_dates)}} days: {{matched_dates[0]}} → {{matched_dates[-1]}}.")

                st.line_chart(plot_data_filtered.set_index('date'))

                st.subheader("Forecast Table")
                st.dataframe(forecast_df, use_container_width=True)

                # --- DOWNLOAD BUTTON ---
                csv = forecast_df.to_csv(index=False).encode('utf-8')
                st.download_button(
                    label="Download forecast as CSV",
                    data=csv,
                    file_name=f'forecast_{{store_id}}_{{item_id}}_{{forecast_date}}_{{n_days}}days.csv',
                    mime='text/csv'
                )
            else:
                st.error("Could not generate a forecast. The selected combination of store, item, and date does not exist in the dataset.")

if __name__ == "__main__":
    main()
"""

# Save the app code to a file.
# The encoding is set explicitly because the app source contains non-ASCII text
# ("Corporación", "→"); the platform default encoding is not guaranteed to be
# able to write it, or to match what Python expects when it reads app.py back.
with open('app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)

# --- 3. RUN THE APP WITH NGROK ---
# Kill whatever process currently holds TCP port 8501 (the port polled and
# tunnelled further down), so a server left over from an earlier run does not
# conflict with the new one. IPython shell escape: notebook-only syntax.
!fuser -k 8501/tcp
# Kill all running ngrok tunnels to free up quota
ngrok.kill()

# A function to run the Streamlit app in a separate thread
def run_streamlit():
    """Launch `streamlit run app.py` as a background process and return at once.

    The server is started without a shell, through the interpreter running this
    notebook, so the Streamlit that was installed into the kernel's environment
    is the one used. The port is set explicitly to 8501 because the readiness
    check and the ngrok tunnel both assume that port, and headless mode keeps
    Streamlit from waiting on interactive prompts or trying to open a browser.

    Returns:
        None. The child process is left running in the background.
    """
    import subprocess  # local import: only this launcher needs it

    subprocess.Popen([
        sys.executable, "-m", "streamlit", "run", "app.py",
        "--server.port", "8501",
        "--server.headless", "true",
    ])

# Start the Streamlit app
print("Starting Streamlit app...")
thread = threading.Thread(target=run_streamlit)
thread.daemon = True
thread.start()

# Wait for streamlit to start and open the ngrok tunnel
print("Waiting for Streamlit app to be available on port 8501...")
for i in range(10):  # 10 attempts, 5 seconds apart
    try:
        # The timeout keeps a hung server from blocking the cell forever.
        response = requests.get("http://localhost:8501", timeout=5)
        if response.status_code == 200:
            print("Streamlit app is ready!")
            break
    except requests.exceptions.RequestException:
        # Connection refused, timeout, etc.: the server is not up yet.
        pass
    # Reached on any unsuccessful attempt, including a non-200 response, so
    # every retry is spaced out instead of firing immediately.
    print(f"Attempt {i+1}/10 failed. Retrying in 5 seconds...")
    time.sleep(5)
else:
    # Raise instead of calling exit(): in a notebook exit() asks the kernel to
    # shut down, which would discard the uploaded files and all other state.
    raise RuntimeError("Streamlit app did not become available. Please try running the code again.")

# Open a public ngrok tunnel to the local Streamlit server on port 8501.
# Failures are reported rather than raised, so the local app keeps running.
try:
    if NGROK_AUTH_TOKEN:
        conf.get_default().auth_token = NGROK_AUTH_TOKEN
    else:
        print("ngrok auth token is not set. The tunnel may not work.")

    # ngrok.connect returns an NgrokTunnel object; its `public_url` attribute
    # holds the address to visit. Printing the object itself would show its
    # repr (NgrokTunnel: "https://..." -> "http://...") instead of a usable URL.
    public_url = ngrok.connect(addr="8501", proto="http")
    print("\n-------------------------------------------------------------")
    print(f"Your app is ready! Please visit this URL in your browser: {public_url.public_url}")
    print("-------------------------------------------------------------\n")

except Exception as e:
    print(f"An error occurred while starting ngrok: {e}")



pyngrok not found. Installing now...
Please select the dataframe 'df_complete_one.parquet' file to upload:


Saving df_complete_one.parquet to df_complete_one.parquet

Please select the optimized model 'final_optimized_xgboost_model.pkl' file to upload:


Saving final_optimized_xgboost_model.pkl to final_optimized_xgboost_model.pkl
Starting Streamlit app...
Waiting for Streamlit app to be available on port 8501...
Attempt 1/10 failed. Retrying in 5 seconds...
Streamlit app is ready!

-------------------------------------------------------------
Your app is ready! Please visit this URL in your browser: NgrokTunnel: "https://5a48552a9154.ngrok-free.app" -> "http://localhost:8501"
-------------------------------------------------------------

