<a href="https://colab.research.google.com/github/CharmThiekshanaPerera/Coffee-Shop-Backend/blob/main/AI_for_Your_Coffee_Shop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import joblib
import openai  # For NLP-based questions

In [2]:
# Step 1: Gather Data
file_path = "/content/drive/MyDrive/CoffeeShop/index_1.csv"  # Replace with your actual file path
data = pd.read_csv(file_path)
print("Data Loaded Successfully!\n", data.head())

Data Loaded Successfully!
          date                 datetime cash_type                 card  money  \
0  2024-03-01  2024-03-01 10:15:50.520      card  ANON-0000-0000-0001   38.7   
1  2024-03-01  2024-03-01 12:19:22.539      card  ANON-0000-0000-0002   38.7   
2  2024-03-01  2024-03-01 12:20:18.089      card  ANON-0000-0000-0002   38.7   
3  2024-03-01  2024-03-01 13:46:33.006      card  ANON-0000-0000-0003   28.9   
4  2024-03-01  2024-03-01 13:48:14.626      card  ANON-0000-0000-0004   38.7   

     coffee_name  
0          Latte  
1  Hot Chocolate  
2  Hot Chocolate  
3      Americano  
4          Latte  


In [3]:
# Step 2: Clean and Preprocess
data.drop_duplicates(inplace=True)  # Remove duplicate entries
data.dropna(inplace=True)  # Remove missing values
data['datetime'] = pd.to_datetime(data['datetime'])  # Convert datetime

In [4]:
# Convert categorical data (coffee_name, cash_type) to numeric values
data['coffee_name'] = data['coffee_name'].astype('category').cat.codes
data['cash_type'] = data['cash_type'].astype('category').cat.codes
data['hour'] = data['datetime'].dt.hour  # Extract hour for time-based analysis

In [5]:
# Step 3: Feature Extraction
features = ['coffee_name', 'cash_type', 'hour']
target = 'money'

In [6]:
# Step 4: Split Data
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Step 5: Choose an Algorithm (Random Forest)
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [8]:
# Step 6: Train the Model
model.fit(X_train, y_train)
print("Model Training Complete!")

Model Training Complete!


In [9]:
# Step 7: Hyperparameter Tuning (Simple Example)
mae = mean_absolute_error(y_test, model.predict(X_test))
print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 1.5643062271777313


In [10]:
# Step 8: Validate Performance
predictions = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 1.916585306850911


In [11]:
# Step 9: Test Accuracy
accuracy = model.score(X_test, y_test)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

Model Accuracy: 84.94%


In [12]:
# Step 10: Iterate and Refine (Save Model)
joblib.dump(model, "coffee_sales_model.pkl")

['coffee_sales_model.pkl']

In [13]:
# Step 11: Answer Questions using AI (NLP-based)
def answer_question(question):
    if "most sold coffee" in question.lower():
        coffee_counts = data['coffee_name'].value_counts()
        most_sold = coffee_counts.idxmax()
        return f"The most sold coffee is {most_sold}."
    elif "total revenue" in question.lower():
        total_revenue = data['money'].sum()
        return f"Total revenue is ${total_revenue:.2f}."
    elif "cash vs card" in question.lower():
        payment_counts = data['cash_type'].value_counts()
        return f"Payment Breakdown: {payment_counts.to_dict()}"
    else:
        return "Sorry, I don't understand the question."

In [14]:
# Example Usage
question = "What is the most sold coffee?"
print(answer_question(question))

The most sold coffee is 1.
