<a href="https://colab.research.google.com/github/Edenshmuel/PapaJohns_Data_Science_Project/blob/Nadav/Predicting_New_Categories.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing libraries and reading data

In [None]:
# !pip install fuzzywuzzy[speedup]

In [1]:
from google.colab import drive
import os
import shutil
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from fuzzywuzzy import process

In [2]:
def reconnect_to_drive():
    # Disconnect if there is an existing connection
    try:
        drive.flush_and_unmount()
        print("📤 Previous connection to Drive was lost")
    except:
        print("ℹ️ There was no previous connection")

    # Remove the /content/drive folder if it exists
    drive_mount_point = '/content/drive'
    if os.path.exists(drive_mount_point):
        shutil.rmtree(drive_mount_point)
        print("🗑️ Old mount point removed")

    # Connect to Drive
    drive.mount(drive_mount_point)
    print("📂 Connected to Drive")

reconnect_to_drive()

Drive not mounted, so nothing to flush and unmount.
📤 Previous connection to Drive was lost
Mounted at /content/drive
📂 Connected to Drive


In [3]:
cleaned_data  = pd.read_csv('/content/drive/MyDrive/Final_Project_PapaJohns/cleaned_data.csv')
category_mapping = pd.read_csv('/content/drive/MyDrive/Final_Project_PapaJohns/category_mapping.csv')
desc_encoding_map = pd.read_csv('/content/drive/MyDrive/Final_Project_PapaJohns/desc_encoding_map.csv')

In [4]:
# Replace NaN with "לא מוגדר"
category_mapping['קטגוריה'] = category_mapping['קטגוריה'].fillna('לא מוגדר')

In [5]:
# # שמירה רק של פריטים שנמכרו לפחות 50 פעמים
# desc_counts = cleaned_data['clean_desc_encoded'].value_counts()
# valid_codes = desc_counts[desc_counts >= 50].index
# filtered_cleaned_data = cleaned_data[cleaned_data['clean_desc_encoded'].isin(valid_codes)]

In [6]:
# print(f"📊 לפני סינון: {len(cleaned_data)} שורות")
# print(f"✅ אחרי סינון: {len(filtered_cleaned_data)} שורות")

## Mergers and training table creation

In [7]:
merged = cleaned_data.merge(desc_encoding_map, left_on='clean_desc_encoded', right_on='code', how='left')
merged = merged.merge(category_mapping, left_on='category_encoded', right_on='קוד', how='left')

### 🔍 Why we use only `item_description` as input

In this classification task, the goal is to predict the **category of a new product** based solely on its textual description — for example: `"Coca Cola"`, `"Papa Deal"`, or `"Greek Salad"`.

We focus only on `item_description` for the following reasons:

- ✅ It is the **only available information** when a **new product** is added to the system.
- ✅ It contains meaningful linguistic patterns (e.g., "pizza", "drink", "sauce") that are useful for text classification.
- ❌ We ignore features like `clean_desc_encoded`, `quantity`, or `date`, since they are either:
  - Not available for new products,
  - Or irrelevant for categorizing based on name/description alone.

This approach ensures that the model:
- Can generalize to products it has **never seen before**,
- And works **in real-time**, using only the name provided during product creation.

In [8]:
# Retrieving description and category
model_data = merged[['Unnamed: 0', 'קטגוריה']].rename(columns={
    'Unnamed: 0': 'item_description',
    'קטגוריה': 'category'}).dropna()

In [9]:
# Removing the 'לא מוגדר' category from the training
model_data = model_data[model_data['category'] != 'לא מוגדר']

In [10]:
additional_examples = pd.DataFrame({
    'item_description': [
        # Desserts
        'עוגת שוקולד חמה', 'עוגת גבינה ניו יורק', 'קינוח שוקולד אישי', 'טירמיסו', 'בראוניז', 'עוגה', 'קינוח',

        # Drinks
        'קולה דיאט', 'מים מינרליים', 'ספרייט בקבוק', 'תפוחים', 'פפסי', 'מים', 'אקסל', 'נביעות', 'זירו',

        # Sauces
        'שום שמיר', 'חריף', 'סלסה', 'ברביקיו מעושן', 'אלף האיים', 'רוטב', 'טבסקו',

        # Toppings
        'בולגרית', 'פטריות טריות', 'זיתים ירוקים', 'אננס', 'עגבניות מיובשות', 'גבינה', 'בצל',

        # Main courses
        'פיצה מרגריטה', 'פיצה טוסקנית 14', 'פיצה טבעונית אישית', 'פיצה גבינות מיוחדת', 'פיצה נקניק חריף דקה', 'הטבעונית', 'מיוחדת', 'כשרה לפסח',

        # Other
        'עסקית לילה', 'פיצה זוגית במבצע', 'קופון ראשון', 'שובר הנחה', 'מארז פיצה + שתייה'
    ],
    'category': [
        # Desserts ×7
        'קינוח', 'קינוח', 'קינוח', 'קינוח', 'קינוח', 'קינוח', 'קינוח',
        # Drinks ×9
        'שתייה', 'שתייה', 'שתייה', 'שתייה', 'שתייה', 'שתייה', 'שתייה', 'שתייה', 'שתייה',
        # Sauces ×7
        'רוטב', 'רוטב', 'רוטב', 'רוטב', 'רוטב', 'רוטב', 'רוטב',
        # Toppings ×7
        'תוספת', 'תוספת', 'תוספת', 'תוספת', 'תוספת', 'תוספת', 'תוספת',
        #  Main courses ×8
        'מנה עיקרית', 'מנה עיקרית', 'מנה עיקרית', 'מנה עיקרית', 'מנה עיקרית', 'מנה עיקרית', 'מנה עיקרית', 'מנה עיקרית',
        # Other ×5
        'אחר', 'אחר', 'אחר', 'אחר', 'אחר'
    ]
})

In [11]:
# Connection to existing data
model_data = pd.concat([model_data, additional_examples], ignore_index=True)

In [12]:
print(model_data['category'].value_counts(normalize=True))

category
מנה עיקרית    0.436732
תוספת         0.309246
רוטב          0.086999
שתייה         0.082705
אחר           0.057660
קינוח         0.026659
Name: proportion, dtype: float64


## Mapping categories from the file

In [13]:
# Category Mapping: Text to Code
category_to_index = dict(zip(category_mapping['קטגוריה'], category_mapping['קוד']))
index_to_category = {v: k for k, v in category_to_index.items()}

In [14]:
# Filter category "לא מוגדר" from mapping
category_to_index.pop('לא מוגדר', None)
index_to_category.pop(0, None)

'לא מוגדר'

## Balancing categories

In [15]:
samples_per_category = 30000
balanced_model_data = model_data.groupby('category', group_keys=False).apply(
    lambda x: x.sample(n=min(len(x), samples_per_category), random_state=42)
).reset_index(drop=True)

  balanced_model_data = model_data.groupby('category', group_keys=False).apply(


In [16]:
print(balanced_model_data['category'].value_counts(normalize=True))

category
מנה עיקרית    0.254050
תוספת         0.254050
רוטב          0.168469
שתייה         0.160153
אחר           0.111655
קינוח         0.051623
Name: proportion, dtype: float64


## data fragmentation

In [17]:
X = balanced_model_data['item_description']

# Encoding the categories from names to numbers
y_encoded = balanced_model_data['category'].map(category_to_index)

# Adaptation to XGBoost: Making the encoding start from 0
y_adjusted = y_encoded - 1

In [18]:
# Split into Train/Test
X_train, X_test, y_train, y_test = train_test_split(X, y_adjusted, test_size=0.2, random_state=42, stratify=y_adjusted)

## Model building and training

In [19]:
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'))])

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



## Predicting and returning category names

In [20]:
y_pred = model.predict(X_test)

In [21]:
# Restore the original encoding
y_pred_orig = y_pred + 1
y_test_orig = y_test + 1

In [22]:
# Convert back to category names
y_pred_labels = [index_to_category[i] for i in y_pred_orig]
y_test_labels = [index_to_category[i] for i in y_test_orig]

print(classification_report(y_test_labels, y_pred_labels))

              precision    recall  f1-score   support

         אחר       1.00      1.00      1.00      2637
  מנה עיקרית       1.00      1.00      1.00      6000
       קינוח       1.00      1.00      1.00      1219
        רוטב       0.99      1.00      0.99      3979
       שתייה       1.00      0.99      1.00      3783
       תוספת       1.00      1.00      1.00      6000

    accuracy                           1.00     23618
   macro avg       1.00      1.00      1.00     23618
weighted avg       1.00      1.00      1.00     23618



## New product prediction function (including security)

### 🧠 How the Model Handles Unknown or New Categories

This classification model is designed to predict the category of a product **based solely on its description** (`"Greek Salad"`, `"Papa Deal"`, `"Coca Cola"`).

#### 🟢 Standard Behavior:
- The model uses a trained `TF-IDF + XGBoost` pipeline to predict the **most likely category** from the known set (`'Main Dish'`, `'Drink'`, `'Dessert'`).
- These categories are based on the `category_mapping.csv` file and aligned with the internal system codes (1–6).

#### ⚠️ Special Handling for New/Unknown Products:
- If the model is **not confident enough** in its prediction (the top probability is **below a certain threshold**, such as 0.6),  
  it will **not return a specific category**.
- Instead, it returns a special label: **"⚠️ Category not recognized – Unclassified"**.

#### ✅ Why this is important:
- It ensures that **new or unusual products** (like limited-time offers or misspelled items) are not forced into incorrect categories.
- It also allows system operators to **review and manually classify** such items, or update the model over time.

> In summary: the model is capable of both confidently classifying known products and flagging new or unclear ones as "Unclassified".

In [23]:
def predict_from_input(model, index_to_category, additional_examples, threshold=0.6, fuzz_threshold=80):
    from fuzzywuzzy import process
    import re

    def clean_hebrew_text(text):
        text = text.strip().lower()
        text = re.sub(r'^[הוי]', '', text)  # Removing a single prefix letter
        text = re.sub(r'\s+', ' ', text)
        return text

    # Create a dictionary from keywords after cleaning
    keyword_map = {
        clean_hebrew_text(desc): cat
        for desc, cat in zip(additional_examples['item_description'], additional_examples['category'])
    }

    print("🔍 Enter a product description (or type 'סיום' to quit):")
    while True:
        user_input = input("📝 Description: ").strip()
        if user_input.lower() == 'סיום':
            print("👋 Exiting prediction mode.")
            break

        cleaned_input = clean_hebrew_text(user_input)

        # Step 1: Precise fit after cleaning
        if cleaned_input in keyword_map:
            print(f"✅ [מילות מפתח] Predicted category: {keyword_map[cleaned_input]}")
            continue

        # Step 2: fuzzy matching on cleaned words
        best_match, score = process.extractOne(cleaned_input, list(keyword_map.keys()))
        if score >= fuzz_threshold:
            print(f"✅ [התאמה קרובה] Predicted category: {keyword_map[best_match]} (match: {best_match}, score: {score})")
        else:
            # Step 3: Using the model
            probas = model.predict_proba([user_input])[0]
            max_proba = np.max(probas)
            predicted_index = np.argmax(probas)
            original_index = predicted_index + 1

            if max_proba < threshold:
                print("⚠️ Category not recognized – Unclassified")
            else:
                category = index_to_category[original_index]
                print(f"✅ [מודל] Predicted category: {category} (Confidence: {max_proba:.2f})")

## Example of a prediction:

In [24]:
predict_from_input(model, index_to_category, additional_examples, threshold=0.6)

🔍 Enter a product description (or type 'סיום' to quit):
📝 Description: עוגת שוקולד
✅ [התאמה קרובה] Predicted category: קינוח (match: עוגת שוקולד חמה, score: 95)
📝 Description: הטיבעונית
✅ [התאמה קרובה] Predicted category: מנה עיקרית (match: טבעונית, score: 93)
📝 Description: רוטב סלסה
✅ [התאמה קרובה] Predicted category: רוטב (match: סלסה, score: 90)
📝 Description: סיום
👋 Exiting prediction mode.
