In [5]:
import gradio as gr
import joblib

# Load the vectorizer and KMeans model
vectorizer = joblib.load("vectorizer.pkl")
kmeans_model = joblib.load("kmeans_model.pkl")

# Define cluster names
cluster_names = {
    0: "Entry-Level and Kids Fire Tablets",
    1: "Batteries, Laptop Gear, and Basic Accessories",
    2: "Streaming Devices and E-Readers",
    3: "Advanced E-Readers and Smart Assistants",
    4: "Echo Speakers and Smart Home Hubs"
}

# Prediction function
def predict_cluster(text):
    vector = vectorizer.transform([text])
    cluster_num = kmeans_model.predict(vector)[0]
    cluster_name = cluster_names.get(cluster_num, "Unknown Cluster")
    return f"📦 Classification: {cluster_name}"

# Gradio interface
iface = gr.Interface(
    fn=predict_cluster,
    inputs="text",
    outputs="text",
    title="Text to Product Category Classifier",
    description="Enter a product description or review and the model will predict which product category it belongs to."
)

iface.launch()

* Running on local URL:  http://127.0.0.1:7865

To create a public link, set `share=True` in `launch()`.




In [3]:
import gradio as gr
import numpy as np
import joblib

# تحميل الملفات
embeddings = np.load("product_embeddings.npy")
kmeans_model = joblib.load("kmeans_model.pkl")

# أسماء الكلاسترات
cluster_names = {
    0: "Entry-Level and Kids Fire Tablets",
    1: "Batteries, Laptop Gear, and Basic Accessories",
    2: "Streaming Devices and E-Readers",
    3: "Advanced E-Readers and Smart Assistants",
    4: "Echo Speakers and Smart Home Hubs"
}

# دالة التنبؤ
def predict_cluster_from_index(index):
    if index is None:
        return "⚠️ الرجاء إدخال رقم المنتج."

    try:
        index = int(index)
        if index < 0 or index >= len(embeddings):
            return f"⚠️ رقم غير صالح. اختر رقماً بين 0 و {len(embeddings) - 1}"
        embedding = embeddings[index].reshape(1, -1)
        cluster_num = kmeans_model.predict(embedding)[0]
        cluster_name = cluster_names.get(cluster_num, "Unknown Cluster")
        return f"📦 التصنيف: {cluster_name} (Cluster {cluster_num})"
    except Exception as e:
        return f"⚠️ خطأ: {str(e)}"

# واجهة Gradio
gr.Interface(
    fn=predict_cluster_from_index,
    inputs=gr.Number(label="رقم المنتج", precision=0),
    outputs=gr.Textbox(label="التصنيف"),
    title="🔍 تصنيف المنتج حسب التضمين",
    description="أدخل رقم المنتج (index) لمعرفة الكلاستر الذي ينتمي إليه."
).launch()


* Running on local URL:  http://127.0.0.1:7865

To create a public link, set `share=True` in `launch()`.




In [None]:
'All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi, 16 GB - Includes Special Offers, Magenta'


In [4]:
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
model = AutoModel.from_pretrained("roberta-base")

tokenizer.save_pretrained("my_roberta")
model.save_pretrained("my_roberta")

NameError: name 'AutoTokenizer' is not defined

In [6]:
import gradio as gr
import joblib
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

# تحميل النموذج والتوكننايزر من المجلد المحلي
tokenizer = AutoTokenizer.from_pretrained("my_roberta")
model = AutoModel.from_pretrained("my_roberta")
model.eval()

# تحميل نموذج KMeans
kmeans_model = joblib.load("kmeans_model.pkl")

# أسماء الكلاسترات
cluster_names = {
    0: "Entry-Level and Kids Fire Tablets",
    1: "Batteries, Laptop Gear, and Basic Accessories",
    2: "Streaming Devices and E-Readers",
    3: "Advanced E-Readers and Smart Assistants",
    4: "Echo Speakers and Smart Home Hubs"
}

# دالة استخراج التضمين
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    inputs = {k: v.cuda() for k, v in inputs.items()}
    with torch.no_grad():
        output = model(**inputs)
    token_embeddings = output.last_hidden_state
    attention_mask = inputs['attention_mask']
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = input_mask_expanded.sum(1)
    sum_mask = torch.clamp(sum_mask, min=1e-9)
    embedding = (sum_embeddings / sum_mask).cpu().numpy()
    return embedding

# دالة التنبؤ
def predict_cluster_from_text(text):
    if not text.strip():
        return "⚠️ الرجاء إدخال نص."
    embedding = get_embedding(text)
    cluster_num = kmeans_model.predict(embedding)[0]
    cluster_name = cluster_names.get(cluster_num, "Unknown Cluster")
    return f"📦 التصنيف: {cluster_name} (Cluster {cluster_num})"

# واجهة Gradio
gr.Interface(
    fn=predict_cluster_from_text,
    inputs=gr.Textbox(label="أدخل اسم المنتج"),
    outputs=gr.Textbox(label="التصنيف"),
    title="🧠 تصنيف المنتج باستخدام RoBERTa + KMeans",
    description="اكتب اسم المنتج وسيتم تحديد نوع الكلاستر الذي ينتمي له."
).launch()


* Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "/Users/a7mad/Desktop/git_lab/ironhack/ironhack/lib/python3.10/site-packages/gradio/queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
  File "/Users/a7mad/Desktop/git_lab/ironhack/ironhack/lib/python3.10/site-packages/gradio/route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
  File "/Users/a7mad/Desktop/git_lab/ironhack/ironhack/lib/python3.10/site-packages/gradio/blocks.py", line 2137, in process_api
    result = await self.call_function(
  File "/Users/a7mad/Desktop/git_lab/ironhack/ironhack/lib/python3.10/site-packages/gradio/blocks.py", line 1663, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "/Users/a7mad/Desktop/git_lab/ironhack/ironhack/lib/python3.10/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/Users/a7mad/Desktop/git_lab/iro

In [7]:
import gradio as gr
import joblib
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel

# تحميل النموذج والتوكننايزر من المجلد المحلي
tokenizer = AutoTokenizer.from_pretrained("my_roberta")
model = AutoModel.from_pretrained("my_roberta")
model.eval()

# تحميل نموذج KMeans
kmeans_model = joblib.load("kmeans_model.pkl")

# أسماء الكلاسترات
cluster_names = {
    0: "Entry-Level and Kids Fire Tablets",
    1: "Batteries, Laptop Gear, and Basic Accessories",
    2: "Streaming Devices and E-Readers",
    3: "Advanced E-Readers and Smart Assistants",
    4: "Echo Speakers and Smart Home Hubs"
}

# دالة استخراج التضمين (على CPU فقط)
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=64)
    with torch.no_grad():
        output = model(**inputs)
    token_embeddings = output.last_hidden_state
    attention_mask = inputs['attention_mask']
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = input_mask_expanded.sum(1)
    sum_mask = torch.clamp(sum_mask, min=1e-9)
    embedding = (sum_embeddings / sum_mask).cpu().numpy()
    return embedding

# دالة التنبؤ
def predict_cluster_from_text(text):
    if not text.strip():
        return "⚠️ الرجاء إدخال نص."
    embedding = get_embedding(text)
    cluster_num = kmeans_model.predict(embedding)[0]
    cluster_name = cluster_names.get(cluster_num, "Unknown Cluster")
    return f"📦 التصنيف: {cluster_name} (Cluster {cluster_num})"

# واجهة Gradio
gr.Interface(
    fn=predict_cluster_from_text,
    inputs=gr.Textbox(label="أدخل اسم المنتج"),
    outputs=gr.Textbox(label="التصنيف"),
    title="🧠 تصنيف المنتج باستخدام RoBERTa + KMeans",
    description="اكتب اسم المنتج وسيتم تحديد نوع الكلاستر الذي ينتمي له."
).launch()


* Running on local URL:  http://127.0.0.1:7867

To create a public link, set `share=True` in `launch()`.




In [None]:
'All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi, 16 GB - Includes Special Offers, Magenta'


In [8]:
import pandas as pd
import numpy as np
import random
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import AutoTokenizer as ClsTokenizer, AutoModel as ClsModel
import joblib

# التحميل
df = pd.read_csv("done.csv")

# تصنيف RoBERTa
cls_tokenizer = ClsTokenizer.from_pretrained("my_roberta")
cls_model = ClsModel.from_pretrained("my_roberta")
cls_model.eval()

# نموذج التجميع
kmeans_model = joblib.load("kmeans_model.pkl")

# نموذج التوليد BART
gen_tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
gen_model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")

# أسماء العناقيد
cluster_names = {
    0: "Entry-Level and Kids Fire Tablets",
    1: "Batteries, Laptop Gear, and Basic Accessories",
    2: "Streaming Devices and E-Readers",
    3: "Advanced E-Readers and Smart Assistants",
    4: "Echo Speakers and Smart Home Hubs"
}


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

KeyboardInterrupt: 