# Predicția categoriei produsului pe baza titlului

Acest notebook urmărește următorii pași: încărcarea datelor → curățare → ingineria caracteristicilor → TF-IDF și caracteristici numerice → comparație de model (LogisticRegression și RandomForest) → training final → salvare.pkl → test pe exemple.

In [1]:
# Încărcare module
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')
print("Imports OK")

Imports OK


In [2]:
# Încărcare date
data_path = "IMLP4_TASK_03-products.csv"
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Place your products.csv at {data_path} and re-run the notebook.")
df = pd.read_csv(data_path)
print("Rows:", len(df))
print(df.columns.tolist())
df.head(3)

Rows: 35311
['product ID', 'Product Title', 'Merchant ID', ' Category Label', '_Product Code', 'Number_of_Views', 'Merchant Rating', ' Listing Date  ']


Unnamed: 0,product ID,Product Title,Merchant ID,Category Label,_Product Code,Number_of_Views,Merchant Rating,Listing Date
0,1,apple iphone 8 plus 64gb silver,1,Mobile Phones,QA-2276-XC,860.0,2.5,5/10/2024
1,2,apple iphone 8 plus 64 gb spacegrau,2,Mobile Phones,KA-2501-QO,3772.0,4.8,12/31/2024
2,3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3,Mobile Phones,FP-8086-IE,3092.0,3.9,11/10/2024


In [3]:
# Curățarea datelor: se țin rândurile cu Product title și Category label
df = df.rename(columns=lambda c: c.strip())
required_cols = ['Product Title', 'Category Label']
for c in required_cols:
    if c not in df.columns:
        raise ValueError(f"Required column '{c}' not found in CSV. Found: {df.columns.tolist()}")
df = df.dropna(subset=['Product Title', 'Category Label']).reset_index(drop=True)
df['Product Title'] = df['Product Title'].astype(str)
df['Category Label'] = df['Category Label'].astype(str)
print("After cleaning rows:", len(df))
df['Category Label'].value_counts().head(10)

After cleaning rows: 35096


Category Label
Fridge Freezers     5470
Washing Machines    4015
Mobile Phones       4002
CPUs                3747
TVs                 3541
Fridges             3436
Dishwashers         3405
Digital Cameras     2689
Microwaves          2328
Freezers            2201
Name: count, dtype: int64

In [4]:
# Ingineria caracteristicilor
import re
def title_basic_stats(title):
    if not isinstance(title, str):
        title = ""
    words = title.split()
    n_words = len(words)
    n_chars = len(title)
    n_digits = sum(c.isdigit() for c in title)
    n_upper = sum(1 for c in title if c.isupper())
    pct_upper = n_upper / max(1, n_chars)
    longest_word = max((len(w) for w in words), default=0)
    has_special = int(bool(re.search(r'[^A-Za-z0-9\s]', title)))
    has_gb = int(bool(re.search(r'\b(GB|gb)\b', title)))
    has_mp = int(bool(re.search(r'\b(MP|mp)\b', title)))
    return {'n_words': n_words, 'n_chars': n_chars, 'n_digits': n_digits, 'pct_upper': pct_upper, 'longest_word': longest_word, 'has_special': has_special, 'has_gb': has_gb, 'has_mp': has_mp}

feats = df['Product Title'].apply(title_basic_stats).apply(pd.Series)
df = pd.concat([df, feats], axis=1)
df.head(3)

Unnamed: 0,product ID,Product Title,Merchant ID,Category Label,_Product Code,Number_of_Views,Merchant Rating,Listing Date,n_words,n_chars,n_digits,pct_upper,longest_word,has_special,has_gb,has_mp
0,1,apple iphone 8 plus 64gb silver,1,Mobile Phones,QA-2276-XC,860.0,2.5,5/10/2024,6.0,31.0,3.0,0.0,6.0,0.0,0.0,0.0
1,2,apple iphone 8 plus 64 gb spacegrau,2,Mobile Phones,KA-2501-QO,3772.0,4.8,12/31/2024,7.0,35.0,3.0,0.0,9.0,0.0,1.0,0.0
2,3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3,Mobile Phones,FP-8086-IE,3092.0,3.9,11/10/2024,13.0,70.0,9.0,0.0,10.0,1.0,0.0,0.0


In [5]:
# TF-IDF și caracteristici numerice
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        # X expected as DataFrame
        if isinstance(self.column, list):
            return X[self.column]
        return X[self.column].astype(str).values

class DataFrameToArray(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None): return self
    def transform(self, X):
        if hasattr(X, 'values'):
            return X.values
        return X

text_pipeline = Pipeline([
    ('selector', ColumnSelector('Product Title')),
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), max_features=15000, lowercase=True))
])

numeric_cols = ['n_words','n_chars','n_digits','pct_upper','longest_word','has_special','has_gb','has_mp']
numeric_pipeline = Pipeline([
    ('selector', ColumnSelector(numeric_cols)),
    ('toarr', DataFrameToArray()),
    ('scaler', StandardScaler())
])

preprocessor = FeatureUnion([('text', text_pipeline), ('num', numeric_pipeline)])
print("Preprocessor ready")

Preprocessor ready


In [6]:
# Antrenare și testare
X = df.copy()
y = df['Category Label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print("Train:", len(X_train), "Test:", len(X_test))


Train: 28076 Test: 7020


In [7]:
# Evaluare modele
def train_and_eval(clf, name="model"):
    pipe = Pipeline([('preproc', preprocessor), ('clf', clf)])
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    acc = accuracy_score(y_test, preds)
    print(f"== {name} ==\nAccuracy: {acc:.4f}\n")
    print(classification_report(y_test, preds, zero_division=0))
    return pipe, acc

models_results = []

In [8]:
# Logistic Regression 
lr = LogisticRegression(max_iter=2000, n_jobs=-1)
lr_pipe, lr_acc = train_and_eval(lr, name='LogisticRegression')
models_results.append(('LogisticRegression', lr_pipe, lr_acc))

== LogisticRegression ==
Accuracy: 0.9537

                  precision    recall  f1-score   support

             CPU       0.00      0.00      0.00        17
            CPUs       0.98      1.00      0.99       749
 Digital Cameras       0.99      0.99      0.99       538
     Dishwashers       0.94      0.95      0.95       681
        Freezers       1.00      0.92      0.96       440
 Fridge Freezers       0.94      0.94      0.94      1094
         Fridges       0.85      0.90      0.88       687
      Microwaves       0.98      0.96      0.97       466
    Mobile Phone       0.00      0.00      0.00        11
   Mobile Phones       0.97      0.99      0.98       801
             TVs       0.98      0.99      0.98       708
Washing Machines       0.95      0.96      0.95       803
          fridge       0.00      0.00      0.00        25

        accuracy                           0.95      7020
       macro avg       0.74      0.74      0.74      7020
    weighted avg       0.95

In [9]:
# Random Forest 
rf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
rf_pipe, rf_acc = train_and_eval(rf, name='RandomForest')
models_results.append(('RandomForest', rf_pipe, rf_acc))

== RandomForest ==
Accuracy: 0.9554

                  precision    recall  f1-score   support

             CPU       0.00      0.00      0.00        17
            CPUs       0.98      0.99      0.98       749
 Digital Cameras       1.00      0.98      0.99       538
     Dishwashers       0.97      0.97      0.97       681
        Freezers       0.94      0.93      0.93       440
 Fridge Freezers       0.94      0.94      0.94      1094
         Fridges       0.88      0.90      0.89       687
      Microwaves       0.96      0.97      0.97       466
    Mobile Phone       0.00      0.00      0.00        11
   Mobile Phones       0.96      0.99      0.97       801
             TVs       0.99      0.99      0.99       708
Washing Machines       0.96      0.96      0.96       803
          fridge       0.00      0.00      0.00        25

        accuracy                           0.96      7020
       macro avg       0.74      0.74      0.74      7020
    weighted avg       0.95      

In [10]:
# Selectarea modelului cu cea mai bună acuratețe
best = sorted(models_results, key=lambda r: r[2], reverse=True)[0]
best_name, best_pipe, best_acc = best
print("Best model:", best_name, "acc:", best_acc)

# Salvează direct în rădăcină (fără director models)
model_path = 'product_classifier.pkl'  # 👈 Direct în rădăcină
joblib.dump(best_pipe, model_path)
print("Saved best model to", model_path)

Best model: RandomForest acc: 0.9554131054131054
Saved best model to product_classifier.pkl


In [11]:
# Funcție pentru a crea manual caracteristicile
def create_features(title):
    return {
        'n_words': len(str(title).split()),
        'n_chars': len(str(title)),
        'n_digits': sum(c.isdigit() for c in str(title)),
        'pct_upper': sum(c.isupper() for c in str(title)) / max(len(str(title)), 1),
        'longest_word': max([len(w) for w in str(title).split()], default=0),
        'has_special': int(any(c in str(title) for c in ['!', '@', '#', '$', '%', '&', '*'])),
        'has_gb': int('gb' in str(title).lower()),
        'has_mp': int('mp' in str(title).lower()),
        'Product Title': title
    }

# Testare
manual_titles = [
    "iphone 7 32gb gold,4,3,Apple iPhone 7 32GB",
    "olympus e m10 mark iii geh use silber",
    "kenwood k20mss15 solo",
    "bosch wap28390gb 8kg 1400 spin",
    "bosch serie 4 kgv39vl31g",
    "smeg sbs8004po"
]

for t in manual_titles:
    # Creează toate caracteristicile necesare necesare
    features = create_features(t)
    test_data = pd.DataFrame([features])
    
    pred = best_pipe.predict(test_data)
    print(f"Title: {t}")
    print(f"Predicted: {pred[0]}")
    print("-" * 50)

Title: iphone 7 32gb gold,4,3,Apple iPhone 7 32GB
Predicted: Mobile Phones
--------------------------------------------------
Title: olympus e m10 mark iii geh use silber
Predicted: Digital Cameras
--------------------------------------------------
Title: kenwood k20mss15 solo
Predicted: Microwaves
--------------------------------------------------
Title: bosch wap28390gb 8kg 1400 spin
Predicted: Washing Machines
--------------------------------------------------
Title: bosch serie 4 kgv39vl31g
Predicted: Microwaves
--------------------------------------------------
Title: smeg sbs8004po
Predicted: Fridges
--------------------------------------------------


In [12]:

# ==== Test interactiv ====
# Funcție pentru a crea toate feature-urile necesare
def create_features(title):
    """Creează toate feature-urile pe care modelul le așteaptă"""
    title_str = str(title)
    return {
        'Product Title': title_str,
        'n_words': len(title_str.split()),
        'n_chars': len(title_str),
        'n_digits': sum(c.isdigit() for c in title_str),
        'pct_upper': sum(c.isupper() for c in title_str) / max(len(title_str), 1),
        'longest_word': max([len(w) for w in title_str.split()], default=0),
        'has_special': int(any(c in title_str for c in ['!', '@', '#', '$', '%', '&', '*'])),
        'has_gb': int('gb' in title_str.lower()),
        'has_mp': int('mp' in title_str.lower())
    }

# Încarcă modelul salvat 
try:
    model_path = 'product_classifier.pkl'  
    model = joblib.load(model_path)
    print("Model încărcat cu succes!")
except FileNotFoundError:
    print("Fișierul modelului nu a fost găsit!")
    exit()
except Exception as e:
    print(f"Eroare la încărcare: {e}")
    exit()

while True:
    user_input = input("\nIntrodu titlul produsului: ")
    
    if user_input.lower() == "exit":
        print("La revedere!")
        break
    
    if not user_input.strip():
        print("Te rog introdu un titlu valid")
        continue
    
    try:
        # Creează toate caracteristicile necesare
        features = create_features(user_input)
        
        # Creează DataFrame cu toate coloanele necesare
        test_data = pd.DataFrame([features])
        
        # Predic'ia
        prediction = model.predict(test_data)
        
        print(f"*Categoria prezisă:* {prediction[0]}")
        
        # Afișează probabilități
        if hasattr(model, 'predict_proba'):
            try:
                probabilities = model.predict_proba(test_data)
                print(f"Probabilități: {probabilities[0]}")
            except:
                pass
                
    except Exception as e:
        print(f"Eroare la predicție: {e}")
    

Model încărcat cu succes!



Introdu titlul produsului:  iphone 7 32gb gold,4,3,Apple iPhone 7 32GB


*Categoria prezisă:* Mobile Phones
Probabilități: [0.    0.045 0.005 0.005 0.015 0.025 0.005 0.025 0.01  0.84  0.015 0.01
 0.   ]



Introdu titlul produsului:  exit


La revedere!
