# 1. Ucitavanje podataka

In [1]:
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_columns', None)

df = pd.read_csv('data/products.csv')

df.head()

Unnamed: 0,product ID,Product Title,Merchant ID,Category Label,_Product Code,Number_of_Views,Merchant Rating,Listing Date
0,1,apple iphone 8 plus 64gb silver,1,Mobile Phones,QA-2276-XC,860.0,2.5,5/10/2024
1,2,apple iphone 8 plus 64 gb spacegrau,2,Mobile Phones,KA-2501-QO,3772.0,4.8,12/31/2024
2,3,apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...,3,Mobile Phones,FP-8086-IE,3092.0,3.9,11/10/2024
3,4,apple iphone 8 plus 64gb space grey,4,Mobile Phones,YI-0086-US,466.0,3.4,5/2/2022
4,5,apple iphone 8 plus gold 5.5 64gb 4g unlocked ...,5,Mobile Phones,NZ-3586-WP,4426.0,1.6,4/12/2023


# 2. Osnovni pregled i provera nedostajućih vrednosti

In [2]:
df.info()
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35311 entries, 0 to 35310
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   product ID       35311 non-null  int64  
 1   Product Title    35139 non-null  object 
 2   Merchant ID      35311 non-null  int64  
 3    Category Label  35267 non-null  object 
 4   _Product Code    35216 non-null  object 
 5   Number_of_Views  35297 non-null  float64
 6   Merchant Rating  35141 non-null  float64
 7    Listing Date    35252 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 2.2+ MB


product ID           0
Product Title      172
Merchant ID          0
 Category Label     44
_Product Code       95
Number_of_Views     14
Merchant Rating    170
 Listing Date       59
dtype: int64

# 3. Pregled kategorija i njihov broj

In [3]:
print(f"Broj unikatnih kategorija: {df[' Category Label'].nunique()}")
df[' Category Label'].value_counts().head(15)

Broj unikatnih kategorija: 13


Fridge Freezers     5495
Washing Machines    4036
Mobile Phones       4020
CPUs                3771
TVs                 3564
Fridges             3457
Dishwashers         3418
Digital Cameras     2696
Microwaves          2338
Freezers            2210
fridge               123
CPU                   84
Mobile Phone          55
Name:  Category Label, dtype: int64

# 4. Ciscenje podatka

In [4]:
df.columns = df.columns.str.strip()

df_clean = df.dropna(subset=['Product Title', 'Category Label']).copy()


df_clean['Product Title'] = df_clean['Product Title'].str.lower()

df_clean['Product Title'] = df_clean['Product Title'].apply(lambda x: re.sub(r'[^a-z0-9\s]', '', str(x)))

df_clean['Product Title'] = df_clean['Product Title'].str.replace(r'\s+', ' ', regex=True).str.strip()

df_clean['Category Label'] = df_clean['Category Label'].str.strip()

df_clean.head()

Unnamed: 0,product ID,Product Title,Merchant ID,Category Label,_Product Code,Number_of_Views,Merchant Rating,Listing Date
0,1,apple iphone 8 plus 64gb silver,1,Mobile Phones,QA-2276-XC,860.0,2.5,5/10/2024
1,2,apple iphone 8 plus 64 gb spacegrau,2,Mobile Phones,KA-2501-QO,3772.0,4.8,12/31/2024
2,3,apple mq8n2ba iphone 8 plus 64gb 55 12mp sim f...,3,Mobile Phones,FP-8086-IE,3092.0,3.9,11/10/2024
3,4,apple iphone 8 plus 64gb space grey,4,Mobile Phones,YI-0086-US,466.0,3.4,5/2/2022
4,5,apple iphone 8 plus gold 55 64gb 4g unlocked s...,5,Mobile Phones,NZ-3586-WP,4426.0,1.6,4/12/2023


# 5. Provjera ociscenih podataka

In [5]:
df_clean.info()
df_clean.isnull().sum()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 35096 entries, 0 to 35310
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   product ID       35096 non-null  int64  
 1   Product Title    35096 non-null  object 
 2   Merchant ID      35096 non-null  int64  
 3   Category Label   35096 non-null  object 
 4   _Product Code    35002 non-null  object 
 5   Number_of_Views  35082 non-null  float64
 6   Merchant Rating  34926 non-null  float64
 7   Listing Date     35038 non-null  object 
dtypes: float64(2), int64(2), object(4)
memory usage: 2.4+ MB


product ID           0
Product Title        0
Merchant ID          0
Category Label       0
_Product Code       94
Number_of_Views     14
Merchant Rating    170
Listing Date        58
dtype: int64

# 6. Feature engineering

In [6]:
df = df.dropna(subset=['Product Title'])

In [7]:
df['title_length'] = df['Product Title'].apply(len)
df['title_word_count'] = df['Product Title'].apply(lambda x: len(x.split()))
df['title_contains_digit'] = df['Product Title'].apply(lambda x: int(any(char.isdigit() for char in x)))
df['title_digit_count'] = df['Product Title'].apply(lambda x: sum(char.isdigit() for char in x))

# Primer pregleda novih kolona
print(df[['Product Title', 'title_length', 'title_word_count', 'title_contains_digit', 'title_digit_count']].head())

                                       Product Title  title_length  \
0                    apple iphone 8 plus 64gb silver            31   
1                apple iphone 8 plus 64 gb spacegrau            35   
2  apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...            70   
3                apple iphone 8 plus 64gb space grey            35   
4  apple iphone 8 plus gold 5.5 64gb 4g unlocked ...            54   

   title_word_count  title_contains_digit  title_digit_count  
0                 6                     1                  3  
1                 7                     1                  3  
2                13                     1                  9  
3                 7                     1                  3  
4                11                     1                  6  


# 7. Priprema podataka za trening i test set

In [15]:
df['Number_of_Views'] = df['Number_of_Views'].fillna(df['Number_of_Views'].mean())
df['Merchant Rating'] = df['Merchant Rating'].fillna(df['Merchant Rating'].mean())
df['_Product Code'] = df['_Product Code'].fillna('unknown')
df['Product Title'] = df['Product Title'].fillna('unknown')
df['Category Label'] = df['Category Label'].fillna('Other')


In [16]:
y = df['Category Label']

In [21]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(y)


In [17]:
X = df['Product Title'].fillna('')  

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_vectorized = vectorizer.fit_transform(X)


In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_vectorized, y, test_size=0.2, random_state=42, stratify=y
)


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Multinomial NB": MultinomialNB(),
    "Linear SVC": LinearSVC(),
}

for name, model in models.items():
    print(f"\nModel: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred, target_names=le.classes_))



Model: Logistic Regression


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                  precision    recall  f1-score   support

             CPU       0.00      0.00      0.00        17
            CPUs       0.98      1.00      0.99       749
 Digital Cameras       0.99      0.99      0.99       538
     Dishwashers       0.94      0.95      0.94       681
        Freezers       0.98      0.91      0.94       440
 Fridge Freezers       0.92      0.94      0.93      1094
         Fridges       0.87      0.88      0.88       687
      Microwaves       0.98      0.95      0.97       466
    Mobile Phone       0.00      0.00      0.00        11
   Mobile Phones       0.95      0.99      0.97       801
             TVs       0.98      0.99      0.98       708
Washing Machines       0.95      0.95      0.95       803
          fridge       0.00      0.00      0.00        25

        accuracy                           0.95      7020
       macro avg       0.73      0.74      0.73      7020
    weighted avg       0.94      0.95      0.95      7020


Model: Mul

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                  precision    recall  f1-score   support

             CPU       0.00      0.00      0.00        17
            CPUs       0.98      1.00      0.99       749
 Digital Cameras       0.99      0.99      0.99       538
     Dishwashers       0.97      0.92      0.94       681
        Freezers       0.98      0.77      0.86       440
 Fridge Freezers       0.81      0.95      0.87      1094
         Fridges       0.86      0.84      0.85       687
      Microwaves       0.98      0.95      0.97       466
    Mobile Phone       0.00      0.00      0.00        11
   Mobile Phones       0.98      0.99      0.98       801
             TVs       0.98      0.99      0.99       708
Washing Machines       0.96      0.94      0.95       803
          fridge       0.00      0.00      0.00        25

        accuracy                           0.93      7020
       macro avg       0.73      0.72      0.72      7020
    weighted avg       0.93      0.93      0.93      7020


Model: Lin

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Predikcija modela

In [23]:
new_product = ["samsung galaxy s21 ultra 5g 128gb phantom black"]

In [24]:
new_product_vectorized = vectorizer.transform(new_product)

In [27]:
for name, model in models.items():
    pred = model.predict(new_product_vectorized)
    print(f"{name}: {pred[0]}")


Logistic Regression: Mobile Phones
Multinomial NB: Mobile Phones
Linear SVC: Mobile Phones
