In [13]:
import requests
import pandas as pd

# Fetch the product catalogue, label each title as electronics / non-electronics
# and persist the labelled table to CSV for the modelling cells.
url = "https://dummyjson.com/products?limit=0&select=title,category"

# requests applies no timeout by default, so a stalled connection would hang
# the cell forever; fail after a bounded wait instead.
REQUEST_TIMEOUT_S = 30
resp = requests.get(url, timeout=REQUEST_TIMEOUT_S)
resp.raise_for_status()

data = resp.json()
products = data["products"]
print("Total products from API:", len(products))

df = pd.DataFrame(products)
print("\nRaw data preview:")
print(df.head())

# Categories treated as the positive ("electronics") class.
electronics_categories = [
    "smartphones",
    "laptops",
    "tablets",
    "mobile-accessories",
]

# Binary target: 1 when the product's category is in the list above, else 0.
df["is_electronics"] = df["category"].isin(electronics_categories).astype(int)

print("\nLabel distribution (1 = electronics, 0 = non-electronics):")
print(df["is_electronics"].value_counts())

# Keep the API-supplied identifier (when present) under a separate name so the
# freshly generated 1..N row id below can take the "id" column.
if "id" in df.columns:
    df = df.rename(columns={"id": "product_id"})

df.insert(0, "id", range(1, len(df) + 1))

# Provenance columns: where the rows came from and the original category.
df["source"] = "dummyjson_products"
df["notes"] = df["category"]

csv_path = "electronics_titles.csv"
df.to_csv(csv_path, index=False, encoding="utf-8")

print("\nSaved to", csv_path)
print(df.head())


Total products from API: 194

Raw data preview:
   id                          title category
0   1  Essence Mascara Lash Princess   beauty
1   2  Eyeshadow Palette with Mirror   beauty
2   3                Powder Canister   beauty
3   4                   Red Lipstick   beauty
4   5                Red Nail Polish   beauty

Label distribution (1 = electronics, 0 = non-electronics):
is_electronics
0    156
1     38
Name: count, dtype: int64

Saved to electronics_titles.csv
   id  product_id                          title category  is_electronics  \
0   1           1  Essence Mascara Lash Princess   beauty               0   
1   2           2  Eyeshadow Palette with Mirror   beauty               0   
2   3           3                Powder Canister   beauty               0   
3   4           4                   Red Lipstick   beauty               0   
4   5           5                Red Nail Polish   beauty               0   

               source   notes  
0  dummyjson_products  beauty

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import numpy as np

RANDOM_SEED = 42
csv_path = "electronics_titles.csv"

# One seeded generator drives every NumPy random draw in this cell, so the
# oversampled training set is identical on each Restart & Run All. (The
# previous np.random.choice / np.random.shuffle calls used the unseeded
# global state and ignored RANDOM_SEED.)
rng = np.random.default_rng(RANDOM_SEED)

df = pd.read_csv(csv_path)
print("Total samples:", len(df))
print(df.head())
print("\nLabel counts (is_electronics):")
print(df["is_electronics"].value_counts())

df["is_electronics"] = df["is_electronics"].astype(int)

# Model input is the raw title text; the target is the binary label.
X_text = df["title"].astype(str).tolist()
y = df["is_electronics"].values

# 70 % train / 30 % held out, stratified so both parts keep the label ratio.
X_train_text, X_temp_text, y_train, y_temp = train_test_split(
    X_text, y, test_size=0.3, random_state=RANDOM_SEED, stratify=y
)

# Split the held-out 30 % in half: validation and test.
X_val_text, X_test_text, y_val, y_test = train_test_split(
    X_temp_text, y_temp, test_size=0.5, random_state=RANDOM_SEED, stratify=y_temp
)

print("\nSplit sizes:")
print("Train size:", len(X_train_text))
print("Val size:", len(X_val_text))
print("Test size:", len(X_test_text))

print("\nTrain label dist BEFORE oversampling:", Counter(y_train))

# Random oversampling of the positive class, applied to the training split
# only, so validation and test keep their original distribution.
pos_idx = np.where(y_train == 1)[0]
neg_idx = np.where(y_train == 0)[0]
n_pos, n_neg = len(pos_idx), len(neg_idx)
print("n_pos:", n_pos, "n_neg:", n_neg)

if n_pos < n_neg:
    # Draw positives with replacement until both classes have n_neg rows.
    extra_pos_idx = rng.choice(pos_idx, size=n_neg - n_pos, replace=True)
    balanced_idx = np.concatenate([neg_idx, pos_idx, extra_pos_idx])
else:
    balanced_idx = np.arange(len(y_train))

rng.shuffle(balanced_idx)

X_train_text_bal = [X_train_text[i] for i in balanced_idx]
y_train_bal = [y_train[i] for i in balanced_idx]

print("Train label dist AFTER oversampling:", Counter(y_train_bal))

# TF-IDF over unigrams and bigrams; the vocabulary is learned from the
# balanced training titles only and then reused for validation and test.
vectorizer = TfidfVectorizer(
    max_features=1000,
    ngram_range=(1, 2),
    stop_words="english"
)

X_train = vectorizer.fit_transform(X_train_text_bal)
X_val = vectorizer.transform(X_val_text)
X_test = vectorizer.transform(X_test_text)

X_train.shape, X_val.shape, X_test.shape


Total samples: 194
   id  product_id                          title category  is_electronics  \
0   1           1  Essence Mascara Lash Princess   beauty               0   
1   2           2  Eyeshadow Palette with Mirror   beauty               0   
2   3           3                Powder Canister   beauty               0   
3   4           4                   Red Lipstick   beauty               0   
4   5           5                Red Nail Polish   beauty               0   

               source   notes  
0  dummyjson_products  beauty  
1  dummyjson_products  beauty  
2  dummyjson_products  beauty  
3  dummyjson_products  beauty  
4  dummyjson_products  beauty  

Label counts (is_electronics):
is_electronics
0    156
1     38
Name: count, dtype: int64

Split sizes:
Train size: 135
Val size: 29
Test size: 30

Train label dist BEFORE oversampling: Counter({np.int64(0): 109, np.int64(1): 26})
n_pos: 26 n_neg: 109
Train label dist AFTER oversampling: Counter({np.int64(0): 109, np.int64(

((218, 473), (29, 473), (30, 473))

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd

# Candidate classifiers as (name, estimator) pairs. The name encodes the
# hyper-parameter value that distinguishes each candidate.
models = []

for C in [0.5, 1.0, 2.0]:
    clf = LogisticRegression(C=C, max_iter=1000, n_jobs=-1)
    models.append((f"LogReg_C{C}", clf))

for C in [0.5, 1.0, 2.0]:
    # Seeded so repeated runs of this cell fit the same LinearSVC model.
    clf = LinearSVC(C=C, random_state=RANDOM_SEED)
    models.append((f"LinearSVC_C{C}", clf))

for n_est in [50, 100]:
    clf = RandomForestClassifier(
        n_estimators=n_est,
        max_depth=None,
        random_state=RANDOM_SEED,
        n_jobs=-1
    )
    models.append((f"RF_{n_est}", clf))

models.append(("MultinomialNB", MultinomialNB()))

# Fit each candidate on the balanced training matrix and score it on the
# untouched validation split.
results = []

for name, clf in models:
    clf.fit(X_train, y_train_bal)
    y_val_pred = clf.predict(X_val)

    acc = accuracy_score(y_val, y_val_pred)
    # average="binary" reports the metrics for the positive class (label 1).
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_val, y_val_pred, average="binary", zero_division=0
    )

    results.append({
        "model": name,
        "val_accuracy": acc,
        "val_precision": precision,
        "val_recall": recall,
        "val_f1": f1
    })

results_df = pd.DataFrame(results)
# Several candidates can tie on F1; a stable sort keeps tied rows in the order
# the models were defined, so the ranking is the same on every run.
results_df_sorted = results_df.sort_values(
    by="val_f1", ascending=False, kind="stable"
)
results_df_sorted


Unnamed: 0,model,val_accuracy,val_precision,val_recall,val_f1
0,LogReg_C0.5,0.965517,1.0,0.833333,0.909091
1,LogReg_C1.0,0.965517,1.0,0.833333,0.909091
2,LogReg_C2.0,0.965517,1.0,0.833333,0.909091
3,LinearSVC_C0.5,0.965517,1.0,0.833333,0.909091
4,LinearSVC_C1.0,0.965517,1.0,0.833333,0.909091
5,LinearSVC_C2.0,0.965517,1.0,0.833333,0.909091
6,RF_50,0.965517,1.0,0.833333,0.909091
8,MultinomialNB,0.965517,1.0,0.833333,0.909091
7,RF_100,0.931034,1.0,0.666667,0.8


In [16]:
from sklearn.metrics import classification_report, confusion_matrix

# Pick the top validation-F1 row. The sort is stable so that, when several
# models share the best F1, the one defined first is chosen deterministically
# instead of depending on the sort algorithm's handling of ties.
best_row = results_df.sort_values(
    by="val_f1", ascending=False, kind="stable"
).iloc[0]
best_name = best_row["model"]
print("Best model on validation (by F1):", best_name)

def create_model_by_name(name):
    """Build a fresh, unfitted estimator from a candidate name.

    Recognised names are ``LogReg_C<value>``, ``LinearSVC_C<value>``,
    ``RF_<n_estimators>`` and ``MultinomialNB``.

    Args:
        name: Candidate name as stored in the "model" column of the results.

    Returns:
        A new estimator configured with the hyper-parameter encoded in the name.

    Raises:
        ValueError: If the name matches none of the recognised patterns, or
            if the encoded hyper-parameter is not a valid number.
    """
    if name.startswith("LogReg"):
        # Parse what follows the final "_C" marker.
        C = float(name.rsplit("_C", 1)[1])
        return LogisticRegression(C=C, max_iter=1000, n_jobs=-1)
    if name.startswith("LinearSVC"):
        # "LinearSVC" itself contains a "C", so splitting on a bare "C" yields
        # "_" as the second piece and float() fails; split on "_C" instead.
        C = float(name.rsplit("_C", 1)[1])
        # Seeded so refitting the chosen model is reproducible.
        return LinearSVC(C=C, random_state=RANDOM_SEED)
    if name.startswith("RF_"):
        n_est = int(name.split("_")[1])
        return RandomForestClassifier(
            n_estimators=n_est,
            max_depth=None,
            random_state=RANDOM_SEED,
            n_jobs=-1
        )
    if name == "MultinomialNB":
        return MultinomialNB()
    raise ValueError("Unknown model name: " + name)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Rebuild the winning configuration, refit it on the balanced training data
# and score it once on the held-out test split.
best_model = create_model_by_name(best_name)
best_model.fit(X_train, y_train_bal)
y_test_pred = best_model.predict(X_test)

test_acc = accuracy_score(y_test, y_test_pred)
# Positive-class (label 1) precision / recall / F1; the support value is unused.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_test_pred, average="binary", zero_division=0
)

print("\n=== Test set performance (best model) ===")
for metric_label, metric_value in (
    ("Accuracy:", test_acc),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1:", f1),
):
    print(metric_label, metric_value)

# Per-class breakdown followed by the raw confusion counts.
print("\nClassification report on test set:")
print(classification_report(y_test, y_test_pred, digits=3))

print("Confusion matrix (test):")
print(confusion_matrix(y_test, y_test_pred))


Best model on validation (by F1): LogReg_C0.5

=== Test set performance (best model) ===
Accuracy: 0.9333333333333333
Precision: 1.0
Recall: 0.6666666666666666
F1: 0.8

Classification report on test set:
              precision    recall  f1-score   support

           0      0.923     1.000     0.960        24
           1      1.000     0.667     0.800         6

    accuracy                          0.933        30
   macro avg      0.962     0.833     0.880        30
weighted avg      0.938     0.933     0.928        30

Confusion matrix (test):
[[24  0]
 [ 2  4]]


In [17]:
def predict_title(title):
    """Classify a single product title and print the title with its label.

    The title is vectorised with the module-level ``vectorizer`` and passed to
    the module-level ``best_model``; a prediction of 1 is reported as
    "electronics", anything else as "non-electronics". Nothing is returned.
    """
    features = vectorizer.transform([title])
    predicted = best_model.predict(features)[0]
    if predicted == 1:
        label = "electronics"
    else:
        label = "non-electronics"
    print(f"Title: {title}", f"Predicted label: {label}", sep="\n")


# Spot-check the fitted model on a few hand-written titles.
for sample_title in (
    "Samsung Galaxy S21 Ultra",
    "Wooden Dining Table Set",
    "Apple AirPods Pro",
    "Organic Green Tea 250g",
):
    predict_title(sample_title)


Title: Samsung Galaxy S21 Ultra
Predicted label: electronics
Title: Wooden Dining Table Set
Predicted label: non-electronics
Title: Apple AirPods Pro
Predicted label: electronics
Title: Organic Green Tea 250g
Predicted label: non-electronics
