In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

from catboost import CatBoostClassifier, Pool

In [2]:
df = pd.read_csv('../data/raw/ecommerceDataset.csv', names=['category', 'text'])

df = df.dropna(subset=['text'])
df = df[df['text'].str.len() > 10].copy()

print(df.shape)
df.head()

(50381, 2)


Unnamed: 0,category,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [3]:
# Разделение на обучение и тест
X = df['text']
y = df['category']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=11,
    stratify=y
)

print('Train size:', X_train.shape[0])
print('Test size:', X_test.shape[0])

Train size: 40304
Test size: 10077


In [4]:
# TF-IDF векторизация текста

# Ограничим размер словаря, чтобы ускорить обучение и уменьшить переобучение
vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 2),  # униграммы + биграммы
    min_df=5,            # игнорируем очень редкие слова
)

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

X_train_tfidf.shape, X_test_tfidf.shape

((40304, 50000), (10077, 50000))

In [5]:
# Базовая модель: логистическая регрессия

log_reg = LogisticRegression(
    max_iter=1000,
    n_jobs=-1,
    class_weight='balanced',  # учитываем умеренный дисбаланс классов
)

log_reg.fit(X_train_tfidf, y_train)

y_pred_lr = log_reg.predict(X_test_tfidf)

print('F1-macro (LogReg):', f1_score(y_test, y_pred_lr, average='macro'))
print('\nClassification report (LogReg):')
print(classification_report(y_test, y_pred_lr))



F1-macro (LogReg): 0.9676351219509765

Classification report (LogReg):
                        precision    recall  f1-score   support

                 Books       0.97      0.96      0.96      2356
Clothing & Accessories       0.97      0.98      0.98      1734
           Electronics       0.96      0.96      0.96      2124
             Household       0.97      0.97      0.97      3863

              accuracy                           0.97     10077
             macro avg       0.97      0.97      0.97     10077
          weighted avg       0.97      0.97      0.97     10077



In [20]:
# Модель CatBoost с использованием исходного текста (без TF-IDF)

X_train_df = X_train.to_frame()
X_test_df = X_test.to_frame()

# CatBoost ожидает Pool; передадим тексты как единственный признак
train_pool = Pool(data=X_train_df, label=y_train, text_features=[0])
test_pool = Pool(data=X_test_df, label=y_test, text_features=[0])

cb_model = CatBoostClassifier(
    iterations=500,
    learning_rate=0.2,
    depth=6,
    loss_function='MultiClass',
    eval_metric='TotalF1',
    random_seed=11,
    verbose=50
)

cb_model.fit(train_pool, eval_set=test_pool)

cb_preds = cb_model.predict(test_pool)

print('F1-macro (CatBoost):', f1_score(y_test, cb_preds, average='macro'))
print('\nClassification report (CatBoost):')
print(classification_report(y_test, cb_preds))

0:	learn: 0.9476136	test: 0.9522025	best: 0.9522025 (0)	total: 394ms	remaining: 3m 16s
50:	learn: 0.9499724	test: 0.9532013	best: 0.9535993 (40)	total: 18.9s	remaining: 2m 46s
100:	learn: 0.9526540	test: 0.9552852	best: 0.9552852 (98)	total: 35.5s	remaining: 2m 20s
150:	learn: 0.9543699	test: 0.9555883	best: 0.9556848 (128)	total: 52.5s	remaining: 2m 1s
200:	learn: 0.9562613	test: 0.9568793	best: 0.9568793 (198)	total: 1m 10s	remaining: 1m 45s
250:	learn: 0.9578508	test: 0.9575764	best: 0.9575764 (250)	total: 1m 29s	remaining: 1m 28s
300:	learn: 0.9595147	test: 0.9580714	best: 0.9584706 (269)	total: 1m 46s	remaining: 1m 10s
350:	learn: 0.9612024	test: 0.9586700	best: 0.9587669 (322)	total: 2m 4s	remaining: 52.8s
400:	learn: 0.9626182	test: 0.9590708	best: 0.9590708 (400)	total: 2m 22s	remaining: 35.1s
450:	learn: 0.9636859	test: 0.9594699	best: 0.9596705 (440)	total: 2m 40s	remaining: 17.4s
499:	learn: 0.9651250	test: 0.9598731	best: 0.9599717 (492)	total: 2m 58s	remaining: 0us

bestTe

## Анализ результатов классификации текстов e-commerce данных

### Сравнение моделей:
1) Логистическая регрессия + TF-IDF
- F1-macro: 0.9676
- Точность: 97%

2) CatBoost
- F1-macro: 0.9606
- Точность: 96%

### Ключевые выводы:

- Обе модели показали хорошие результаты (все метрики > 0.95)
- Задача решается эффективно обеими подходами
- Логистическая регрессия показала результат лучше на 0.7%