# Базовый пайплайн для соревнования по определению контрафакта 

### 1. Загрузка данных

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
import warnings

df_train = pd.read_csv('ml_ozon_сounterfeit_train.csv', index_col=0)
df_test = pd.read_csv('ml_ozon_сounterfeit_test.csv', index_col=0)

print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")
print(f"Target distribution in train:")
print(df_train['resolution'].value_counts())
print()

Train shape: (197198, 44)
Test shape: (22760, 43)
Target distribution in train:
resolution
0    184146
1     13052
Name: count, dtype: int64



In [None]:
df_train['description'][159385]

### 2. Предобработка данных
Используем 39 числовых признаков

In [8]:
numeric_columns = df_train.select_dtypes(include=[np.number]).columns.tolist()
numeric_columns = [col for col in numeric_columns if col != 'resolution']



In [9]:
X_train = df_train[numeric_columns].fillna(0)
y_train = df_train['resolution']
X_test = df_test[numeric_columns].fillna(0)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print()


X_train shape: (197198, 39)
X_test shape: (22760, 39)



### 3. Обучение модели

In [14]:
X_train = df_train[numeric_columns].fillna(0)
y_train = df_train['resolution']
X_test = df_test[numeric_columns].fillna(0)

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_split, y_train_split)

val_pred = model.predict(X_val_split)
val_f1 = f1_score(y_val_split, val_pred, pos_label=1)

print(f"Validation f1: {val_f1:.6f}")
print("Classification report:")
print(classification_report(y_val_split, val_pred))
print()


Validation f1: 0.674733
Classification report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     36830
           1       0.89      0.54      0.67      2610

    accuracy                           0.97     39440
   macro avg       0.93      0.77      0.83     39440
weighted avg       0.96      0.97      0.96     39440




In [15]:
X_train = df_train[numeric_columns].fillna(-1)
y_train = df_train['resolution']

print(f"X_train shape: {X_train.shape}")
print()

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_split, y_train_split)

val_pred = model.predict(X_val_split)
val_f1 = f1_score(y_val_split, val_pred, pos_label=1)

print(f"Validation f1: {val_f1:.6f}")
print("Classification report:")
print(classification_report(y_val_split, val_pred))
print()

X_train shape: (197198, 39)

Validation f1: 0.678224
Classification report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     36830
           1       0.89      0.55      0.68      2610

    accuracy                           0.97     39440
   macro avg       0.93      0.77      0.83     39440
weighted avg       0.96      0.97      0.96     39440




### 4. Формирование submission.csv

In [5]:

test_predictions = model.predict(X_test)

submission = pd.DataFrame({
    'id': df_test.index,
    'prediction': test_predictions
})

submission.to_csv('submission.csv', index=False)


print(f"Создан файл submission.csv с {len(submission)} предсказаниями")
print(f"Распределение предсказаний:")
print(submission['prediction'].value_counts())
print()

Создан файл submission.csv с 22760 предсказаниями
Распределение предсказаний:
prediction
0    22450
1      310
Name: count, dtype: int64



# exploration

In [11]:
df_train.columns

Index(['resolution', 'brand_name', 'description', 'name_rus',
       'CommercialTypeName4', 'rating_1_count', 'rating_2_count',
       'rating_3_count', 'rating_4_count', 'rating_5_count',
       'comments_published_count', 'photos_published_count',
       'videos_published_count', 'PriceDiscounted', 'item_time_alive',
       'item_count_fake_returns7', 'item_count_fake_returns30',
       'item_count_fake_returns90', 'item_count_sales7', 'item_count_sales30',
       'item_count_sales90', 'item_count_returns7', 'item_count_returns30',
       'item_count_returns90', 'GmvTotal7', 'GmvTotal30', 'GmvTotal90',
       'ExemplarAcceptedCountTotal7', 'ExemplarAcceptedCountTotal30',
       'ExemplarAcceptedCountTotal90', 'OrderAcceptedCountTotal7',
       'OrderAcceptedCountTotal30', 'OrderAcceptedCountTotal90',
       'ExemplarReturnedCountTotal7', 'ExemplarReturnedCountTotal30',
       'ExemplarReturnedCountTotal90', 'ExemplarReturnedValueTotal7',
       'ExemplarReturnedValueTotal30', 'Exempl

In [12]:
df_train['name_rus'][159385]

'Мешки для пылесоса PHILIPS TRIATLON, синтетические, многослойные, тип: HR 6947'

In [13]:
df_train['brand_name'][159385]

'ACTRUM'

In [14]:
df_train['description'][159385]

'Мешки пылесборники для пылесоса PHILIPS, 10 шт., синтетические, многослойные, бренд: ACTRUM, арт. AK-10/10, тип оригинального мешка: HR 6947.Подходят для пылесосов:PHILIPS: HR6955, HR6947, HR6888, HR6844 TRIATHLON, HR6843 TRIATHLON, HR6842 TRIATHLON, HR6841 TRIATHLON, HR6840 TRIATHLON, HR6839 TRIATHLON, HR6838 TRIATHLON, HR6837 TRIATHLON, HR6836 TRIATHLON, HR6835 TRIATHLON, HR6834 TRIATHLON, HR6833 TRIATHLON, HR6832 TRIATHLON, HR6831 TRIATHLON, HR6830 TRIATHLON, HR6829 TRIATHLON, HR6828 TRIATHLON, HR6827 TRIATHLON, HR6826 TRIATHLON, HR6825 TRIATHLON, HR6824 TRIATHLON, HR6823 TRIATHLON, HR6822 TRIATHLON, HR6821 TRIATHLON, HR6820 TRIATHLON, HR6819 TRIATHLON, HR6818 TRIATHLON, HR6817 TRIATHLON, HR6816 TRIATHLON, HR6815 TRIATHLON, HR6814 - HR6845 TRIATHLON, FC6844 TRIATHLON, FC6843 TRIATHLON, FC6842 TRIATHLON, FC6841 - FC6845 TRIATHLONОдноразовые мешки-пылесборники ACTRUM изгот'

In [15]:
df_train['CommercialTypeName4'][159385]

'Пылесборник'

In [26]:
len(df_train)

197198

In [16]:
unique_brands = df_train["brand_name"].nunique(dropna=True)
missing_brands = df_train["brand_name"].isna().sum()

print(f"Уникальных брендов: {unique_brands}")
print(f"Пропущенных брендов: {missing_brands}")

Уникальных брендов: 4066
Пропущенных брендов: 80531


In [17]:
unique_brands = df_train["description"].nunique(dropna=True)
missing_brands = df_train["description"].isna().sum()

print(f"Уникальных описаний: {unique_brands}")
print(f"Пропущенных описаний: {missing_brands}")

Уникальных описаний: 114781
Пропущенных описаний: 26060


In [18]:
unique_brands = df_train["name_rus"].nunique(dropna=True)
missing_brands = df_train["name_rus"].isna().sum()

print(f"Уникальных карточек: {unique_brands}")
print(f"Пропущенных карточек: {missing_brands}")

Уникальных карточек: 154718
Пропущенных карточек: 0


In [19]:
unique_brands = df_train["CommercialTypeName4"].nunique(dropna=True)
missing_brands = df_train["CommercialTypeName4"].isna().sum()

print(f"Уникальных ком названий: {unique_brands}")
print(f"Пропущенных ком названий: {missing_brands}")

Уникальных ком названий: 634
Пропущенных ком названий: 0


# Функции очищения текстовых данных и создание фич.

## простые фичи

In [81]:
import re
import html
import unicodedata

# --- regex patterns ---
INVISIBLE_RE = re.compile(
    r"[\u200B-\u200F\u202A-\u202E\u2060-\u206F\uFEFF\uFFF9-\uFFFB]"
)
CONTROL_RE = re.compile(r"[\x00-\x1F\x7F]")
TAG_RE = re.compile(r"<[^>]+>")
WS_RE = re.compile(r"\s+")
URL_PATTERN = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)

# --- cleaning functions ---
def unescape_html(text: str) -> str:
    return html.unescape(text) if isinstance(text, str) else ""

def replace_specific_tags(text: str) -> str:
    return re.sub(r"(?i)</?(br|p|li|ul|ol|div|span)\b[^>]*>", " ", text) if isinstance(text, str) else text

def remove_html_tags(text: str) -> str:
    return TAG_RE.sub(" ", text) if isinstance(text, str) else text

def remove_invisible_chars(text: str) -> str:
    return INVISIBLE_RE.sub("", text) if isinstance(text, str) else text

def replace_control_chars(text: str) -> str:
    return CONTROL_RE.sub(" ", text) if isinstance(text, str) else text

def replace_special_chars(text: str) -> str:
    return re.sub(r"[•‣‥∙]", " ", text) if isinstance(text, str) else text

def collapse_whitespace(text: str) -> str:
    return WS_RE.sub(" ", text).strip() if isinstance(text, str) else text

def normalize_urls(text: str) -> str:
    def normalize_url(match):
        url = match.group(0)
        return url.split("?")[0]
    return URL_PATTERN.sub(normalize_url, text) if isinstance(text, str) else text

# --- dictionary of cleaning functions ---
CLEANING_FUNCTIONS = {
    "unescape_html": unescape_html,
    "replace_specific_tags": replace_specific_tags,
    "remove_html_tags": remove_html_tags,
    "remove_invisible_chars": remove_invisible_chars,
    "replace_control_chars": replace_control_chars,
    "replace_special_chars": replace_special_chars,
    "collapse_whitespace": collapse_whitespace,
    "normalize_urls": normalize_urls,
}

# --- main cleaner ---
def basic_clean_text(text: str) -> str:
    """
    Applies a series of cleaning functions to the input text in sequence.
    Returns the cleaned text.
    """
    if not isinstance(text, str):
        return ""
    
    result = text
    for name, func in CLEANING_FUNCTIONS.items():
        result = func(result)
    return result

def create_basic_cleaned_text(text_series: pd.Series) -> pd.Series:
    """Возвращает очищенную последовательность"""
    return text_series.apply(basic_clean_text)

In [None]:
# Пример проверки
description = df_train['description']
clened_description = create_basic_cleaned_text(description)
clened_description.head()

id
159385    Мешки пылесборники для пылесоса PHILIPS, 10 шт...
288616    Защитная силиконовая крышка обьектива GoPro He...
108090    Плоский медиатор из кости толщиной 0.6 мм Плос...
415607    Игра Sonic Frontiers для PlayStation 5, русски...
332391    Disney Classic Games: Aladdin and The Lion Kin...
Name: description, dtype: object

In [None]:
def count_capslock_words(text: str) -> int:
    return sum(
        1
        for word in text.split()
        if word.isalpha()
        and word.isupper()
        and len(word) > 5
        and not any(ch.isdigit() for ch in word)
    )

# --- regex patterns ---
URL_PATTERN = re.compile(r"(https?://\S+|www\.\S+)", re.IGNORECASE)
PHONE_PATTERN = re.compile(
    r"(?<!\d)(?:\+7|8)\s*[\-]?\s*\(?\d{3}\)?\s*[\-]?\s*\d{3}\s*[\-]?\s*\d{2}\s*[\-]?\s*\d{2}(?!\d)"
)
SKU_PATTERN = re.compile(r"\b(?:[A-ZА-Я0-9]{5,15})\b")
PRICE_PATTERN = re.compile(r"\b\d{1,6}\s*(?:руб|р|₽)\b", re.IGNORECASE)
EMOJI_PATTERN = re.compile(
    r'[\U0001F600-\U0001F64F'
    r'\U0001F300-\U0001F5FF'
    r'\U0001F680-\U0001F6FF'
    r'\U0001F1E0-\U0001F1FF'
    r'\U00002702-\U000027B0'
    r'\U000024C2-\U0001F251]+'
)
MESSENGERS = [
    "whatsapp", "telegram", "viber", "wechat", "signal", "icq", "вк",
    "вконтакте", "телеграм", " телега", "тг", "тгк", "instagram",
    "инст", "инста", "инстаграм"
]

# --- feature functions ---
def has_url(text: str): return int(bool(URL_PATTERN.search(text)))
def has_phone(text: str): return int(bool(PHONE_PATTERN.search(text)))
def has_messenger(text: str): return int(any(re.search(rf"\b{re.escape(m)}\b", text.lower()) for m in MESSENGERS))
def has_sku(text: str): return int(bool(SKU_PATTERN.search(text)))
def desc_len_chars(text: str): return len(text)
def desc_len_words(text: str): return len(text.split())
def capslock_word_count(text: str): return count_capslock_words(text)
def exclamation_count(text: str): return text.count('!') + text.count('‼')
def question_count(text: str): return text.count('?')
def avg_word_length(text: str):
    words = text.split()
    return sum(len(word) for word in words) / len(words) if words else 0.0
def has_price(text: str): return int(bool(PRICE_PATTERN.search(text)))
def upper_ratio(text: str):
    letters = [ch for ch in text if ch.isalpha()]
    return sum(1 for ch in letters if ch.isupper()) / len(letters) if letters else 0.0
def has_emoji(text: str): return int(bool(EMOJI_PATTERN.search(text)))
def emoji_count(text: str): return len(EMOJI_PATTERN.findall(text))


# --- dictionary of features ---
FEATURE_FUNCTIONS = {
    "has_url": has_url,
    # "has_phone": has_phone,
    "has_messenger": has_messenger,
    "has_sku": has_sku,
    "desc_len_chars": desc_len_chars,
    "desc_len_words": desc_len_words,
    "capslock_word_count": capslock_word_count,
    "exclamation_count": exclamation_count,
    "question_count": question_count,
    "avg_word_length": avg_word_length,
    "has_price": has_price,
    "upper_ratio": upper_ratio,
    "has_emoji": has_emoji,
    "emoji_count": emoji_count,
}


# --- main extractor ---
def extract_basic_text_features(text: str):
    if not isinstance(text, str):
        return {name: 0 if "ratio" not in name and "avg" not in name else 0.0 for name in FEATURE_FUNCTIONS}

    return {name: func(text) for name, func in FEATURE_FUNCTIONS.items()}


def create_basic_text_features(text_series: pd.Series) -> pd.DataFrame:
    """
    Takes a pandas Series of texts and returns a DataFrame with extracted features. Index preserved.
    """
    features_df = text_series.apply(extract_basic_text_features).apply(pd.Series)
    features_df.index = text_series.index
    return features_df

In [27]:
basic_text_features = create_basic_text_features(clened_description)
basic_text_features.head()

Unnamed: 0_level_0,has_url,has_phone,has_messenger,has_sku,desc_len_chars,desc_len_words,capslock_word_count,exclamation_count,question_count,avg_word_length,has_price,upper_ratio,has_emoji,emoji_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
159385,0.0,0.0,0.0,1.0,886.0,100.0,1.0,0.0,0.0,7.87,0.0,0.761566,0.0,0.0
288616,0.0,0.0,0.0,0.0,386.0,53.0,0.0,0.0,0.0,6.301887,0.0,0.018634,0.0,0.0
108090,0.0,0.0,0.0,0.0,200.0,33.0,0.0,0.0,0.0,5.090909,0.0,0.040268,0.0,0.0
415607,0.0,0.0,0.0,0.0,557.0,67.0,0.0,0.0,0.0,7.328358,0.0,0.027197,0.0,0.0
332391,0.0,0.0,0.0,0.0,540.0,70.0,0.0,0.0,0.0,6.728571,0.0,0.037778,0.0,0.0


In [54]:
basic_text_features.columns

Index(['has_url', 'has_phone', 'has_messenger', 'has_sku', 'desc_len_chars',
       'desc_len_words', 'capslock_word_count', 'exclamation_count',
       'question_count', 'avg_word_length', 'has_price', 'upper_ratio',
       'has_emoji', 'emoji_count'],
      dtype='object')

## семантика

In [None]:
import re
import unicodedata
import pandas as pd
from razdel import tokenize
from nltk.corpus import stopwords
import pymorphy3
from functools import lru_cache
import nltk

# Initialize resources
nltk.download('stopwords', quiet=True)
morph = pymorphy3.MorphAnalyzer()

# Cache lemmatization results
@lru_cache(maxsize=10000)
def cached_lemmatize(token: str) -> str:
    """Cached lemmatization using pymorphy3."""
    return morph.parse(token)[0].normal_form

# --- regex patterns ---
COMBINED_PATTERN = re.compile(
    r"(https?://\S+|www\.\S+|"
    r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF'
    r'\U0001F1E0-\U0001F1FF\U00002702-\U000027B0\U000024C2-\U0001F251]+|'
    r"\b\d+\b|[^\w\s])",
    re.IGNORECASE
)
WS_RE = re.compile(r"\s+")

# Stop words
STOP_WORDS = set(stopwords.words("russian"))

# --- cleaning functions ---
def lowercase_text(series: pd.Series) -> pd.Series:
    """Convert text to lowercase (vectorized)."""
    return series.str.lower()

def normalize_unicode(series: pd.Series) -> pd.Series:
    """Normalize Unicode characters to NFKC form (vectorized)."""
    return series.apply(lambda x: unicodedata.normalize("NFKC", x) if isinstance(x, str) else "")

def remove_combined_patterns(series: pd.Series) -> pd.Series:
    """Remove URLs, emojis, numbers, and punctuation in one pass."""
    return series.str.replace(COMBINED_PATTERN, " ", regex=True)

def collapse_whitespace(series: pd.Series) -> pd.Series:
    """Collapse multiple whitespaces into a single space (vectorized)."""
    return series.str.replace(WS_RE, " ", regex=True).str.strip()

def tokenize_text(text: str) -> list:
    """Tokenize text into words using razdel."""
    if not isinstance(text, str) or text == "":
        return []
    return [token.text for token in tokenize(text)]

def lemmatize_and_filter(tokens: list) -> list:
    """Lemmatize tokens, filter out non-alphabetic tokens and stop words."""
    if not tokens:
        return []
    return [
        cached_lemmatize(token)
        for token in tokens
        if token.isalpha() and token not in STOP_WORDS
    ]

def remove_consecutive_duplicates(tokens: list) -> list:
    """Remove consecutive duplicate words from the token list."""
    if not tokens:
        return []
    result = [tokens[0]]
    for i in range(1, len(tokens)):
        if tokens[i] != tokens[i-1]:
            result.append(tokens[i])
    return result

def join_tokens(tokens: list) -> str:
    """Join tokens back into a string with spaces."""
    return " ".join(tokens) if tokens else ""

# --- per-text cleaning function ---
def clean_text_semantic_single(text: str) -> str:
    """Process a single text through tokenization, lemmatization, and duplicate removal."""
    if not isinstance(text, str) or text == "":
        return ""
    
    # Tokenize
    tokens = tokenize_text(text)
    
    # Lemmatize and filter
    tokens = lemmatize_and_filter(tokens)
    
    # Remove consecutive duplicates
    tokens = remove_consecutive_duplicates(tokens)
    
    # Join tokens
    return join_tokens(tokens)

# --- dictionary of vectorized cleaning functions ---
VECTORIZED_CLEANING_FUNCTIONS = {
    "lowercase_text": lowercase_text,
    "normalize_unicode": normalize_unicode,
    "remove_combined_patterns": remove_combined_patterns,
    "collapse_whitespace": collapse_whitespace,
}

# --- main cleaner ---
def create_semantic_cleaned_text(text_series: pd.Series) -> pd.Series:
    """
    Returns a semantically cleaned pandas Series by applying vectorized and sequential cleaning steps.
    
    Args:
        text_series (pd.Series): Input Series of texts.
    
    Returns:
        pd.Series: Cleaned text Series with the same index.
    """
    if not isinstance(text_series, pd.Series):
        raise ValueError("Input must be a pandas Series")

    # Apply vectorized operations
    result = text_series
    for name, func in VECTORIZED_CLEANING_FUNCTIONS.items():
        result = func(result)
    
    # Apply tokenization, lemmatization, and duplicate removal
    cleaned_texts = result.apply(clean_text_semantic_single)
    
    # Return as Series with original index
    return pd.Series(cleaned_texts, index=text_series.index)

In [35]:
semantic_cleaned_text = create_semantic_cleaned_text(clened_description)

In [37]:
fast_semantic_cleaned_text = create_semantic_cleaned_text(clened_description)

In [41]:
fast_semantic_cleaned_text.head()

id
159385    мешок пылесборник пылесос philips шт синтетиче...
288616    защитный силиконовый крышка обьектив gopro her...
108090    плоский медиатор кость толщина мм плоский меди...
415607    игра sonic frontiers playstation русский субти...
332391    disney classic games aladdin and the lion king...
Name: description, dtype: object

In [43]:
clened_description[288616]

'Защитная силиконовая крышка обьектива GoPro Hero 5 / 6 / 7 - это незаменимый аксессуар для каждого владельца вышеупомянутых экшн-камер. Данная крышка рассчитана для защиты линзы объектива от грязи, пыли и царапин при хранении или переноски камеры. Изготовленная при использовании качественного материала, это крышка обладает высокой степенью прочности и прослужит вам максимально долго.'

In [42]:
fast_semantic_cleaned_text[288616]

'защитный силиконовый крышка обьектив gopro hero это незаменимый аксессуар каждый владелец вышеупомянутый экшн камера дать крышка рассчитать защита линза объектив грязь пыль царапина хранение переноска камера изготовить использование качественный материал это крышка обладать высокий степень прочность прослужить максимально долго'

In [None]:
# --- semantic lexicons ---
SUSPICIOUS_WORDS = {
    "реплика", "копия", "аналог", "подделка", "реселл", "фейк",
    "контрафакт", "дешево", "скидка", "акция", "оригинал", "коробка", "оригинальный",
    "позвонить", "написать", "перепродажа"
}
BRANDS = {
    "nike", "adidas", "gucci", "apple", "samsung", "rolex",
    "louisvuitton", "chanel", "prada", "reebok", "philips", "apple", "logitech"
}
URGENCY_WORDS = {
    "срочно", "быстро", "спешить", "поспешить", "ограниченный",
    "последний", "сегодня", "немедленно",
}

# --- feature functions ---
def has_suspicious_words(text: str):
    words = text.split()
    return int(any(word in SUSPICIOUS_WORDS for word in words))

def suspicious_word_count(text: str):
    words = text.split()
    return sum(1 for word in words if word in SUSPICIOUS_WORDS)

def has_brand(text: str):
    words = text.split()
    return int(any(word in BRANDS for word in words))

def brand_count(text: str):
    words = text.split()
    return sum(1 for word in words if word in BRANDS)

def has_urgency_words(text: str):
    words = text.split()
    return int(any(word in URGENCY_WORDS for word in words))

def urgency_word_count(text: str):
    words = text.split()
    return sum(1 for word in words if word in URGENCY_WORDS)

def unique_word_ratio(text: str):
    words = text.split()
    return len(set(words)) / len(words) if words else 0.0


# --- dictionary of semantic features ---
SEMANTIC_FEATURE_FUNCTIONS = {
    "has_suspicious_words": has_suspicious_words,
    "suspicious_word_count": suspicious_word_count,
    # "has_brand": has_brand,
    "brand_count": brand_count,
    "has_urgency_words": has_urgency_words,
    "urgency_word_count": urgency_word_count,
    "unique_word_ratio": unique_word_ratio,
}


# --- main extractor ---
def extract_semantic_features(text: str) -> dict:
    if not isinstance(text, str) or not text.strip():
        # zeros for counts, 0.0 for ratios
        return {
            name: 0 if "ratio" not in name else 0.0
            for name in SEMANTIC_FEATURE_FUNCTIONS
        }
    return {name: func(text) for name, func in SEMANTIC_FEATURE_FUNCTIONS.items()}


def create_semantic_features(text_series: pd.Series) -> pd.DataFrame:
    """
    Takes a pandas Series of pre-cleaned texts and returns a DataFrame with semantic features.
    Index is preserved.
    """
    features_df = text_series.apply(extract_semantic_features).apply(pd.Series)
    features_df.index = text_series.index
    return features_df


In [61]:
semantic_text_features = create_semantic_features(semantic_cleaned_text)
semantic_text_features.head()

Unnamed: 0_level_0,has_suspicious_words,suspicious_word_count,has_brand,brand_count,has_urgency_words,urgency_word_count,unique_word_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
159385,1.0,1.0,1.0,2.0,0.0,0.0,0.315789
288616,0.0,0.0,0.0,0.0,0.0,0.0,0.897436
108090,0.0,0.0,0.0,0.0,0.0,0.0,0.666667
415607,0.0,0.0,0.0,0.0,0.0,0.0,0.927273
332391,0.0,0.0,0.0,0.0,0.0,0.0,0.847458


### проверка бинарных признаков

In [59]:
def calculate_class_ratios_for_binary_features(
    df_train: pd.DataFrame,
    basic_text_features: pd.DataFrame,
    semantic_text_features: pd.DataFrame
) -> pd.DataFrame:
    """
    Calculates the class ratios for binary features where the feature value = 1.
    Combines basic and semantic features, adds the target 'resolution', and computes:
    - Overall class 1 ratio
    - Conditional class 1 ratio for each binary feature = 1
    - Lift (conditional ratio / overall ratio)
    - Support (number of instances where feature = 1)
    - Total number of instances

    Args:
        df_train (pd.DataFrame): DataFrame containing the target 'resolution' (0 or 1).
        basic_text_features (pd.DataFrame): DataFrame with basic text features.
        semantic_text_features (pd.DataFrame): DataFrame with semantic text features.

    Returns:
        pd.DataFrame: A DataFrame with the calculated metrics for each binary feature.
    """
    # Combine features from basic and semantic
    all_features = pd.concat([basic_text_features, semantic_text_features], axis=1)
    
    # Add target column
    df = pd.concat([all_features, df_train['resolution']], axis=1)
    
    # List of binary features (has_ prefixed)
    binary_features = [
        'has_suspicious_words', 'has_brand', 'has_urgency_words',
        'has_url', 'has_phone', 'has_messenger', 'has_sku',
        'has_price', 'has_emoji'
    ]
    
    # Ensure all binary features exist in the DataFrame
    missing_features = [feat for feat in binary_features if feat not in df.columns]
    if missing_features:
        raise ValueError(f"Missing binary features: {missing_features}")
    
    # Overall class 1 ratio
    overall_ratio = df['resolution'].mean()
    total_count = len(df)
    
    # Prepare results list
    results = []
    
    for feat in binary_features:
        # Subset where feature = 1
        subset = df[df[feat] == 1]
        support = len(subset)
        
        if support > 0:
            conditional_ratio = subset['resolution'].mean()
            lift = conditional_ratio / overall_ratio if overall_ratio > 0 else float('inf')
        else:
            conditional_ratio = None
            lift = None
        
        results.append({
            'feature': feat,
            'overall_ratio': overall_ratio,
            'conditional_ratio': conditional_ratio,
            'lift': lift,
            'support': support,
            'total_count': total_count
        })
    
    # Convert to DataFrame
    results_df = pd.DataFrame(results)
    
    # Print overall ratio once
    print(f"Overall class 1 ratio: {overall_ratio:.4f} (Total instances: {total_count})")
    
    return results_df

In [60]:
calculate_class_ratios_for_binary_features(df_train, semantic_text_features, basic_text_features)

Overall class 1 ratio: 0.0662 (Total instances: 197198)


Unnamed: 0,feature,overall_ratio,conditional_ratio,lift,support,total_count
0,has_suspicious_words,0.066187,0.082569,1.247503,1417,197198
1,has_brand,0.066187,0.064433,0.973495,388,197198
2,has_urgency_words,0.066187,0.123792,1.870322,4241,197198
3,has_url,0.066187,0.4,6.043457,10,197198
4,has_phone,0.066187,0.0,0.0,19,197198
5,has_messenger,0.066187,0.017964,0.271413,167,197198
6,has_sku,0.066187,0.044599,0.673827,91998,197198
7,has_price,0.066187,0.116071,1.753682,560,197198
8,has_emoji,0.066187,0.111444,1.683764,3006,197198


## Фичи tf-idf

In [72]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def create_tfidf_features(
    texts_char: pd.Series,
    texts_word: pd.Series,
    mode: str = 'train',
    tfidf_char_vectorizer=None,
    tfidf_word_vectorizer=None,
    max_tfidf_features: int = 1000,
    char_ngram_range: tuple = (3, 5),
    word_ngram_range: tuple = (1, 3)
) -> tuple:
    """
    Creates TF-IDF features for character and word n-grams from two separate pandas Series of texts.
    For 'train' mode, fits and transforms the TF-IDF vectorizers.
    For 'test' mode, only transforms using the provided fitted vectorizers.

    Args:
        texts_char (pd.Series): Series of cleaned text for character-level TF-IDF (e.g., from basic_clean_text).
        texts_word (pd.Series): Series of cleaned text for word-level TF-IDF (e.g., from clean_text_semantic).
        mode (str): 'train' to fit and transform, 'test' to only transform.
        tfidf_char_vectorizer: Fitted TfidfVectorizer for character n-grams (required for test mode).
        tfidf_word_vectorizer: Fitted TfidfVectorizer for word n-grams (required for test mode).
        max_tfidf_features (int): Maximum number of TF-IDF features to generate.
        char_ngram_range (tuple): N-gram range for character-level TF-IDF.
        word_ngram_range (tuple): N-gram range for word-level TF-IDF.

    Returns:
        tuple: (tfidf_char_df, tfidf_word_df, tfidf_char_vectorizer, tfidf_word_vectorizer)
            - tfidf_char_df: DataFrame with character TF-IDF features.
            - tfidf_word_df: DataFrame with word TF-IDF features.
            - tfidf_char_vectorizer: Fitted (train) or input (test) character vectorizer.
            - tfidf_word_vectorizer: Fitted (train) or input (test) word vectorizer.

    Raises:
        ValueError: If mode is 'test' and vectorizers are not provided, or if mode is invalid.
        ValueError: If indices of texts_char and texts_word do not match.
    """
    # Validate mode
    if mode not in ['train', 'test']:
        raise ValueError("Mode must be 'train' or 'test'")

    # Validate vectorizers for test mode
    if mode == 'test' and (tfidf_char_vectorizer is None or tfidf_word_vectorizer is None):
        raise ValueError("Fitted vectorizers must be provided for test mode")

    # Validate index alignment
    if not texts_char.index.equals(texts_word.index):
        raise ValueError("Indices of texts_char and texts_word must match")

    # Initialize vectorizers for training mode
    if mode == 'train':
        tfidf_char_vectorizer = TfidfVectorizer(
            analyzer="char",
            ngram_range=char_ngram_range,
            max_features=max_tfidf_features
        )
        tfidf_word_vectorizer = TfidfVectorizer(
            analyzer="word",
            ngram_range=word_ngram_range,
            max_features=max_tfidf_features
        )

    # Transform texts (fit_transform for train, transform for test)
    if mode == 'train':
        tfidf_char_matrix = tfidf_char_vectorizer.fit_transform(texts_char)
        tfidf_word_matrix = tfidf_word_vectorizer.fit_transform(texts_word)
    else:
        tfidf_char_matrix = tfidf_char_vectorizer.transform(texts_char)
        tfidf_word_matrix = tfidf_word_vectorizer.transform(texts_word)

    # Convert to DataFrames with feature names and preserve index
    tfidf_char_df = pd.DataFrame(
        tfidf_char_matrix.toarray(),
        columns=[f"tfidf_char_{i}" for i in range(tfidf_char_matrix.shape[1])],
        index=texts_char.index
    )
    tfidf_word_df = pd.DataFrame(
        tfidf_word_matrix.toarray(),
        columns=[f"tfidf_word_{i}" for i in range(tfidf_word_matrix.shape[1])],
        index=texts_word.index
    )

    return tfidf_char_df, tfidf_word_df, tfidf_char_vectorizer, tfidf_word_vectorizer


In [None]:
create_tfidf_features(clened_description, )

In [73]:
train_texts_clean = clened_description  # Your cleaned text Series (e.g., from basic_clean_text)
train_texts_semantic = fast_semantic_cleaned_text  # Your semantic text Series (e.g., from clean_text_semantic)
tfidf_char_train, tfidf_word_train, char_vectorizer, word_vectorizer = create_tfidf_features(
    texts_char=train_texts_clean,
    texts_word=train_texts_semantic,
    mode='train',
    max_tfidf_features=1000
)

# # For test data
# test_texts_clean = pd.Series(...)  # Your cleaned test text Series
# test_texts_semantic = pd.Series(...)  # Your semantic test text Series
# tfidf_char_test, tfidf_word_test, _, _ = create_tfidf_features(
#     texts_char=test_texts_clean,
#     texts_word=test_texts_semantic,
#     mode='test',
#     tfidf_char_vectorizer=char_vectorizer,
#     tfidf_word_vectorizer=word_vectorizer,
#     max_tfidf_features=1000
# )

Unnamed: 0_level_0,tfidf_char_0,tfidf_char_1,tfidf_char_2,tfidf_char_3,tfidf_char_4,tfidf_char_5,tfidf_char_6,tfidf_char_7,tfidf_char_8,tfidf_char_9,...,tfidf_char_990,tfidf_char_991,tfidf_char_992,tfidf_char_993,tfidf_char_994,tfidf_char_995,tfidf_char_996,tfidf_char_997,tfidf_char_998,tfidf_char_999
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
159385,0.101434,0.0,0.060078,0.0,0.0,0.0,0.064846,0.066245,0.066317,0.0,...,0.0,0.08463,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
288616,0.04998,0.148691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0417,0.045871,0.0,0.0,0.0,0.0,0.0,0.0,0.0
108090,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
415607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.040189,0.0,0.0,0.098485,0.0,0.0,0.0,0.0
332391,0.0,0.0,0.054698,0.072564,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.042379,0.0,0.0,0.051925,0.0,0.0,0.0,0.0


## создание всех фич

In [89]:
def create_flags_features(df: pd.DataFrame, flag_columns: list = None) -> pd.DataFrame:
    """
    Creates binary flag features indicating None/NaN values for specified columns.
    
    Args:
        df (pd.DataFrame): Input DataFrame.
        flag_columns (list): List of column names to check for None/NaN.
                           Defaults to ['brand_name', 'description', 'name_rus', 'CommercialTypeName4'].
    
    Returns:
        pd.DataFrame: DataFrame with binary columns (e.g., 'is_none_<column>') where 1 indicates
                      None/NaN or missing column, 0 otherwise.
    """
    if flag_columns is None:
        flag_columns = ['brand_name', 'description', 'name_rus', 'CommercialTypeName4']
    
    none_flags = pd.DataFrame(index=df.index)
    for col in flag_columns:
        if col in df.columns:
            none_flags[f'is_none_{col}'] = df[col].isna().astype(int)
        else:
            none_flags[f'is_none_{col}'] = 1  # If column is missing, assume all None
    
    return none_flags

In [90]:
def create_all_text_features(
    df: pd.DataFrame,
    mode: str = 'train',
    max_tfidf_features: int = 1000,
    char_ngram_range=(3, 5),
    word_ngram_range=(1, 3),
    tfidf_char_vectorizer=None,
    tfidf_word_vectorizer=None
) -> tuple:
    """
    Creates all text features: basic, semantic, and TF-IDF, from df['description'].
    
    Args:
        df (pd.DataFrame): DataFrame containing the 'description' column.
        mode (str): 'train' to fit and transform, 'test' to only transform.
        max_tfidf_features (int): Maximum number of TF-IDF features.
        tfidf_char_vectorizer: Fitted TfidfVectorizer for character n-grams (test mode).
        tfidf_word_vectorizer: Fitted TfidfVectorizer for word n-grams (test mode).
    
    Returns:
        tuple: (all_features_df, tfidf_char_vectorizer, tfidf_word_vectorizer)
            - all_features_df: DataFrame with concatenated basic, semantic, and TF-IDF features.
            - tfidf_char_vectorizer: Fitted or input character vectorizer.
            - tfidf_word_vectorizer: Fitted or input word vectorizer.
    
    Raises:
        ValueError: If 'description' column is missing, mode is invalid, or vectorizers are missing in test mode.
    """
    if 'description' not in df.columns:
        raise ValueError("df must contain a 'description' column")
    if mode not in ['train', 'test']:
        raise ValueError("Mode must be 'train' or 'test'")
    if mode == 'test' and (tfidf_char_vectorizer is None or tfidf_word_vectorizer is None):
        raise ValueError("Fitted vectorizers must be provided for test mode")

    # Create None flags
    none_flags = create_flags_features(df)

    # Extract description
    description = df['description']

    # Basic cleaning
    basic_cleaned_description = create_basic_cleaned_text(description)

    # Basic text features
    basic_text_features = create_basic_text_features(basic_cleaned_description)

    # Semantic cleaning
    semantic_cleaned_text = create_semantic_cleaned_text(basic_cleaned_description)

    # Semantic features
    semantic_text_features = create_semantic_features(semantic_cleaned_text)

    # TF-IDF features
    tfidf_char_df, tfidf_word_df, tfidf_char_vectorizer, tfidf_word_vectorizer = create_tfidf_features(
        texts_char=basic_cleaned_description,
        texts_word=semantic_cleaned_text,
        mode=mode,
        tfidf_char_vectorizer=tfidf_char_vectorizer,
        tfidf_word_vectorizer=tfidf_word_vectorizer,
        max_tfidf_features=max_tfidf_features,
        char_ngram_range=char_ngram_range,
        word_ngram_range=word_ngram_range
    )

    # Concatenate all features
    all_features_df = pd.concat(
        [none_flags, basic_text_features, semantic_text_features, tfidf_char_df, tfidf_word_df],
        axis=1
    )

    return all_features_df, tfidf_char_vectorizer, tfidf_word_vectorizer

In [91]:
all_features_train, char_vectorizer, word_vectorizer = create_all_text_features(
    df=df_train,
    mode='train',
    max_tfidf_features=1000,
    char_ngram_range=(3, 5),
    word_ngram_range=(1, 3)
)

In [92]:
all_features_train.head()

Unnamed: 0_level_0,is_none_brand_name,is_none_description,is_none_name_rus,is_none_CommercialTypeName4,has_url,has_phone,has_messenger,has_sku,desc_len_chars,desc_len_words,...,tfidf_word_990,tfidf_word_991,tfidf_word_992,tfidf_word_993,tfidf_word_994,tfidf_word_995,tfidf_word_996,tfidf_word_997,tfidf_word_998,tfidf_word_999
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
159385,0,0,0,0,0.0,0.0,0.0,1.0,886.0,100.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
288616,0,0,0,0,0.0,0.0,0.0,0.0,386.0,53.0,...,0.0,0.25649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
108090,0,0,0,0,0.0,0.0,0.0,0.0,200.0,33.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
415607,1,0,0,0,0.0,0.0,0.0,0.0,557.0,67.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
332391,1,0,0,0,0.0,0.0,0.0,0.0,540.0,70.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [84]:
all_features_train.head()

Unnamed: 0_level_0,has_url,has_phone,has_messenger,has_sku,desc_len_chars,desc_len_words,capslock_word_count,exclamation_count,question_count,avg_word_length,...,tfidf_word_990,tfidf_word_991,tfidf_word_992,tfidf_word_993,tfidf_word_994,tfidf_word_995,tfidf_word_996,tfidf_word_997,tfidf_word_998,tfidf_word_999
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
159385,0.0,0.0,0.0,1.0,886.0,100.0,1.0,0.0,0.0,7.87,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
288616,0.0,0.0,0.0,0.0,386.0,53.0,0.0,0.0,0.0,6.301887,...,0.0,0.25649,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
108090,0.0,0.0,0.0,0.0,200.0,33.0,0.0,0.0,0.0,5.090909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
415607,0.0,0.0,0.0,0.0,557.0,67.0,0.0,0.0,0.0,7.328358,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
332391,0.0,0.0,0.0,0.0,540.0,70.0,0.0,0.0,0.0,6.728571,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# создание текстовых + числовых фич

In [50]:
import time
import functools

def timing_decorator(func):
    """
    Декоратор для замера времени выполнения функции.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        start_time = time.perf_counter()
        result = func(*args, **kwargs)
        end_time = time.perf_counter()
        execution_time = end_time - start_time
        print(f"Функция {func.__name__} выполнена за {execution_time:.6f} секунд")
        return result
    return wrapper

In [51]:
@timing_decorator
def prepare_cleaned_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """
    Подготавливает очищенный датасет с числовыми признаками и текстовыми фичами из description.
    Удаляет целевую переменную 'resolution' перед обработкой.

    Parameters:
    - df: pd.DataFrame, исходный датафрейм с колонкой description

    Returns:
    - pd.DataFrame с числовыми признаками (кроме resolution) и текстовыми фичами
    """
    # Создаем копию датафрейма
    df_clean = df.copy()
    original_index = df_clean.index

    # Удаляем целевую переменную 'resolution', если она есть
    if 'resolution' in df_clean.columns:
        df_clean = df_clean.drop(columns=['resolution'])

    # Применяем очистку текста
    df_clean["description_clean"] = df_clean["description"].apply(clean_text)
    df_clean["description_semantic"] = df_clean["description"].apply(clean_text_semantic)

    # Извлекаем структурные и семантические признаки
    structural_features = df_clean["description_clean"].apply(extract_features)
    semantic_features = df_clean["description_semantic"].apply(extract_semantic_features)

    # Преобразуем признаки в датафреймы
    structural_features_df = pd.json_normalize(structural_features).set_index(original_index)
    semantic_features_df = pd.json_normalize(semantic_features).set_index(original_index)

    # Выбираем числовые столбцы из исходного датафрейма (кроме 'resolution')
    numeric_columns = df_clean.select_dtypes(include=[np.number]).columns.tolist()
    numeric_columns = [col for col in numeric_columns if col != 'resolution']

    # Оставляем только числовые столбцы
    df_numeric = df_clean[numeric_columns].fillna(0)
    
    # Объединяем числовые столбцы с текстовыми признаками
    df_result = pd.concat([df_numeric, structural_features_df, semantic_features_df], axis=1)
    
    return df_result

In [41]:
# Загрузка данных
df_train = pd.read_csv('ml_ozon_сounterfeit_train.csv', index_col=0)
# Подготовка очищенного датасета
y_train = df_train['resolution']
X_train = prepare_cleaned_dataset(df_train)

In [46]:
len(X_train), len(y_train), len(df_train)

(308417, 197198, 197198)

197198

In [42]:

X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_split, y_train_split)

val_pred = model.predict(X_val_split)
val_accuracy = accuracy_score(y_val_split, val_pred)

print(f"Validation accuracy: {val_accuracy:.4f}")
print("Classification report:")
print(classification_report(y_val_split, val_pred))
print()


ValueError: Found input variables with inconsistent numbers of samples: [308417, 197198]

In [52]:
df = df_train
df_clean = df.copy()

# Удаляем целевую переменную 'resolution', если она есть
if 'resolution' in df_clean.columns:
    df_clean = df_clean.drop(columns=['resolution'])

In [65]:
df_clean.head()

Unnamed: 0_level_0,brand_name,description,name_rus,CommercialTypeName4,rating_1_count,rating_2_count,rating_3_count,rating_4_count,rating_5_count,comments_published_count,...,ExemplarReturnedValueTotal7,ExemplarReturnedValueTotal30,ExemplarReturnedValueTotal90,ItemVarietyCount,ItemAvailableCount,seller_time_alive,ItemID,SellerID,description_clean,description_semantic
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
159385,ACTRUM,"Мешки пылесборники для пылесоса PHILIPS, 10 шт...","Мешки для пылесоса PHILIPS TRIATLON, синтетиче...",Пылесборник,6.0,4.0,4.0,3.0,32.0,3.0,...,730.171845,896.528847,1043.118191,1.0,1.0,1860.0,78312,1218,"Мешки пылесборники для пылесоса PHILIPS, 10 шт...",мешок пылесборник пылесос philips шт синтетиче...
288616,Red Line,Защитная силиконовая крышка обьектива GoPro He...,Защитная крышка Redline на экшн-камеру GoPro (...,Крышка для объектива,,,,,,,...,993.043882,1137.421611,1188.608,1.0,1.0,1757.0,141999,1374,Защитная силиконовая крышка обьектива GoPro He...,защитный силиконовый крышка обьектив gopro her...
108090,Talwar Brothers,Плоский медиатор из кости толщиной 0.6 мм<br/>...,Медиатор для гитары Acura GP-PB6,Аксессуар для музыкального инструмента,0.0,0.0,1.0,0.0,1.0,0.0,...,800.822138,1174.069505,1224.798286,1.0,1.0,1722.0,53306,1448,Плоский медиатор из кости толщиной 0.6 мм Плос...,плоский медиатор кость толщина мм плоский меди...
415607,,"Игра Sonic Frontiers для PlayStation 5, русски...","Игра Sonic Frontiers для PlayStation 5, русски...",Видеоигра,,,,,,,...,0.0,913.530121,982.789171,3.0,3.0,1692.0,202599,715,"Игра Sonic Frontiers для PlayStation 5, русски...",игра sonic frontiers playstation русский субти...
332391,,Disney Classic Games: Aladdin and The Lion Kin...,"Игра Aladdin and Lion King (PlayStation 4, анг...",Видеоигра,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,913.54217,982.783783,3.0,3.0,1692.0,163725,715,Disney Classic Games: Aladdin and The Lion Kin...,disney classic games aladdin and the lion king...


In [53]:
df_clean["description_clean"] = df_clean["description"].apply(clean_text)

In [None]:
df_clean["description_semantic"] = df_clean["description"].apply(clean_text_semantic)

In [None]:
# Извлекаем структурные и семантические признаки
structural_features = df_clean["description_clean"].apply(extract_features)
semantic_features = df_clean["description_semantic"].apply(extract_semantic_features)

In [67]:
# Преобразуем признаки в датафреймы
structural_features_df = pd.json_normalize(structural_features).set_index(original_index)
semantic_features_df = pd.json_normalize(semantic_features).set_index(original_index)

In [57]:
len(structural_features_df), len(semantic_features_df)

(197198, 197198)

In [68]:
structural_features_df.head()

Unnamed: 0_level_0,has_url,has_sku,desc_len_chars,desc_len_words,capslock_word_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
159385,0,1,886,100,1
288616,0,0,386,53,0
108090,0,0,200,33,0
415607,0,0,557,67,0
332391,0,0,540,70,0


In [69]:
semantic_features_df.head()

Unnamed: 0_level_0,has_suspicious_words,suspicious_word_count,has_brand,brand_count,has_urgency_words,urgency_word_count,unique_word_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
159385,0,0,0,0,0,0,0.315789
288616,0,0,0,0,0,0,0.897436
108090,0,0,0,0,0,0,0.666667
415607,0,0,0,0,0,0,0.927273
332391,0,0,0,0,0,0,0.847458


In [60]:
numeric_columns = df_clean.select_dtypes(include=[np.number]).columns.tolist()
numeric_columns = [col for col in numeric_columns if col != 'resolution']

In [63]:
df_numeric = df_clean[numeric_columns].fillna(0)

In [64]:
df_numeric.head()

Unnamed: 0_level_0,rating_1_count,rating_2_count,rating_3_count,rating_4_count,rating_5_count,comments_published_count,photos_published_count,videos_published_count,PriceDiscounted,item_time_alive,...,ExemplarReturnedCountTotal30,ExemplarReturnedCountTotal90,ExemplarReturnedValueTotal7,ExemplarReturnedValueTotal30,ExemplarReturnedValueTotal90,ItemVarietyCount,ItemAvailableCount,seller_time_alive,ItemID,SellerID
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
159385,6.0,4.0,4.0,3.0,32.0,3.0,6.0,0.0,688.436773,195,...,11.0,50.0,730.171845,896.528847,1043.118191,1.0,1.0,1860.0,78312,1218
288616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,663.157297,972,...,26.0,54.0,993.043882,1137.421611,1188.608,1.0,1.0,1757.0,141999,1374
108090,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,606.573197,1596,...,16.0,34.0,800.822138,1174.069505,1224.798286,1.0,1.0,1722.0,53306,1448
415607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,856.755162,386,...,3.0,6.0,0.0,913.530121,982.789171,3.0,3.0,1692.0,202599,715
332391,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,822.274833,428,...,3.0,6.0,0.0,913.54217,982.783783,3.0,3.0,1692.0,163725,715


In [66]:
original_index = df_clean.index

In [70]:
df_result = pd.concat([df_numeric, structural_features_df, semantic_features_df], axis=1)

In [71]:
len(df_result)

197198

In [74]:
y_train = df_train['resolution']
X_train_extended = df_result

In [99]:
from sklearn.metrics import f1_score

def run_experiment(
    X_train_base, 
    X_train_extended, 
    y_train, 
    model_base, 
    model_extend,
    descriptions_clean,       # очищенный текст для символьного tfidf
    descriptions_semantic,    # очищенный текст для словного tfidf
    max_tfidf_features=1000
):
    # Разделение на train/val
    (
        X_train_base_split, X_val_base_split,
        X_train_ext_split, X_val_ext_split,
        y_train_split, y_val_split,
        desc_train_clean, desc_val_clean,
        desc_train_sem, desc_val_sem
    ) = train_test_split(
        X_train_base, X_train_extended, y_train,
        descriptions_clean, descriptions_semantic,
        test_size=0.2, random_state=42, stratify=y_train
    )

    # === TF-IDF: символьные n-граммы ===
    tfidf_char = TfidfVectorizer(analyzer="char", ngram_range=(3, 5), max_features=max_tfidf_features)
    tfidf_char_train = tfidf_char.fit_transform(desc_train_clean)
    tfidf_char_val = tfidf_char.transform(desc_val_clean)

    tfidf_char_train_df = pd.DataFrame(tfidf_char_train.toarray(),
                                       columns=[f"tfidf_char_{i}" for i in range(tfidf_char_train.shape[1])],
                                       index=X_train_base_split.index)
    tfidf_char_val_df = pd.DataFrame(tfidf_char_val.toarray(),
                                     columns=[f"tfidf_char_{i}" for i in range(tfidf_char_val.shape[1])],
                                     index=X_val_base_split.index)

    # === TF-IDF: словные n-граммы ===
    tfidf_word = TfidfVectorizer(analyzer="word", ngram_range=(1, 3), max_features=max_tfidf_features)
    tfidf_word_train = tfidf_word.fit_transform(desc_train_sem)
    tfidf_word_val = tfidf_word.transform(desc_val_sem)

    tfidf_word_train_df = pd.DataFrame(tfidf_word_train.toarray(),
                                       columns=[f"tfidf_word_{i}" for i in range(tfidf_word_train.shape[1])],
                                       index=X_train_base_split.index)
    tfidf_word_val_df = pd.DataFrame(tfidf_word_val.toarray(),
                                     columns=[f"tfidf_word_{i}" for i in range(tfidf_word_val.shape[1])],
                                     index=X_val_base_split.index)

    # === Дополняем extended-признаки TF-IDF ===
    X_train_ext_split = pd.concat([X_train_ext_split, tfidf_char_train_df, tfidf_word_train_df], axis=1)
    X_val_ext_split = pd.concat([X_val_ext_split, tfidf_char_val_df, tfidf_word_val_df], axis=1)

    # === Базовая модель (без tfidf) ===
    model_base.fit(X_train_base_split, y_train_split)
    val_pred_base = model_base.predict(X_val_base_split)
    val_f1_base = f1_score(y_val_split, val_pred_base, pos_label=1)

    # === Расширенная модель (с tfidf) ===
    model_extend.fit(X_train_ext_split, y_train_split)
    val_pred_ext = model_extend.predict(X_val_ext_split)
    val_f1_ext = f1_score(y_val_split, val_pred_ext, pos_label=1)

    # === Вывод ===
    print("=== Базовый набор (только числовые признаки) ===")
    print(f"Validation f1: {val_f1_base:.6f}")
    print("Classification report:")
    print(classification_report(y_val_split, val_pred_base))
    print()

    print("=== Расширенный набор (числовые + tfidf признаки) ===")
    print(f"Validation f1: {val_f1_ext:.6f}")
    print("Classification report:")
    print(classification_report(y_val_split, val_pred_ext))
    print()

    print(f"Улучшение f1: {(val_f1_ext - val_f1_base):.6f}")

    return tfidf_char, tfidf_word

In [75]:
X_train_extended.head()

Unnamed: 0_level_0,rating_1_count,rating_2_count,rating_3_count,rating_4_count,rating_5_count,comments_published_count,photos_published_count,videos_published_count,PriceDiscounted,item_time_alive,...,desc_len_chars,desc_len_words,capslock_word_count,has_suspicious_words,suspicious_word_count,has_brand,brand_count,has_urgency_words,urgency_word_count,unique_word_ratio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
159385,6.0,4.0,4.0,3.0,32.0,3.0,6.0,0.0,688.436773,195,...,886,100,1,0,0,0,0,0,0,0.315789
288616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,663.157297,972,...,386,53,0,0,0,0,0,0,0,0.897436
108090,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,606.573197,1596,...,200,33,0,0,0,0,0,0,0,0.666667
415607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,856.755162,386,...,557,67,0,0,0,0,0,0,0,0.927273
332391,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,822.274833,428,...,540,70,0,0,0,0,0,0,0,0.847458


In [76]:
X_train_base = df_numeric

In [78]:
X_train_base.head()

Unnamed: 0_level_0,rating_1_count,rating_2_count,rating_3_count,rating_4_count,rating_5_count,comments_published_count,photos_published_count,videos_published_count,PriceDiscounted,item_time_alive,...,ExemplarReturnedCountTotal30,ExemplarReturnedCountTotal90,ExemplarReturnedValueTotal7,ExemplarReturnedValueTotal30,ExemplarReturnedValueTotal90,ItemVarietyCount,ItemAvailableCount,seller_time_alive,ItemID,SellerID
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
159385,6.0,4.0,4.0,3.0,32.0,3.0,6.0,0.0,688.436773,195,...,11.0,50.0,730.171845,896.528847,1043.118191,1.0,1.0,1860.0,78312,1218
288616,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,663.157297,972,...,26.0,54.0,993.043882,1137.421611,1188.608,1.0,1.0,1757.0,141999,1374
108090,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,606.573197,1596,...,16.0,34.0,800.822138,1174.069505,1224.798286,1.0,1.0,1722.0,53306,1448
415607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,856.755162,386,...,3.0,6.0,0.0,913.530121,982.789171,3.0,3.0,1692.0,202599,715
332391,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,822.274833,428,...,3.0,6.0,0.0,913.54217,982.783783,3.0,3.0,1692.0,163725,715


In [87]:
model_base = RandomForestClassifier(n_estimators=100, random_state=42)
model_ext = RandomForestClassifier(n_estimators=100, random_state=42)
run_experiment(X_train_base, X_train_extended, y_train, model_base, model_ext)

=== Базовый набор (только числовые признаки) ===
Validation f1: 0.674733
Classification report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     36830
           1       0.89      0.54      0.67      2610

    accuracy                           0.97     39440
   macro avg       0.93      0.77      0.83     39440
weighted avg       0.96      0.97      0.96     39440


=== Расширенный набор (числовые + текстовые признаки) ===
Validation f1: 0.6798
Classification report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     36830
           1       0.91      0.54      0.68      2610

    accuracy                           0.97     39440
   macro avg       0.94      0.77      0.83     39440
weighted avg       0.96      0.97      0.96     39440


Улучшение f1: 0.0050


In [89]:
from catboost import CatBoostClassifier
model_base = CatBoostClassifier(iterations=100, random_seed=42, verbose=0, auto_class_weights='Balanced', cat_features=['ItemID', 'SellerID'])
model_ext = CatBoostClassifier(iterations=100, random_seed=42, verbose=0, auto_class_weights='Balanced', cat_features=['ItemID', 'SellerID'])
run_experiment(X_train_base, X_train_extended, y_train, model_base, model_ext)

=== Базовый набор (только числовые признаки) ===
Validation f1: 0.616569
Classification report:
              precision    recall  f1-score   support

           0       0.99      0.93      0.96     36830
           1       0.47      0.89      0.62      2610

    accuracy                           0.93     39440
   macro avg       0.73      0.91      0.79     39440
weighted avg       0.96      0.93      0.94     39440


=== Расширенный набор (числовые + текстовые признаки) ===
Validation f1: 0.6156
Classification report:
              precision    recall  f1-score   support

           0       0.99      0.93      0.96     36830
           1       0.47      0.89      0.62      2610

    accuracy                           0.93     39440
   macro avg       0.73      0.91      0.79     39440
weighted avg       0.96      0.93      0.94     39440


Улучшение f1: -0.0010


# добавить tf-idf

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer
max_tfidf_features = 1000

In [100]:
from catboost import CatBoostClassifier
model_base = CatBoostClassifier(iterations=100, random_seed=42, verbose=0, auto_class_weights='Balanced', cat_features=['ItemID', 'SellerID'])
model_ext = CatBoostClassifier(iterations=100, random_seed=42, verbose=0, auto_class_weights='Balanced', cat_features=['ItemID', 'SellerID'])
run_experiment(X_train_base, X_train_extended, y_train, model_base, model_ext, df_clean["description_clean"], df_clean["description_semantic"], 1000)

=== Базовый набор (только числовые признаки) ===
Validation f1: 0.616569
Classification report:
              precision    recall  f1-score   support

           0       0.99      0.93      0.96     36830
           1       0.47      0.89      0.62      2610

    accuracy                           0.93     39440
   macro avg       0.73      0.91      0.79     39440
weighted avg       0.96      0.93      0.94     39440


=== Расширенный набор (числовые + tfidf признаки) ===
Validation f1: 0.651605
Classification report:
              precision    recall  f1-score   support

           0       0.99      0.94      0.96     36830
           1       0.51      0.91      0.65      2610

    accuracy                           0.94     39440
   macro avg       0.75      0.92      0.81     39440
weighted avg       0.96      0.94      0.94     39440


Улучшение f1: 0.035036


(TfidfVectorizer(analyzer='char', max_features=1000, ngram_range=(3, 5)),
 TfidfVectorizer(max_features=1000, ngram_range=(1, 3)))

In [None]:
from catboost import CatBoostClassifier
model_base = CatBoostClassifier(iterations=100, random_seed=42, verbose=0, auto_class_weights='Balanced', cat_features=['ItemID', 'SellerID'])
model_ext = CatBoostClassifier(iterations=100, random_seed=42, verbose=0, auto_class_weights='Balanced', cat_features=['ItemID', 'SellerID'])
run_experiment(X_train_base, X_train_extended, y_train, model_base, model_ext, df_clean["description_clean"], df_clean["description_semantic"], 4000)

In [None]:
from catboost import CatBoostClassifier
model_base = CatBoostClassifier(iterations=100, random_seed=42, verbose=0, auto_class_weights='Balanced', cat_features=['ItemID', 'SellerID'])
model_ext = CatBoostClassifier(iterations=100, random_seed=42, verbose=0, auto_class_weights='Balanced', cat_features=['ItemID', 'SellerID'])
run_experiment(X_train_base, X_train_extended, y_train, model_base, model_ext, df_clean["description_clean"], df_clean["description_semantic"], 4000)