# Пример создания текстовых фичей

In [1]:
import pandas as pd
from text_features import create_all_text_features

# Load the train/test tables, using the first CSV column as the index.
# Forward slashes are used so the relative paths resolve on Windows as well as
# on Linux/macOS (a backslash is only a path separator on Windows).
# NOTE(review): the first letter of "сounterfeit" in these file names appears
# to be a Cyrillic "с" -- keep it byte-identical to the files on disk.
df_train = pd.read_csv('data/ml_ozon_сounterfeit_train.csv', index_col=0)
df_test = pd.read_csv('data/ml_ozon_сounterfeit_test.csv', index_col=0)

# Report dataset sizes and how the 'resolution' target is distributed in train.
print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")
print("Target distribution in train:")
print(df_train['resolution'].value_counts())
print()

# Build the training feature matrix. In 'train' mode the call also returns the
# char- and word-level vectorizers, which are kept so they can be passed to the
# later 'test'-mode call.
print("Generating features for training set...")
all_features_train, char_vectorizer, word_vectorizer = create_all_text_features(
    df=df_train,
    mode='train',
    max_tfidf_features=1000,
    char_ngram_range=(3, 5),
    word_ngram_range=(1, 3)
)

# Show the resulting shape and a small preview (first 5 rows, first 10 columns).
print(f"Training features shape: {all_features_train.shape}")
print("Sample of training features (first 5 rows, first 10 columns):")
print(all_features_train.iloc[:, :10].head())
print()


Train shape: (197198, 44)
Test shape: (22760, 43)
Target distribution in train:
resolution
0    184146
1     13052
Name: count, dtype: int64

Generating features for training set...
Training features shape: (197198, 2025)
Sample of training features (first 5 rows, first 10 columns):
        is_none_brand_name  is_none_description  is_none_name_rus  \
id                                                                  
159385                   0                    0                 0   
288616                   0                    0                 0   
108090                   0                    0                 0   
415607                   1                    0                 0   
332391                   1                    0                 0   

        is_none_CommercialTypeName4  has_url  has_phone  has_messenger  \
id                                                                       
159385                            0      0.0        0.0            0.0   
288616    

In [2]:
# Build the test feature matrix in 'test' mode, handing over the char/word
# vectorizers returned by the training call instead of creating new ones.
# The two vectorizers returned here are not needed and are discarded.
print("Generating features for test set...")
all_features_test, _, _ = create_all_text_features(
    df=df_test,
    mode='test',
    tfidf_char_vectorizer=char_vectorizer,
    tfidf_word_vectorizer=word_vectorizer,
    max_tfidf_features=1000,
    char_ngram_range=(3, 5),
    word_ngram_range=(1, 3),
)

# Report the shape, then preview the top-left 5x10 corner of the matrix.
print(f"Test features shape: {all_features_test.shape}")
print("Sample of test features (first 5 rows, first 10 columns):")
print(all_features_test.iloc[:5, :10])
print()

Generating features for test set...
Test features shape: (22760, 2025)
Sample of test features (first 5 rows, first 10 columns):
        is_none_brand_name  is_none_description  is_none_name_rus  \
id                                                                  
17384                    1                    0                 0   
260316                   0                    0                 0   
10610                    0                    0                 0   
205236                   0                    0                 0   
308655                   0                    0                 0   

        is_none_CommercialTypeName4  has_url  has_phone  has_messenger  \
id                                                                       
17384                             0      0.0        0.0            0.0   
260316                            0      0.0        0.0            0.0   
10610                             0      0.0        0.0            0.0   
205236           