In [None]:
import pandas as pd

In [None]:
# Load the train and test tables from CSV files under data/.
# NOTE(review): the first letter of "сounterfeit" in these file names looks like a
# Cyrillic "с" rather than a Latin "c" — confirm against the files on disk before
# retyping the paths by hand.
train = pd.read_csv("data/ml_ozon_сounterfeit_train.csv")
test = pd.read_csv("data/ml_ozon_сounterfeit_test.csv")

# Utility functions

In [None]:
def count_empty_nan_emptystr(df, col):
    """Count missing and empty-string values in one column of a DataFrame.

    Args:
        df: DataFrame to inspect.
        col: Label of the column to check.

    Returns:
        A dict with three keys: ``"NaN"`` (number of null entries in
        ``df[col]``), ``"empty_string"`` (number of entries equal to ``""``)
        and ``"total_rows"`` (``len(df)``).
    """
    values = df[col]
    nan_count = values.isna().sum()
    empty_count = values.eq("").sum()
    return {"NaN": nan_count, "empty_string": empty_count, "total_rows": len(df)}

# Add image presence flag

In [None]:
# Read the image-presence tables, keeping only the join key (ItemID) and the
# has_image flag, with the flag cast to int.
img_info_df_train = (
    pd.read_csv("data/no_image_products.csv")[["ItemID", "has_image"]]
    .astype({"has_image": int})
)

img_info_df_test = (
    pd.read_csv("data/no_image_products_test.csv")[["ItemID", "has_image"]]
    .astype({"has_image": int})
)

# Display the number of rows in each table.
img_info_df_train.shape[0], img_info_df_test.shape[0]

(738, 25)

In [None]:
# Attach the image-presence flag to train and test by ItemID.
# Rows with no match in the image-info table get NaN from the left join and are
# then filled with 1.
# Any has_image column left over from a previous run of this cell is dropped first;
# otherwise a re-run would produce has_image_x / has_image_y and the fill below
# would fail on the missing has_image column.
train = train.drop(columns="has_image", errors="ignore").merge(
    img_info_df_train, on="ItemID", how="left"
)
train["has_image"] = train["has_image"].fillna(1).astype(int)

test = test.drop(columns="has_image", errors="ignore").merge(
    img_info_df_test, on="ItemID", how="left"
)
test["has_image"] = test["has_image"].fillna(1).astype(int)

# Look at columns

In [None]:
# Display the number of rows in train and test.
train.shape[0], test.shape[0]

(197198, 22760)

In [None]:
# Compare the column sets of the two tables in both directions.
train_cols, test_cols = set(train.columns), set(test.columns)
print("Columns in train but not in test:", train_cols.difference(test_cols))
print("Columns in test but not in train:", test_cols.difference(train_cols))

Columns in train but not in test: {'resolution'}
Columns in test but not in train: set()


In [None]:
# Display the column labels of train.
train.columns

Index(['id', 'resolution', 'brand_name', 'description', 'name_rus',
       'CommercialTypeName4', 'rating_1_count', 'rating_2_count',
       'rating_3_count', 'rating_4_count', 'rating_5_count',
       'comments_published_count', 'photos_published_count',
       'videos_published_count', 'PriceDiscounted', 'item_time_alive',
       'item_count_fake_returns7', 'item_count_fake_returns30',
       'item_count_fake_returns90', 'item_count_sales7', 'item_count_sales30',
       'item_count_sales90', 'item_count_returns7', 'item_count_returns30',
       'item_count_returns90', 'GmvTotal7', 'GmvTotal30', 'GmvTotal90',
       'ExemplarAcceptedCountTotal7', 'ExemplarAcceptedCountTotal30',
       'ExemplarAcceptedCountTotal90', 'OrderAcceptedCountTotal7',
       'OrderAcceptedCountTotal30', 'OrderAcceptedCountTotal90',
       'ExemplarReturnedCountTotal7', 'ExemplarReturnedCountTotal30',
       'ExemplarReturnedCountTotal90', 'ExemplarReturnedValueTotal7',
       'ExemplarReturnedValueTotal30', '

In [None]:
# Print the dtype of every column in train.
print(train.dtypes)

id                                int64
resolution                        int64
brand_name                       object
description                      object
name_rus                         object
CommercialTypeName4              object
rating_1_count                  float64
rating_2_count                  float64
rating_3_count                  float64
rating_4_count                  float64
rating_5_count                  float64
comments_published_count        float64
photos_published_count          float64
videos_published_count          float64
PriceDiscounted                 float64
item_time_alive                   int64
item_count_fake_returns7          int64
item_count_fake_returns30         int64
item_count_fake_returns90         int64
item_count_sales7                 int64
item_count_sales30                int64
item_count_sales90                int64
item_count_returns7               int64
item_count_returns30              int64
item_count_returns90              int64


In [None]:
# Report whether each candidate key column is unique within its table.
uniqueness_checks = [
    ("Is 'id' unique in train?", train["id"]),
    ("Is 'ItemID' unique in train?", train["ItemID"]),
    ("Is 'id' unique in test?", test["id"]),
    ("Is 'ItemID' unique in test?", test["ItemID"]),
]
for question, key_series in uniqueness_checks:
    print(question, key_series.is_unique)

Is 'id' unique in train? True
Is 'ItemID' unique in train? True
Is 'id' unique in test? True
Is 'ItemID' unique in test? True


In [None]:
# Group the dataset columns by role: target, ids, boolean flags, text,
# categorical and numeric.
target_col = "resolution"
id_cols = [
    "id",  # can be dropped
    "ItemID",
]
boolean_cols = [  # these are newly added columns
    "has_image",
    "has_description",
    "has_brand_name",
]
text_cols = [  # compute features from the text columns + tf-idf + possibly concatenate them (together with brand_name, CommercialTypeName4) and build an embedding
    "description",  # may be NaN, empty, etc.; a flag was added
    "name_rus",  # always present and non-empty in both train and test
]
categorial_cols = [
    "brand_name",  # may be NaN, empty, etc.; a flag was added. Not sure it is categorical, but probably yes. A "rare" category could be added
    "CommercialTypeName4",  # definitely categorical. A "rare" category should be added
    "SellerID",  # categorical? unclear how to use it. Treating it as numeric is undesirable (there is no ordering relation)
]
numeric_cols = [  # numeric features; NaN values are replaced with -1
    "rating_1_count",
    "rating_2_count",
    "rating_3_count",
    "rating_4_count",
    "rating_5_count",
    "comments_published_count",
    "photos_published_count",
    "videos_published_count",
    "PriceDiscounted",
    "item_time_alive",
    "item_count_fake_returns7",
    "item_count_fake_returns30",
    "item_count_fake_returns90",
    "item_count_sales7",
    "item_count_sales30",
    "item_count_sales90",
    "item_count_returns7",
    "item_count_returns30",
    "item_count_returns90",
    "GmvTotal7",
    "GmvTotal30",
    "GmvTotal90",
    "ExemplarAcceptedCountTotal7",
    "ExemplarAcceptedCountTotal30",
    "ExemplarAcceptedCountTotal90",
    "OrderAcceptedCountTotal7",
    "OrderAcceptedCountTotal30",
    "OrderAcceptedCountTotal90",
    "ExemplarReturnedCountTotal7",
    "ExemplarReturnedCountTotal30",
    "ExemplarReturnedCountTotal90",
    "ExemplarReturnedValueTotal7",
    "ExemplarReturnedValueTotal30",
    "ExemplarReturnedValueTotal90",
    "ItemVarietyCount",
    "ItemAvailableCount",
    "seller_time_alive",
]

# Check that every column of train (except the target) belongs to one of the
# column lists. boolean_cols is part of the union so that flag columns such as
# has_image are not reported as unassigned.
all_feature_cols = set(id_cols + boolean_cols + numeric_cols + text_cols + categorial_cols)
missing_cols = set(train.columns) - all_feature_cols - {target_col}
print("Columns in train not included in any feature list:", missing_cols)

Columns in train not included in any feature list: {'has_image'}


In [None]:
# Print NaN / empty-string counts for name_rus in both tables, then a blank
# line, then the same counts for CommercialTypeName4.
for label, frame in (("train 'name_rus':", train), ("test 'name_rus':", test)):
    print(label, count_empty_nan_emptystr(frame, "name_rus"))
print()
for label, frame in (
    ("train 'CommercialTypeName4':", train),
    ("test 'CommercialTypeName4':", test),
):
    print(label, count_empty_nan_emptystr(frame, "CommercialTypeName4"))

train 'name_rus': {'NaN': np.int64(0), 'empty_string': np.int64(0), 'total_rows': 197198}
test 'name_rus': {'NaN': np.int64(0), 'empty_string': np.int64(0), 'total_rows': 22760}

train 'CommercialTypeName4': {'NaN': np.int64(0), 'empty_string': np.int64(0), 'total_rows': 197198}
test 'CommercialTypeName4': {'NaN': np.int64(0), 'empty_string': np.int64(0), 'total_rows': 22760}


In [None]:
# Print the number of null values in each column of train.
print(train.isna().sum())

id                                   0
resolution                           0
brand_name                       80531
description                      26060
name_rus                             0
CommercialTypeName4                  0
rating_1_count                  150005
rating_2_count                  150005
rating_3_count                  150005
rating_4_count                  150005
rating_5_count                  150005
comments_published_count        150005
photos_published_count          150005
videos_published_count          150005
PriceDiscounted                      0
item_time_alive                      0
item_count_fake_returns7             0
item_count_fake_returns30            0
item_count_fake_returns90            0
item_count_sales7                    0
item_count_sales30                   0
item_count_sales90                   0
item_count_returns7                  0
item_count_returns30                 0
item_count_returns90                 0
GmvTotal7                

In [None]:
# Print the number of null values in each column of test.
print(test.isna().sum())

id                                  0
brand_name                       5832
description                      2224
name_rus                            0
CommercialTypeName4                 0
rating_1_count                  21053
rating_2_count                  21053
rating_3_count                  21053
rating_4_count                  21053
rating_5_count                  21053
comments_published_count        21053
photos_published_count          21053
videos_published_count          21053
PriceDiscounted                     3
item_time_alive                     0
item_count_fake_returns7            0
item_count_fake_returns30           0
item_count_fake_returns90           0
item_count_sales7                   0
item_count_sales30                  0
item_count_sales90                  0
item_count_returns7                 0
item_count_returns30                0
item_count_returns90                0
GmvTotal7                        1381
GmvTotal30                        913
GmvTotal90  