In [2]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.dataset import load_dataset

In [3]:
path = "../data"
joined_tables = pd.read_csv(f"{path}/joined_tables.csv")

In [4]:
joined_tables = joined_tables[joined_tables.category != "uncategorized"]

In [5]:
dataset = joined_tables[["id_product", "category"]].drop_duplicates()

## Create train, test, validation set

In [152]:
# Count and filter categories with few samples
min_category_count = 500
max_category_count = 10_000
dataset_multilabel = dataset.groupby("id_product").category.apply(lambda x: list(sorted(x))).reset_index().rename(columns={"category": "categories"})
dataset_multilabel["categories_str"] = dataset_multilabel.categories.apply(str)
category_count = dataset_multilabel[["id_product", "categories_str"]].groupby("categories_str").count().reset_index().rename(columns={"id_product": "nb_category_product"})
dataset_multilabel = dataset_multilabel.merge(category_count, on="categories_str")
dataset_multilabel_filtered = dataset_multilabel[ (dataset_multilabel.nb_category_product >= min_category_count) & (dataset_multilabel.nb_category_product <= max_category_count)][["id_product", "categories"]].reset_index(drop=True)

In [153]:
# convert categories into label columns
mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(dataset_multilabel_filtered.categories)
dataset_multilabel_filtered = pd.concat([dataset_multilabel_filtered, pd.DataFrame(labels)], axis=1)
dataset_multilabel_filtered.columns = ["id_product", "categories"] + list(mlb.classes_)

In [154]:
from skmultilearn.model_selection import iterative_train_test_split
import numpy as np

In [155]:
train, _, X_others, y_others = iterative_train_test_split(np.asmatrix(dataset_multilabel_filtered[["id_product"]]), np.asmatrix(dataset_multilabel_filtered.drop(columns=["id_product", "categories"])), test_size = 0.15)

In [156]:
train_ids = pd.DataFrame(train, columns=["id_product"])

In [157]:
test, _, validation, _ = iterative_train_test_split(X_others, y_others, test_size = 0.5)

In [158]:
test_ids = pd.DataFrame(test, columns=["id_product"])
validation_ids = pd.DataFrame(validation, columns=["id_product"])

In [280]:
train = joined_tables[joined_tables.id_product.isin(train_ids.id_product.tolist())]
test = joined_tables[joined_tables.id_product.isin(test_ids.id_product.tolist())]
validation = joined_tables[joined_tables.id_product.isin(validation_ids.id_product.tolist())]

## Train baseline

In [281]:
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [282]:
train = train.merge(dataset_multilabel_filtered, on="id_product").drop_duplicates(subset=["id_product"]).drop(columns=["category", "categories", "product_description"])
test = test.merge(dataset_multilabel_filtered, on="id_product").drop_duplicates(subset=["id_product"]).drop(columns=["category", "categories", "product_description"])
validation = validation.merge(dataset_multilabel_filtered, on="id_product").drop_duplicates(subset=["id_product"]).drop(columns=["category", "categories", "product_description"])

### Categorical features transformation

In [283]:
categorical_features = ["merchant_name", "brand_name"]
categorical_transformer = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1, dtype=int)

### Numerical features transformation

In [284]:
numeric_features = ["price"]
numeric_transformer = StandardScaler()

### Text transformation

In [300]:
text_features = "product_name"
text_transformer = TfidfVectorizer()

### Processor

In [301]:
preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
            ("text", text_transformer, text_features)
                     ]
       )

### Classifier

In [302]:
classifier = LGBMClassifier(objective="binary", random_state=42, silent=False, metric="binary_logloss")

In [303]:
multilabel_classifier = MultiOutputClassifier(classifier, n_jobs=-1)

### Pipeline

In [304]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", multilabel_classifier)]
)

### train

In [306]:
X_train, y_train = train.iloc[:,1:5], train.iloc[:,5:]

In [310]:
clf.fit(X_train, y_train, classifier__verbose=True) #classifier__feature_name=["product_name", "price", "merchant_name", "brand_name"], classifier__categorical_feature=["merchant_name", "brand_name"]



### Eval

In [311]:
X_test, y_test = test.iloc[:,1:5], test.iloc[:,5:]

In [312]:
y_pred = clf.predict(X_test)

In [313]:
from sklearn.metrics import hamming_loss
from sklearn.metrics import classification_report

In [314]:
print(classification_report(y_test, y_pred, target_names=list(y_test.columns)))

                                precision    recall  f1-score   support

                    animalerie       0.94      0.83      0.88       688
                  auto et moto       0.93      0.67      0.78       346
               bagages et sacs       0.92      0.79      0.85       289
          bebe et puericulture       0.92      0.75      0.83       392
                        bijoux       0.98      0.93      0.95       360
     chaussures et accessoires       0.97      0.88      0.93       722
                dvd et blu-ray       0.99      0.97      0.98       101
         fournitures de bureau       0.94      0.78      0.85       595
           gros electromenager       0.95      0.92      0.93        59
instruments de musique et sono       0.93      0.70      0.80        79
                        jardin       0.94      0.65      0.77       415
                    jeux video       0.96      0.83      0.89       307
                        livres       0.68      0.74      0.71  

  _warn_prf(average, modifier, msg_start, len(result))
