In [1]:
import os
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from product_classification.data_processing.train_test_val_split import IterativeSplit, SimpleSplit 
from product_classification.data_processing.create_dataset import clean_dataset

In [2]:
# Directory holding the CSV inputs, relative to the notebook's working directory.
path = "../data"

# Load joined_tables.csv into a DataFrame; every later cell derives from it.
joined_tables = pd.read_csv(filepath_or_buffer=f"{path}/joined_tables.csv")

In [3]:
# Clean the raw table with the project helper. According to the log lines in
# this cell's output, the helper removes duplicates and reports the resulting
# dataset shape.
# NOTE(review): the SettingWithCopyWarning messages in the output point at
# `.astype("category")` column assignments, which appear to live inside
# clean_dataset rather than in this cell -- confirm and fix there.
dataset = clean_dataset(joined_tables)

2022-09-10 16:14:44,015 :: create_dataset/create_dataset.py/clean_dataset :: INFO :: Duplicates removed
2022-09-10 16:14:44,016 :: create_dataset/create_dataset.py/clean_dataset :: INFO :: Dataset shape: (462695, 7)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["merchant_name"] = dataset["merchant_name"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["brand_name"] = dataset["brand_name"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value ins

## Create train, test and validation sets

In [4]:
# Tunable split settings, shared by both splitting strategies below.
min_category_count = 300  # passed to the splitters as min_categories_threshold
max_category_count = 10_000  # passed to the splitters as max_categories_threshold
split_size = 0.15
random_state = 42

# Both splitters take the same thresholds, so build the keyword arguments once.
category_thresholds = {
    "min_categories_threshold": min_category_count,
    "max_categories_threshold": max_category_count,
}
iter_split = IterativeSplit(**category_thresholds)
simple_split = SimpleSplit(**category_thresholds)

In [7]:
# Multi-label split; the log output reports that this path uses
# iterative_train_test_split after joining the categories of each product.
# NOTE(review): no random_state is passed here, unlike the simple split --
# confirm whether IterativeSplit.execute supports seeding.
iter_datasets = iter_split.execute(
    dataset=dataset,
    split_size=split_size,
)

2022-09-10 16:15:57,886 :: train_test_val_split/train_test_val_split.py/execute :: INFO :: Split dataset into train, test, val using iterative_train_test_split
2022-09-10 16:15:57,887 :: train_test_val_split/train_test_val_split.py/_multilabel_transformation :: INFO :: Join categories for each product
2022-09-10 16:16:33,821 :: train_test_val_split/train_test_val_split.py/_filter_categories :: INFO :: Apply filter to remove categories with few examples
2022-09-10 16:16:33,971 :: train_test_val_split/train_test_val_split.py/_filter_categories :: INFO :: Removed categories: ['animalerie', 'bebe et puericulture', 'epicerie'],['animalerie', 'bebe et puericulture'],['animalerie', 'bricolage'],['animalerie', 'cuisine et maison'],['animalerie', 'epicerie', 'high-tech'],['animalerie', 'epicerie'],['animalerie', 'high-tech'],['animalerie', 'hygiene et sante'],['animalerie', 'jardin'],['animalerie', 'jeux et jouets'],['animalerie', 'livres'],['applis et jeux', 'cd et vinyles', 'chaussures et acc

In [6]:
# Seeded split; the log output reports train_test_split with stratification,
# preceded by filtering of rare categories and downsampling of large ones.
simple_datasets = simple_split.execute(
    dataset=dataset,
    split_size=split_size,
    random_state=random_state,
)

2022-09-10 16:14:45,851 :: train_test_val_split/train_test_val_split.py/execute :: INFO :: Split dataset into train, test, val using train_test_split with stratification
2022-09-10 16:14:45,852 :: train_test_val_split/train_test_val_split.py/_filter_categories :: INFO :: Apply filter to remove categories with few examples
2022-09-10 16:14:45,885 :: train_test_val_split/train_test_val_split.py/_filter_categories :: INFO :: Removed categories: applis et jeux,boutique cheques-cadeaux,cigarettes,garantie,livraison,logiciels,telechargement de musique,uncategorized
2022-09-10 16:14:45,886 :: train_test_val_split/train_test_val_split.py/_filter_categories :: INFO :: Dataset shape: (357320, 7)
2022-09-10 16:14:45,887 :: train_test_val_split/train_test_val_split.py/_downsampling_categories :: INFO :: Downsampling categories with too much examples
2022-09-10 16:14:45,917 :: train_test_val_split/train_test_val_split.py/_downsampling_categories :: INFO :: Dataset shape: (153681, 7)
2022-09-10 16:1

## Train baseline

In [7]:
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from product_classification.data_processing.text_processing import EStemTag, Lemmatizer

In [10]:
# Remove the free-text description from every split of simple_datasets; the
# text pipeline defined further down only consumes "product_name".
# iter_datasets is not processed here: the equivalent loop was disabled and the
# baseline cells train and evaluate on simple_datasets only.
#
# list(...) snapshots the (name, frame) pairs before any attribute is
# reassigned, and errors="ignore" turns a re-run of this cell into a no-op
# instead of a KeyError on the already-dropped column.
for fname, dataframe in list(simple_datasets):
    setattr(
        simple_datasets,
        fname,
        dataframe.drop(columns=["product_description"], errors="ignore"),
    )

### Categorical features transformation

In [11]:
# Columns that are one-hot encoded.
categorical_features = ["merchant_name", "brand_name"]

# NOTE(review): with drop="first" an unknown category is encoded as all zeros,
# the same vector as the dropped first category -- confirm this is acceptable.
categorical_transformer = OneHotEncoder(
    drop="first",
    handle_unknown="ignore",
    dtype=int,
)

### Numerical features transformation

In [12]:
# Numeric columns and the scaler applied to them by the ColumnTransformer below.
numeric_features = ["price"]
numeric_transformer = StandardScaler()

### Text transformation

In [13]:
# A scalar column name (not a list) so that ColumnTransformer hands the text
# pipeline a 1-D column, which is what TfidfVectorizer expects as input.
text_features = "product_name"

# Project text normaliser, configured through the EStemTag enum.
stem_tag = EStemTag.STEMMER
lemma = Lemmatizer(stem=stem_tag)

# Normalise the product name first, then vectorise it with TF-IDF.
# The first step is named after the enum member's value.
text_transformer = Pipeline(
    steps=[
        (stem_tag.value, lemma),
        ("tfidf", TfidfVectorizer()),
    ]
)

### Preprocessor

In [14]:
# Route each feature group to its own transformer; the transformed outputs are
# concatenated into a single feature matrix for the classifier.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
        ("text", text_transformer, text_features),
    ]
)

### Classifier

In [15]:
# Binary LightGBM base learner; it is wrapped per label column by the
# MultiOutputClassifier defined in the next cell.
# The seed reuses the notebook-level `random_state` constant (also used for the
# split) instead of repeating the literal 42, so both are changed in one place.
# NOTE(review): `silent` is deprecated in recent LightGBM releases in favour of
# `verbose`; left unchanged here -- confirm against the pinned version.
classifier = LGBMClassifier(
    objective="binary",
    random_state=random_state,
    silent=False,
    metric="binary_logloss",
)

In [16]:
# Fit one copy of the base classifier per label column, using all CPU cores.
multilabel_classifier = MultiOutputClassifier(
    estimator=classifier,
    n_jobs=-1,
)

### Pipeline

In [17]:
# End-to-end model: feature preprocessing followed by the multi-label classifier.
# The step names are used later to address fit parameters (e.g. classifier__...).
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", multilabel_classifier),
    ]
)

### Train

In [18]:
# Positional slicing of the training split: columns 1-4 are the model inputs,
# columns 5 onward are the label columns.
# NOTE(review): column 0 is skipped -- presumably an identifier; confirm.
training_frame = simple_datasets.training
X_train = training_frame.iloc[:, 1:5]
y_train = training_frame.iloc[:, 5:]

In [20]:
# Fit the preprocessing steps and the multi-label classifier. The
# "classifier__" prefix routes `verbose` to the fit() of the "classifier" step.
#
# Disabled experiment kept for reference (extra fit kwargs):
#   classifier__feature_name=["price", "merchant_name", "brand_name"]
#       + list(clf["preprocessor"].named_transformers_["text"].get_feature_names_out()),
#   classifier__categorical_feature=["merchant_name", "brand_name"]
clf.fit(
    X_train,
    y_train,
    classifier__verbose=True,
)



### Eval

In [21]:
# Same positional layout as the training slice: columns 1-4 are the inputs,
# columns 5 onward are the label columns.
test_frame = simple_datasets.test
X_test = test_frame.iloc[:, 1:5]
y_test = test_frame.iloc[:, 5:]

In [22]:
# Run the fitted pipeline (preprocessing + per-label classifiers) on the
# held-out test features; the result is compared against y_test below.
y_pred = clf.predict(X_test)



In [23]:
from sklearn.metrics import hamming_loss
from sklearn.metrics import classification_report

In [24]:
# Per-label precision / recall / F1 on the test split, labelled with the
# label column names.
# zero_division=0 reports undefined metrics (e.g. a label that is never
# predicted) as 0 explicitly. This matches the default's numeric result but
# avoids the `_warn_prf` undefined-metric warning seen in the previous output.
print(
    classification_report(
        y_test,
        y_pred,
        target_names=list(y_test.columns),
        zero_division=0,
    )
)

                                    precision    recall  f1-score   support

                        animalerie       0.96      0.85      0.90       692
                      auto et moto       0.84      0.62      0.71       348
                   bagages et sacs       0.88      0.70      0.78       291
                  beaute et parfum       0.94      0.87      0.90       754
              bebe et puericulture       0.91      0.64      0.75       398
                            bijoux       0.94      0.89      0.91       361
                         bricolage       0.86      0.63      0.73       754
                     cd et vinyles       0.44      0.28      0.34        29
         chaussures et accessoires       0.95      0.85      0.89       702
    commerce, industrie et science       0.28      0.24      0.26        29
                 cuisine et maison       0.89      0.60      0.71       754
                    dvd et blu-ray       0.98      0.94      0.96       103
           

  _warn_prf(average, modifier, msg_start, len(result))
