## Train baseline

In [51]:
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from product_classification.data_processing.text_processing import EStemTag, Lemmatizer
import pickle

In [52]:
path = "../data"
with open(f"{path}/datasets.pkl", "rb") as handle:
    simple_datasets = pickle.load(handle)

### Categorical features transformation

In [9]:
categorical_features = ["merchant_name", "brand_name"]
categorical_transformer = OneHotEncoder(drop="first" ,handle_unknown="ignore", dtype=int)

### Numerical features transformation

In [10]:
numeric_features = ["price"]
numeric_transformer = StandardScaler()

### Text transformation

In [11]:
text_features = "product_name"

stem_tag = EStemTag.STEMMER
lemma = Lemmatizer(stem=stem_tag)

text_transformer = Pipeline(
    [(stem_tag.value, lemma),
    ("tfidf" ,TfidfVectorizer())]
)

### Processor

In [12]:
preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
            ("text", text_transformer, text_features)
                     ]
       )

### Classifier

In [42]:
classifier = LGBMClassifier(objective="binary", random_state=42, silent=False, metric="binary_logloss")

In [43]:
multilabel_classifier = MultiOutputClassifier(classifier, n_jobs=-1)

### Pipeline

In [44]:
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", multilabel_classifier)]
)

### train

In [45]:
X_train, y_train = simple_datasets.training.iloc[:,1:5], simple_datasets.training.iloc[:,5:]

In [46]:
clf.fit(X_train, y_train, classifier__verbose=True)



[LightGBM] [Info] Number of positive: 781, number of negative: 129638
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 255715
[LightGBM] [Info] Number of data points in the train set: 130419, number of used features: 5959
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.005988 -> initscore=-5.111926
[LightGBM] [Info] Start training from score -5.111926
[LightGBM] [Info] Number of positive: 8511, number of negative: 121908
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255715
[LightGBM] [Info] Number of data points in the train set: 130419, number of used features: 5959
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.065259 -> initscore=-2.661907
[LightGBM] [Info] Start training from score -2.661907
[LightGBM] [Info] Number of positive: 8503, number of negative: 121916
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] 



### Eval

In [47]:
X_test, y_test = simple_datasets.test.iloc[:,1:5], simple_datasets.test.iloc[:,5:]

In [48]:
y_pred = clf.predict(X_test)



In [49]:
from sklearn.metrics import hamming_loss
from sklearn.metrics import classification_report

In [50]:
print(classification_report(y_test, y_pred, target_names=list(y_test.columns)))

                                    precision    recall  f1-score   support

                        animalerie       0.95      0.84      0.89       691
                      auto et moto       0.84      0.59      0.70       350
                   bagages et sacs       0.86      0.70      0.77       290
                  beaute et parfum       0.94      0.90      0.92       757
              bebe et puericulture       0.92      0.64      0.75       399
                            bijoux       0.96      0.88      0.92       361
                         bricolage       0.85      0.61      0.71       753
                     cd et vinyles       0.58      0.37      0.45        30
         chaussures et accessoires       0.94      0.87      0.90       703
    commerce, industrie et science       0.28      0.24      0.26        29
                 cuisine et maison       0.87      0.57      0.69       758
                    dvd et blu-ray       0.98      0.93      0.96       103
           

  _warn_prf(average, modifier, msg_start, len(result))


[LightGBM] [Info] Number of positive: 8515, number of negative: 121904
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255715
[LightGBM] [Info] Number of data points in the train set: 130419, number of used features: 5959
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.065290 -> initscore=-2.661405
[LightGBM] [Info] Start training from score -2.661405
[LightGBM] [Info] Number of positive: 8512, number of negative: 121907
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255715
[LightGBM] [Info] Number of data points in the train set: 130419, number of used features: 5959
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.065267 -> initscore=-2.661782
[LightGBM] [Info] Start training from score -2.661782
[LightGBM] [Info] Number of positive: 3581, number of negative: 126838
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255715
[LightGBM] [Info] Number of data points in th