# Buscalibre Numeric Model

The preprocessed dataset has two types of data: numeric and text. Here, we try to build an estimator using only the numeric part.

## Libraries and Data

Import the necessary packages.

In [1]:
!pip install catboost
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/5a/41/24e14322b9986cf72a8763e0a0a69cc256cf963cf9502c8f0044a62c1ae8/catboost-0.26-cp37-none-manylinux1_x86_64.whl (69.2MB)
[K     |████████████████████████████████| 69.2MB 41kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26


In [2]:
# Colab-only step: ask the user to upload the training CSV through the browser.
# The result is indexed by file name further down and wrapped in io.BytesIO.
import io
from google.colab import files

uploaded_train = files.upload()

Saving train_preprocessed_2.csv to train_preprocessed_2.csv


Upload the training dataset and shuffle it, because the rows in the file come grouped into sections.

In [3]:
# Fixed seed so the row shuffle below is the same on every Restart & Run All.
SEED = 42

# Read the uploaded CSV from the in-memory bytes returned by the upload step.
train = pd.read_csv(io.BytesIO(uploaded_train["train_preprocessed_2.csv"]))
# Shuffle all rows (frac=1) and rebuild a clean 0..n-1 index.
train = train.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Features: every column except the identifier, the text columns and the target.
X = train.drop(columns=["isbn", "review", "topic", "review_cleaned"])
# Target: the topic label of each row.
y = train["topic"]

Then define the cross-validated scoring function.

In [4]:
def cross_score(model, k=10, random_state=None):
    """Return the mean stratified k-fold cross-validation score of ``model``.

    The model is evaluated on the notebook-level ``X`` and ``y``. No
    ``scoring`` argument is passed to ``cross_val_score``, so scikit-learn
    falls back to the estimator's default scorer.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible estimator.
    k : int, default=10
        Number of stratified folds.
    random_state : int or None, default=None
        Seed for the fold shuffling. ``None`` keeps the previous behaviour
        (a different split on every call); pass an int for repeatable scores.

    Returns
    -------
    float
        Mean of the ``k`` per-fold scores.
    """
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)
    scores = cross_val_score(model, X, y, cv=kf)
    return np.mean(scores)

### Common Label

As we saw in the Data Exploration Analysis, the topic "grandes-descuentos" has the greatest number of samples. So a starting prediction is to assume every sample belongs to it.

We get a score of:

In [5]:
# Baseline: assign the same topic to every sample and measure the accuracy.
y_pred = ["grandes-descuentos"] * len(y)
print(f"Predicting with the common label has an accuracy of: {accuracy_score(y, y_pred):.2%}.")

Predicting with the common label has an accuracy of: 41.79%.


## Base Models

We are going to use various simple predictors provided by Scikit Learn and Tensorflow. 

The hyperparameters were chosen with the Optuna package, running on Google servers for about 5-10 hours per model.

Random Forest.

In [6]:
# One-vs-rest random forest with the tuned hyperparameters.
ovr_rfc = OneVsRestClassifier(
    RandomForestClassifier(
        n_estimators=334,
        criterion="entropy",
        max_depth=42,
        min_samples_split=9,
        min_samples_leaf=4,
        max_features=0.45448755763486154,
    )
)

# Default 10-fold stratified cross-validation.
acc = cross_score(model=ovr_rfc)

print(f"Random Forest Classifier has an accuracy of {acc:.2%}")

Random Forest Classifier has an accuracy of 68.04%


Logistic Regression.

In [7]:
# Multinomial logistic regression; features go through StandardScaler and
# then MinMaxScaler before reaching the classifier.
logreg = make_pipeline(
    StandardScaler(),
    MinMaxScaler(),
    LogisticRegression(**{
        "C": 8.261486231908338,
        "tol": 0.8728213920467933,
        "intercept_scaling": 9.117615728181427,
        "multi_class": "multinomial",
        "max_iter": 10_000,
    }),
)

# Default 10-fold stratified cross-validation.
acc = cross_score(model=logreg)

print(f"Logistic Regression has an accuracy of {acc:.2%}")

Logistic Regression has an accuracy of 66.24%


XGBoost.

In [8]:
# One-vs-rest XGBoost with the tuned hyperparameters; logging is silenced.
ovr_xgb = OneVsRestClassifier(
    XGBClassifier(
        n_estimators=378,
        learning_rate=0.02950073992817461,
        base_score=0.9187179242725662,
        verbosity=0,
        use_label_encoder=False,
    )
)

# Default 10-fold stratified cross-validation.
acc = cross_score(model=ovr_xgb)

print(f"XGBoost Classifier has an accuracy of {acc:.2%}")

XGBoost Classifier has an accuracy of 68.12%


LightGBM.

In [9]:
# LightGBM classifier used directly (no one-vs-rest wrapper).
lgb = LGBMClassifier(
    num_leaves=27,
    n_estimators=268,
    learning_rate=0.018813923117324143,
)

# Default 10-fold stratified cross-validation.
acc = cross_score(model=lgb)

print(f"Light GBoost has an accuracy of {acc:.2%}")

Light GBoost has an accuracy of 66.73%


CatBoost.

In [10]:
# One-vs-rest CatBoost with the tuned hyperparameters; training output is off.
# NOTE(review): loss_function="MultiClass" is used inside a one-vs-rest wrapper,
# so each per-class sub-model is fit with a multiclass objective - confirm
# that this is intended.
ovr_cat = OneVsRestClassifier(
    CatBoostClassifier(
        iterations=330,
        learning_rate=0.043379595491767745,
        depth=7,
        l2_leaf_reg=0.5416613355579589,
        border_count=212,
        loss_function="MultiClass",
        verbose=False,
    )
)

# Default 10-fold stratified cross-validation.
acc = cross_score(model=ovr_cat)

print(f"Cat Boost Classifier has an accuracy of {acc:.2%}")

Cat Boost Classifier has an accuracy of 67.71%


Feed-forward Network.

In [11]:
class NetworkClassifier(BaseEstimator, ClassifierMixin):
    """Scikit-learn style wrapper around a one-hidden-layer Keras softmax network.

    Parameters
    ----------
    ini_neurons : int, default=60
        Number of units in the single hidden ReLU layer.
    optimizer : str or Keras optimizer, default="adam"
        Forwarded to ``Sequential.compile``.
    epochs : int, default=200
        Forwarded to ``Sequential.fit``.
    validation_split : float, default=0.3
        Forwarded to ``Sequential.fit``.

    Attributes
    ----------
    model : Sequential or None
        The Keras network. It is ``None`` until ``fit`` is called and is
        rebuilt from scratch on every ``fit``.
    cols : pandas.Index
        Class labels, in the column order of the one-hot encoded target.
    classes_ : numpy.ndarray
        Same labels as ``cols``, exposed under the scikit-learn name.
    """

    def __init__(self, ini_neurons=60, optimizer="adam", epochs=200, validation_split=0.3):
        self.ini_neurons = ini_neurons
        self.optimizer = optimizer
        self.epochs = epochs
        self.validation_split = validation_split
        # The network is built in fit(); building it here would let repeated
        # fit() calls keep appending layers to the same Sequential object.
        self.model = None

    def fit(self, X, y):
        """Build a fresh network and train it on ``X`` / ``y``.

        Parameters
        ----------
        X : DataFrame or array-like of shape (n_samples, n_features)
            Numeric feature matrix.
        y : array-like of shape (n_samples,)
            Class labels; they are one-hot encoded internally.

        Returns
        -------
        self
        """
        y_enc = pd.get_dummies(y)
        self.cols = y_enc.columns
        self.classes_ = self.cols.to_numpy()
        # Explicit float dtype: get_dummies may return booleans.
        y_np = y_enc.to_numpy(dtype="float32")
        # np.asarray accepts both DataFrames and plain arrays.
        X_np = np.asarray(X)

        # One output unit per class actually present in y, instead of a
        # hard-coded count.
        n_classes = y_np.shape[1]

        self.model = Sequential()
        self.model.add(Dense(self.ini_neurons, input_shape=(X_np.shape[1], ), activation="relu"))
        self.model.add(Dense(n_classes, activation="softmax"))
        self.model.compile(
            optimizer=self.optimizer, loss="categorical_crossentropy", metrics=["accuracy"]
        )
        self.model.fit(
            X_np, y_np, epochs=self.epochs, validation_split=self.validation_split, verbose=0
        )
        return self

    def predict(self, X):
        """Return the most probable class label for every row of ``X``.

        Returns
        -------
        pandas.Series
            Predicted labels, indexed 0..n_samples-1.
        """
        X_np = np.asarray(X)
        y_hat = self.model.predict(X_np, verbose=0)
        # Attach the class labels to the probability columns, then take the
        # label with the highest probability on each row.
        y_df = pd.DataFrame(data=y_hat, columns=self.cols)
        y_pred = y_df.idxmax(axis=1)
        return y_pred

# Network with its default settings, scored with 6 folds instead of the
# default 10.
nc = NetworkClassifier()

acc = cross_score(model=nc, k=6)

print(f"Network Classifier has an accuracy of {acc:.2%}")

Network Classifier has an accuracy of 65.75%


# Weighted Voting Model

One simple way to combine and (possibly) improve the predictions is to build a voting classifier. Each estimator makes its own prediction, and we keep the most-voted label. One can also put a weight on each estimator, to favor it over the others and get an overall improved prediction.

Again, the weights were chosen using Optuna searching functions.

Define and call the Weighted Averaging Estimator class.

In [12]:
class WeightedAveragingEstimator(BaseEstimator, ClassifierMixin):
    """Weighted hard-voting ensemble over a collection of classifiers.

    Every fitted model votes for one label per sample; each vote counts with
    the model's weight and the label with the largest total wins.

    Parameters
    ----------
    models : sequence of estimators
        Unfitted classifiers. They are cloned in ``fit``, so the originals
        are never modified.
    weights : sequence of float or None, default=None
        One weight per model, in the same order as ``models``. ``None``
        gives every model the same weight.

    Attributes
    ----------
    models_ : list of estimators
        Fitted clones of ``models``.
    cols : pandas.Index
        Class labels seen in ``fit``.
    """

    def __init__(self, models, weights=None):
        self.models = models
        self.weights = weights

    def fit(self, X, y):
        """Fit a clone of every model on ``X`` / ``y``.

        Raises
        ------
        ValueError
            If ``weights`` is given but has fewer entries than ``models``.
        """
        # Fail before the (expensive) training instead of at predict time.
        if self.weights is not None and len(self.weights) < len(self.models):
            raise ValueError(
                f"Got {len(self.weights)} weights for {len(self.models)} models."
            )

        self.cols = pd.get_dummies(y).columns

        self.models_ = [clone(x) for x in self.models]

        for model in self.models_:
            model.fit(X, y)

        return self

    def predict(self, X):
        """Return the label with the highest weighted vote for every row of ``X``.

        Returns
        -------
        pandas.Series
            Predicted labels, indexed 0..n_samples-1.
        """
        if self.weights is None:
            weights = [1.0] * len(self.models_)
        else:
            weights = self.weights

        # Start from an explicit all-zero table so that classes nobody voted
        # for count as 0 rather than NaN.
        votes = pd.DataFrame(0.0, index=pd.RangeIndex(len(X)), columns=self.cols)
        for weight, model in zip(weights, self.models_):
            # Flatten to a plain 1-D array so every model's votes line up by
            # position, whatever index the prediction carried.
            y_pred_ = np.asarray(model.predict(X)).ravel()
            y_hat = weight * pd.get_dummies(y_pred_, dtype=float)
            # A model may never predict some classes; give those columns 0.
            votes = votes.add(y_hat.reindex(columns=self.cols, fill_value=0.0))

        y_pred = votes.idxmax(axis=1)
        return y_pred

# Ensemble members, in the same order as the entries of `weights` below.
models = (
    # Random forest, one-vs-rest.
    OneVsRestClassifier(
        RandomForestClassifier(
            n_estimators=334,
            criterion="entropy",
            max_depth=42,
            min_samples_split=9,
            min_samples_leaf=4,
            max_features=0.45448755763486154,
        )
    ),
    # Scaled multinomial logistic regression.
    make_pipeline(
        StandardScaler(),
        MinMaxScaler(),
        LogisticRegression(**{
            "C": 8.261486231908338,
            "tol": 0.8728213920467933,
            "intercept_scaling": 9.117615728181427,
            "multi_class": "multinomial",
            "max_iter": 10_000,
        }),
    ),
    # XGBoost, one-vs-rest.
    OneVsRestClassifier(
        XGBClassifier(
            n_estimators=378,
            learning_rate=0.02950073992817461,
            base_score=0.9187179242725662,
            verbosity=0,
            use_label_encoder=False,
        )
    ),
    # LightGBM.
    LGBMClassifier(
        num_leaves=27,
        n_estimators=268,
        learning_rate=0.018813923117324143,
    ),
    # CatBoost, one-vs-rest.
    OneVsRestClassifier(
        CatBoostClassifier(
            iterations=330,
            learning_rate=0.043379595491767745,
            depth=7,
            l2_leaf_reg=0.5416613355579589,
            border_count=212,
            loss_function="MultiClass",
            verbose=False,
        )
    ),
    # Keras network with default settings.
    NetworkClassifier(),
)
# One voting weight per entry of `models`.
weights = [0.599, 0.781, 0.76, 0.425, 0.678, 0.449]

wae = WeightedAveragingEstimator(weights=weights, models=models)

WAE gets a score of:

In [13]:
# Score the weighted ensemble with 6-fold stratified cross-validation.
acc = cross_score(model=wae, k=6)

print(f"Weighted Averaging Estimator has an accuracy of {acc:.2%}")

Weighted Averaging Estimator has an accuracy of 68.42%
