# KNNRecommender

Fit KNNRecommender and save the resulting Meta-Table and Model

In [None]:
from knn_recommender import KNNRecommender

# _load=False: do not load a previously persisted model (loading is the
# default behaviour); the recommender is fitted here instead.
recommender = KNNRecommender(
    supervised=True,
    _load=False,
)
recommender.fit(
    complete_data_path="./datasets/",
    train_data_path="./data/train_data/",
    test_data_path="./data/test_data/",
)

# Store both artifacts so that later cells can reuse the fitted recommender.
recommender.save_meta_table(
    "./recommender_data/knn_meta_table.h5"
)
recommender.persist_model(
    "./recommender_data/KNNRecommender.joblib"
)

Using KNNRecommender

In [1]:
from knn_recommender import KNNRecommender
import pandas as pd

# Instantiating without arguments loads the fitted model that was persisted
# in the previous step (_load defaults to True).
recommender = KNNRecommender()

# Example dataset: the "class" column is the target, every other column is a feature.
df = pd.read_csv("./datasets/winetype.csv")
y = df["class"].values
X = df.drop(columns=["class"]).values

# recommend() yields a pair of tuples:
#   1. the quantifiers, ordered from best to worst;
#   2. the weight of each quantifier (by its ARR), in the same order.
recommendation = recommender.recommend(X, y)
ranking, weights = recommendation

# Show the three best quantifiers, then the whole ranking and a sanity check
# on the weights.
print(f"Top 1: {ranking[0]} with Weight: {weights[0]}")
print(f"Top 2: {ranking[1]} with Weight: {weights[1]}")
print(f"Top 3: {ranking[2]} with Weight: {weights[2]}")
print(f"Full ranking: {ranking}")
print(f"Sum of weights: {sum(weights)}")

Top 1: DyS with Weight: 0.0912292699822285
Top 2: SORD with Weight: 0.09121615578024786
Top 3: MS with Weight: 0.09118933512419261
Full ranking: ('DyS', 'SORD', 'MS', 'SMM', 'MAX', 'X', 'ACC', 'HDy', 'CC', 'PACC', 'PCC')
Sum of weights: 0.9999999999999998


# RegressionRecommender

Fit RegressionRecommender and save the resulting Meta-Table and Model

In [None]:
from regression_recommender import RegressionRecommender

# _load=False: do not load a previously persisted model (loading is the
# default behaviour); the recommender is fitted here instead.
recommender = RegressionRecommender(
    supervised=True,
    _load=False,
)
recommender.fit(
    complete_data_path="./datasets/",
    train_data_path="./data/train_data/",
    test_data_path="./data/test_data/",
)

# Store both artifacts so that later cells can reuse the fitted recommender.
recommender.save_meta_table(
    "./recommender_data/regression_meta_table.h5"
)
recommender.persist_model(
    "./recommender_data/RegressionRecommender.joblib"
)

Using RegressionRecommender

In [2]:
from regression_recommender import RegressionRecommender
import pandas as pd

# Instantiating without arguments loads the fitted model that was persisted
# in the previous step (_load defaults to True).
recommender = RegressionRecommender()

# Example dataset: the "class" column is the target, every other column is a feature.
df = pd.read_csv("./datasets/winetype.csv")
y = df["class"].values
X = df.drop(columns=["class"]).values

# recommend() yields a pair of tuples:
#   1. the quantifiers, ordered from best to worst;
#   2. the weight of each quantifier (by its MAE), in the same order.
recommendation = recommender.recommend(X, y)
ranking, weights = recommendation

# Show the three best quantifiers, then the whole ranking and a sanity check
# on the weights.
print(f"Top 1: {ranking[0]} with Weight: {weights[0]}")
print(f"Top 2: {ranking[1]} with Weight: {weights[1]}")
print(f"Top 3: {ranking[2]} with Weight: {weights[2]}")
print(f"Full ranking: {ranking}")
print(f"Sum of weights: {sum(weights)}")

Top 1: SORD with Weight: 0.1556898116026983
Top 2: X with Weight: 0.13255208404701815
Top 3: SMM with Weight: 0.13152383834692327
Full ranking: ('SORD', 'X', 'SMM', 'MAX', 'DyS', 'HDy', 'ACC', 'PACC', 'CC', 'MS', 'PCC')
Sum of weights: 1.0000000000000002


# EnsembleQuantifier

You can use the resulting ranking from a recommender and feed it to the EnsembleQuantifier.

For example, we can ask a recommender for a ranking of quantifiers for a dataset and then use an ensemble of the
Top-k quantifiers to estimate the positive-class prevalence of that dataset.

In [3]:
from regression_recommender import RegressionRecommender
from ensemble_quantifier import EnsembleQuantifier
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Let's load a dataset: the "class" column is the target,
# every other column is a feature.
df = pd.read_csv("./datasets/winetype.csv")
X = df.drop(columns=["class"]).values
y = df["class"].values

# Instantiate a RegressionRecommender model
recommender = RegressionRecommender()

# We can use the recommender to get a ranking of quantifiers
# (and the weight associated with each of them).
ranking, weights = recommender.recommend(X, y)

# Let's instantiate an EnsembleQuantifier model and pass the ranking to it.
# NOTE(review): no k is passed here, and the repr printed by this cell lists
# the full ranking, so this appears to build an ensemble over every ranked
# quantifier rather than only the Top-3 -- confirm against
# EnsembleQuantifier's signature.
ensemble_qtf = EnsembleQuantifier(ranking=ranking)

# Split the data into train and test sets (70% / 30%, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the ensemble model
ensemble_qtf.fit(X_train, y_train)

# Predict the positive prevalence of the test set
pred_prev = ensemble_qtf.predict(X_test)

# Compare the prediction with the truth.
# Assumes the labels are encoded as 0/1 with 1 = positive, so that the sum
# counts the positive instances -- TODO confirm.
true_prev = np.sum(y_test) / len(y_test)

print(f"{ensemble_qtf}")
print(f"True prevalence: {true_prev}")
print(f"Predicted prevalence: {pred_prev}")

# We can change the method of the EnsembleQuantifier.
# Available methods are the following:
# - "median": returns the median of the predictions
# - "weighted": returns the weighted average of the predictions
#
# Default is "median".

# Let's use the weighted method and pass the weights to the ensemble model.
ensemble_qtf.method = "weighted"
ensemble_qtf.weights = weights

# Model is already fitted, so we can predict again
pred_prev = ensemble_qtf.predict(X_test)

# The leading "\n" separates this report from the previous one.
print(f"\n{ensemble_qtf}")
print(f"True prevalence: {true_prev}")
print(f"Predicted prevalence: {pred_prev}")

# We can also ask the recommender for a Top-k ranking (here k = 3)
# and pass it, with its weights, to the ensemble model.
# NOTE(review): in the recorded output the k=3 ranking is not the first three
# entries of the full ranking obtained above -- confirm that this is expected.
ranking, weights = recommender.recommend(X, y, k=3)

ensemble_qtf.ranking = ranking
ensemble_qtf.weights = weights

# Model is fitted, so we can just predict
pred_prev = ensemble_qtf.predict(X_test)

print(f"\n{ensemble_qtf}")
print(f"True prevalence: {true_prev}")
print(f"Predicted prevalence: {pred_prev}")

EnsembleQuantifier(ranking=('SORD', 'X', 'SMM', 'MAX', 'DyS', 'HDy', 'ACC', 'PACC', 'CC', 'MS', 'PCC'), weights=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], method=median)
True prevalence: 0.7630769230769231
Predicted prevalence: 0.76

EnsembleQuantifier(ranking=('SORD', 'X', 'SMM', 'MAX', 'DyS', 'HDy', 'ACC', 'PACC', 'CC', 'MS', 'PCC'), weights=(np.float64(0.1556898116026983), np.float64(0.13255208404701818), np.float64(0.13152383834692327), np.float64(0.11591094835805989), np.float64(0.11428853307711338), np.float64(0.10866110751828068), np.float64(0.06366978915726007), np.float64(0.048393397778182495), np.float64(0.046386796342616186), np.float64(0.044602061300045154), np.float64(0.03832163247180237)), method=weighted)
True prevalence: 0.7630769230769231
Predicted prevalence: 0.7606429310527548

EnsembleQuantifier(ranking=('DyS', 'ACC', 'CC'), weights=(np.float64(0.5094317799381599), np.float64(0.2838028728287671), np.float64(0.20676534723307297)), method=weighted)
True prevalence: 0.76307692

# Evaluation

Evaluate RegressionRecommender with Leave-One-Out

In [4]:
from regression_recommender import RegressionRecommender

# RegressionRecommender ships with a leave-one-out evaluation:
#   1. one instance is held out of the meta-table;
#   2. the model is fitted on the remaining instances;
#   3. the ranking of the held-out instance is predicted and appended,
#      alongside its true ranking, to the recommender evaluation table.
recommender = RegressionRecommender()

# The two arguments are the CSV paths for the recommender evaluation table and
# the quantifiers evaluation table (presumably where they are written -- verify).
recommender_eval, quantifiers_eval = recommender.leave_one_out_evaluation(
    "./plot_data/regression_recommender_evaluation_table.csv",
    "./plot_data/regression_recommender_quantifiers_evaluation_table.csv",
)

We can evaluate the EnsembleQuantifier with APP.

APP is very costly, so we do not actually run it again. Since our model is an ensemble, we reuse the recommender evaluation table and the quantifiers evaluation table produced by the recommender's leave-one-out evaluation. (In effect, APP is "run" on the same datasets that were used to train the recommender.)

In [5]:
from ensemble_quantifier import EnsembleQuantifier

# Inputs: the recommender evaluation table and the quantifiers evaluation
# table returned by RegressionRecommender.leave_one_out_evaluation in the
# previous cell, plus the CSV path for the ensemble evaluation table.
ensemble_qtf = EnsembleQuantifier()

# Kept as the last expression so the notebook displays the returned table.
ensemble_qtf.evaluation(
    recommender_eval,
    quantifiers_eval,
    "./plot_data/ensemble_quantifier_evaluation_table.csv",
)

Unnamed: 0,quantifier,dataset,sample_size,sampling_seed,iteration,alpha,pred_prev,abs_error,run_time
0,Top-1,AedesQuinx,100,623,1,0.0,0.000000,0.000000,0.123091
1,Top-1,AedesQuinx,100,243,2,0.0,0.000000,0.000000,0.116522
2,Top-1,AedesQuinx,100,18,3,0.0,0.000000,0.000000,0.133319
3,Top-1,AedesQuinx,100,549,4,0.0,0.000000,0.000000,0.126153
4,Top-1,AedesQuinx,100,447,5,0.0,0.000000,0.000000,0.126957
...,...,...,...,...,...,...,...,...,...
26395,Top-9+W,click-prediction,100,554,6,1.0,0.989238,0.010762,1.902615
26396,Top-9+W,click-prediction,100,379,7,1.0,0.930627,0.069373,1.905128
26397,Top-9+W,click-prediction,100,348,8,1.0,0.989628,0.010372,1.911327
26398,Top-9+W,click-prediction,100,840,9,1.0,0.975554,0.024446,1.911165
