In [1]:
import pandas as pd
import openml
from pathlib import Path
from tqdm.auto import tqdm
from fractions import Fraction

In [2]:
openml.config.apikey = 'd9b3362c2eded26e48910da5e7e243c6'

In [3]:
X = pd.read_csv("/home/belucci/code/cohirf/data/criteo-up-bal/X.csv", index_col=0)
y = pd.read_csv("/home/belucci/code/cohirf/data/criteo-up-bal/y.csv", index_col=0)
df = pd.concat([X, y], axis=1)

In [4]:
df

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,treatment,label
540,12.896938,10.059654,8.214383,3.359763,10.280525,4.115453,-2.411115,4.833815,3.971858,13.190056,5.300375,-0.168679,1,1
548,16.689925,10.059654,8.214383,1.267425,10.280525,4.115453,-6.699321,4.833815,3.971858,13.190056,5.300375,-0.168679,1,1
719,20.729166,10.059654,8.233548,4.679882,12.310110,3.013064,-12.641800,10.076439,3.760462,45.859490,5.988106,-0.168679,1,2
738,24.528223,10.059654,8.403907,4.679882,11.561050,4.115453,-3.993764,4.833815,3.844556,26.606156,6.064094,-0.168679,1,2
743,12.616365,10.059654,8.301676,4.679882,11.029584,4.115453,0.294443,4.833815,3.829288,23.570168,6.318202,-0.168679,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6476051,24.018917,10.059654,8.214383,4.679882,10.280525,4.115453,-6.699321,4.833815,3.971858,13.190056,5.300375,-0.168679,0,0
8629302,12.861885,10.059654,8.649842,-1.249896,10.280525,3.013064,-13.412152,9.324689,3.854508,25.240993,5.300375,-0.168679,1,0
10244353,26.417390,10.059654,8.214383,4.679882,10.280525,4.115453,-2.411115,4.833815,3.971858,13.190056,5.300375,-0.168679,0,0
1991023,13.442709,10.059654,8.214383,1.614662,10.280525,4.115453,-5.987667,4.833815,3.971858,13.190056,5.300375,-0.168679,1,0


In [5]:
df.dtypes

f0           float64
f1           float64
f2           float64
f3           float64
f4           float64
f5           float64
f6           float64
f7           float64
f8           float64
f9           float64
f10          float64
f11          float64
treatment      int64
label          int64
dtype: object

In [6]:
df['treatment'] = df['treatment'].astype('str').astype('category')
df['label'] = df['label'].astype('str').astype('category')

In [7]:
name = "criteo-uplift-balanced"
description = """Criteo Uplift Modeling Dataset, preprocessed and balanced for classification / clustering task

From the website:
-----
Data description
This dataset is constructed by assembling data resulting from several incrementality tests, a particular randomized trial procedure where a random part of the population is prevented from being targeted by advertising. it consists of 25M rows, each one representing a user with 11 features, a treatment indicator and 2 labels (visits and conversions).

Privacy
For privacy reasons the data has been sub-sampled non-uniformly so that the original incrementality level cannot be deduced from the dataset while preserving a realistic, challenging benchmark. Feature names have been anonymized and their values randomly projected so as to keep predictive power while making it practically impossible to recover the original features or user context.

Fields
Here is a detailed description of the fields (they are comma-separated in the file):

f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11: feature values (dense, float)
treatment: treatment group (1 = treated, 0 = control)
conversion: whether a conversion occured for this user (binary, label)
visit: whether a visit occured for this user (binary, label)
exposure: treatment effect, whether the user has been effectively exposed (binary)
Key figures
Format: CSV
Size: 459MB (compressed)
Rows: 25,309,483
Average Visit Rate: .04132
Average Conversion Rate: .00229
Treatment Ratio: .846
Tasks
The dataset was collected and prepared with uplift prediction in mind as the main task. Additionally we can foresee related usages such as but not limited to:

benchmark for causal inference
uplift modeling
interactions between features and treatment
heterogeneity of treatment
benchmark for observational causality methods
Contact
For any question, feel free to contact:

The authors of the paper directly (emails in the paper)
Criteo AI Lab team: http://ailab.criteo.com/contact-us/
Criteo AI Lab twitter account: @CriteoResearch
-----

We have adapted the dataset to a classification / clustering task. The "label" column was obtained by concatenating the columns "conversion", "visit" and "exposure", 
we have then dropped duplicated rows and the rows with "label" "110" and "111", due to having too few samples. We have also randomly sampled 500_000 rows with "label" "000" and removed the 
other ones, so the "label" column is a bit more balanced with a proportion of 0.37 "000", 0.34 "010", 0.18 "001" and 0.11 "011". Finally, we have encoded the "label" column
as 0: "000", 1: "001", 2: "010", 3: "011". The original index of the dataset was kept.
"""
creator = "Criteo AI Lab"
contributor = "Bruno Belucci"
collection_date = "2018"
language = 'English'
licence = 'Creative Commons Attribution 4.0 International'
attributes = 'auto'
data = df
default_target_attribute = 'label'
ignore_attribute = None
citation = 'Diemert, Eustache, Artem Betlei, Christophe Renaudin, and Massih-Reza Amini. "A Large Scale Benchmark for Uplift Modeling." KDD (London, United Kingdom), 2018. https://hal.science/hal-02515860.'
row_id_attribute = None
original_data_url = "https://ailab.criteo.com/criteo-uplift-prediction-dataset/"
paper_url = "https://hal.science/hal-02515860/"
update_comment = None
version_label = None

In [9]:
dataset = openml.datasets.create_dataset(name=name, description=description, creator=creator, contributor=contributor, collection_date=collection_date,
                                         language=language, licence=licence, attributes=attributes, data=data, default_target_attribute=default_target_attribute,
                                         ignore_attribute=ignore_attribute, citation=citation, row_id_attribute=row_id_attribute, original_data_url=original_data_url,
                                         update_comment=update_comment, version_label=version_label)

In [10]:
dataset

OpenML Dataset
Name.........: criteo-uplift-balanced
Version......: None
Format.......: arff
Licence......: Creative Commons Attribution 4.0 International
Download URL.: None
# of features: None

In [12]:
dataset.publish()

OpenML Dataset
Name.........: criteo-uplift-balanced
Version......: None
Format.......: arff
Licence......: Creative Commons Attribution 4.0 International
Download URL.: None
OpenML URL...: https://www.openml.org/d/47039
# of features: None

In [None]:
dataset = openml.datasets.get_dataset(47037)

In [8]:
openml.datasets.delete_dataset(47037)

True

In [None]:
openml.datasets.edit_dataset(47037, default_target_attribute='label')

47037

In [None]:
y.value_counts(normalize=True)

label
0        0.365886
2        0.337827
1        0.183243
3        0.113043
Name: proportion, dtype: float64