# BidooBot: Preprocessing

A notebook that pre-processes the raw scraped data.

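The process consists of the five steps listed below (the `preprocessing` function at the end of the notebook runs step 5 right after step 1, and finishes with a few hotfixes, sorting, and storage):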

1. concatenation in a unified CSV sheet
2. anonymization of personally identifiable information (PII) -- username
3. removal of quasi-identifiers (QI) -- URL and image URL
4. translation of text data to lowercase English
5. standardization of feature names

In [None]:
%%writefile requirements.txt
sentencepiece==0.1.97
sacremoses==0.0.53
transformers==4.26.1

In [None]:
# Upgrade pip itself, then install/upgrade the packages pinned in requirements.txt.
!pip install --upgrade pip
!pip install --upgrade -r requirements.txt

In [None]:
import json
import pathlib
import typing
from functools import lru_cache
from typing import Any

import pandas as pd
import torch
from google.colab import drive
from sklearn.preprocessing import LabelEncoder
from tqdm.auto import tqdm
from transformers import pipeline
from transformers.pipelines import text2text_generation

In [None]:
# Mount Google Drive only when the mount point is not there yet, so that
# re-running this cell does not call drive.mount() a second time.
if not pathlib.Path("/content/drive").exists():
    drive.mount("/content/drive")

In [None]:
# Project folder on the mounted Drive; paths used by the notebook are built from it.
ROOT = pathlib.Path("/content/drive/MyDrive/Colab Notebooks/bidoobot")

In [None]:
def _01_concatenation(data_location: pathlib.Path) -> pd.DataFrame:
    """Load every scraped CSV chunk and stack them into a single frame.

    Args:
        data_location: Directory containing the ``raw`` sub-directory that
            holds the ``closed_auctions*.csv`` files.

    Returns:
        One frame with the rows of all chunks, re-indexed from 0 so that row
        labels are unique across chunks.

    Raises:
        AssertionError: If the ``raw`` directory does not exist or contains no
            matching CSV file.
    """
    raw_data_location = data_location / "raw"
    assert raw_data_location.exists(), "Did you run 00-scraping.ipynb?"

    # glob() yields files in arbitrary filesystem order; sort them so that
    # repeated runs concatenate the chunks in the same order.
    filepaths = sorted(raw_data_location.glob("closed_auctions*.csv"))
    assert filepaths, "No scraped data available"

    frames = [
        pd.read_csv(filepath)
        for filepath in tqdm(filepaths, desc="Loading scraped data")
    ]

    # Each chunk is read with its own 0..n-1 index; without ignore_index the
    # concatenated frame would carry duplicated row labels.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Lookup table type: maps an integer code to the value it stands for.
LUT = typing.Dict[int, Any]

In [None]:
def compute_lookup_table(encoder: LabelEncoder) -> LUT:
    """Build the ``code -> original label`` table of an encoder.

    Args:
        encoder: Encoder exposing ``classes_`` and ``transform`` (i.e. one
            that has already been fitted).

    Returns:
        A dictionary whose keys are the codes produced by
        ``encoder.transform`` and whose values are the matching labels.
    """
    labels = encoder.classes_
    return {code: label for code, label in zip(encoder.transform(labels), labels)}

In [None]:
def _02_anonymization(
    dataset: pd.DataFrame, identifiers: typing.List[str]
) -> typing.Tuple[pd.DataFrame, typing.Dict[str, LUT]]:
    """Replace the values of each identifier column with encoder codes.

    The frame is modified in place and also returned.

    Args:
        dataset: Frame holding the identifier columns.
        identifiers: Names of the columns to encode.

    Returns:
        The same frame with encoded identifier columns, and a dictionary that
        maps each identifier column to its ``code -> original value`` table.
    """
    lookup_tables = {}

    for column in tqdm(identifiers, desc="Anonymizing identifiers"):
        # A fresh encoder per column keeps the code spaces independent.
        label_encoder = LabelEncoder()
        codes = label_encoder.fit_transform(dataset[column])

        dataset[column] = codes
        lookup_tables[column] = compute_lookup_table(label_encoder)

    return dataset, lookup_tables

In [None]:
def _03_quasi_identifiers_removal(
    dataset: pd.DataFrame, quasi_identifiers: typing.List[str]
) -> pd.DataFrame:
    return dataset.drop(columns=quasi_identifiers)

In [None]:
@lru_cache(maxsize=1024)
def translate(translator: text2text_generation.TranslationPipeline, text: str) -> str:
    """Translate ``text`` with ``translator`` and lowercase the result.

    Results are memoized per ``(translator, text)`` pair, up to 1024 entries.

    Args:
        translator: Callable pipeline; its first output item must provide a
            ``"translation_text"`` entry.
        text: Text to translate.

    Returns:
        The lowercased translation.
    """
    first_candidate = translator(text)[0]
    return first_candidate["translation_text"].lower()

In [None]:
def _04_translation(
    dataset: pd.DataFrame,
    translator: text2text_generation.TranslationPipeline,
    nontranslated: typing.List[str]
) -> pd.DataFrame:
    """Translate the text columns of ``dataset``, one distinct value at a time.

    The frame is modified in place and also returned. Missing values are not
    sent to the translator and remain missing in the result.

    Args:
        dataset: Frame holding the columns to translate.
        translator: Translation pipeline forwarded to ``translate``.
        nontranslated: Names of the columns to translate.

    Returns:
        The same frame with the listed columns replaced by their translation.
    """
    for col in tqdm(nontranslated, "Translating"):
        # Translate each distinct value once. Missing values are left out:
        # they are not text, and Series.map() turns every key absent from the
        # table into NaN, so those cells stay missing.
        strings = dataset[col].dropna().unique()

        table = {
            text: translate(translator, text)
            for text in tqdm(strings, desc=f"Translating - {col}")
        }

        dataset[col] = dataset[col].map(table)

    return dataset

In [None]:
def _05_standardization(
    dataset: pd.DataFrame, nonstandard: typing.Dict[str, str]
) -> pd.DataFrame:
    return dataset.rename(columns=nonstandard)

In [None]:
def fix_01_bad_translation(dataset: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``"car"`` winner modality with ``"automatic"``.

    Only cells of ``winner_modality`` that are exactly ``"car"`` are changed.
    The frame is modified in place and also returned.
    NOTE(review): presumably "car" is a mistranslation produced upstream --
    confirm against the raw data.
    """
    mistranslated = dataset["winner_modality"].eq("car")
    dataset.loc[mistranslated, "winner_modality"] = "automatic"
    return dataset

In [None]:
def fix_02_localization(dataset: pd.DataFrame) -> pd.DataFrame:
    """Shift the ``timestamp`` column from CET wall-clock time to UTC.

    The epoch-second values are parsed as naive datetimes, interpreted as CET
    local time, converted to UTC and written back as epoch seconds. The frame
    is modified in place and also returned.

    NOTE(review): ``tz_localize`` is called with its default settings, so a
    value that falls inside a daylight-saving transition (ambiguous or
    non-existent local time) raises instead of being converted.
    """
    def to_epoch_seconds(moment):
        return moment.timestamp()

    naive = pd.to_datetime(dataset["timestamp"], unit="s")
    as_cet = naive.dt.tz_localize("CET")
    as_utc = as_cet.dt.tz_convert("UTC")

    dataset.loc[:, "timestamp"] = as_utc.apply(to_epoch_seconds)
    return dataset

In [None]:
def hotfix(dataset: pd.DataFrame) -> pd.DataFrame:
    """Apply the data fixes in order: bad translation, then localization.

    Args:
        dataset: Frame to fix.

    Returns:
        The frame returned by the last fix.
    """
    for apply_fix in (fix_01_bad_translation, fix_02_localization):
        dataset = apply_fix(dataset)

    return dataset

In [None]:
def _06_storage(
    data_location: pathlib.Path,
    dataset: pd.DataFrame,
    mappers: typing.Dict[str, LUT]
) -> None:
    """Write the processed dataset and the lookup tables to disk as CSV.

    Files are written to ``data_location / "processed"``:
    ``closed_auctions.csv`` for the dataset and one ``mapper_<col>.csv`` per
    entry of ``mappers`` with the columns ``idx`` and ``<col>``.

    Args:
        data_location: Directory under which ``processed`` is created.
        dataset: Frame to store (written without its index).
        mappers: Lookup tables keyed by column name.
    """
    processed_location = data_location / "processed"
    # Create the folder (and any missing parent) in a single call rather than
    # checking for existence first, which is racy and fails on missing parents.
    processed_location.mkdir(parents=True, exist_ok=True)

    dataset.to_csv(processed_location / "closed_auctions.csv", index=False)

    # NOTE(review): each table stores the values the codes stand for, so these
    # files can presumably be used to reverse the encoding -- handle with care.
    for col, mapper in mappers.items():
        mapper_filepath = processed_location / f"mapper_{col}.csv"

        table = pd.DataFrame(list(mapper.items()), columns=["idx", col])
        table.to_csv(mapper_filepath, index=False)

In [None]:
def preprocessing(
    data_location: pathlib.Path,
    translator: text2text_generation.TranslationPipeline,
    nontranslated: typing.List[str],
    nonstandard: typing.Dict[str, str],
    identifiers: typing.List[str],
    quasi_identifiers: typing.List[str],
    **kwargs
) -> None:
    """Run the whole pipeline and store its output.

    Order of the stages: concatenation, column standardization, anonymization,
    quasi-identifier removal, translation, hotfixes, sorting, storage.

    Args:
        data_location: Directory with the raw data; results are stored
            beneath it as well.
        translator: Translation pipeline used for the text columns.
        nontranslated: Columns to translate.
        nonstandard: Column renaming map, applied before every other
            per-column stage (so the other arguments use the new names).
        identifiers: Columns to anonymize.
        quasi_identifiers: Columns to drop.
        **kwargs: Forwarded unchanged to ``DataFrame.sort_values``.
    """
    frame = _01_concatenation(data_location)
    frame = _05_standardization(frame, nonstandard)
    frame, lookup_tables = _02_anonymization(frame, identifiers)
    frame = _03_quasi_identifiers_removal(frame, quasi_identifiers)
    frame = _04_translation(frame, translator, nontranslated)
    frame = hotfix(frame)

    _06_storage(data_location, frame.sort_values(**kwargs), lookup_tables)

In [None]:
# Run the model on the first GPU when CUDA is available, on the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Translation pipeline backed by the "it-en" checkpoint named below.
translator = pipeline(
    "translation", model="Helsinki-NLP/opus-mt-tc-big-it-en", device=device
)

In [None]:
# Keyword arguments for preprocessing(); entries that are not named
# parameters of that function ("by", "ascending") end up in its **kwargs.
FLAGS = {
    "data_location": ROOT / "data",
    # Sorting options.
    "by": "timestamp",
    "ascending": True,
    # Translation: pipeline to use and the columns it is applied to.
    "translator": translator,
    "nontranslated": ["name", "winner_modality"],
    # Column renames: current name -> standardized name.
    "nonstandard": {
        "n_bids": "num_bids", "value": "price", "winner_n_bids": "winner_num_bids"
    },
    # Columns to anonymize and columns to drop.
    "identifiers": ["winner_username"],
    "quasi_identifiers": ["URL", "image_URL", "id", "winner_savings_pct"],
}

In [None]:
# Run the pipeline, passing every FLAGS entry as a keyword argument.
preprocessing(**FLAGS)