# Generate LaTeX code for RS/ML taxonomy table from a Dataframe

Manually copying each entry from the DataFrame to Overleaf was not a very appealing afternoon activity. So, I decided to waste a few days trying to generate the LaTeX code for the tables/taxonomies directly from the DataFrame. Believe it or not, it worked out, and the table generation is now fully automated. Still, it is a tedious process and many of the decisions are hardcoded. See [table_FPGA.py](table_FPGA.py) for the second taxonomy (Table 3).
The only things I added manually are the `\hspace{}` commands that center the rotated text in large multirows. Indeed, it seems that the width of the rules of the `booktabs` package is not taken into account when centering the label.


In [1]:
import pandas as pd
from pathlib import Path
import re

from utils.utils import print_pretty_df, r, y, g, b, e
from utils.metadata import assign_model_core

from collections import OrderedDict


# Load the data
# Both paths below are relative, so they resolve against the current working
# directory of the process running this notebook.
# NOTE(review): pd.read_pickle unpickles arbitrary Python objects; only load
# pickles produced by this project, never a file from an untrusted source.
pickleName = "all_datapoints.pkl"
datapointsDfPath = Path("..") / "data" / "Dataframes" / pickleName
df = pd.read_pickle(datapointsDfPath)

# Path to save the latex table
latexPath = Path("../data/Tables/ML-RS_latex_table.tex")

In [2]:
# Show the loaded datapoints and report how many rows (experiments) there are.
print_pretty_df(df)
print(f"Total number of experiments: {len(df)}")

+----+-------------------------------------------------------+-------------------------------+------------------+----------------+----------------+---------------------------------------------------+----------------------+-----------------------------------------+------------------------------------------------+----------------------+------------------+-----------+------------+-------------+-----------+--------------+-------------------+------------+-------------+-------------------------------------+----------------+-----------------------------------------+-------------------------------------------------------------+-------------------------------------------+------------+----------+----------+---------------------+
|    |                   BBT Citation Key                    |             Model             | Equivalent model |    Backbone    |    Modality    |                      Dataset                      |         Task         |               Application               |       

### ML/RS Table generation

This code generates the LaTeX code for the ML/RS table in the survey. It is divided into two parts: (1) preprocessing the DataFrame (formatting, adding citations, sorting and grouping fields) and (2) generating the LaTeX code itself with a recursive tree structure.

#### Data cleaning/preparation


In [None]:
# ---------------------------
# 1) Data cleaning and formatting
# ---------------------------

# ----- Rename columns -----
# Long source headers on the left, the short names used from here on on the right.
column_renames = {
    "BBT Citation Key": "Article",
    "Publication year": "Year",
    "Task score": "Score",
    "Power consumption": "Power",
}
df = df.rename(columns=column_renames)

# ----- Rename Application values -----
def rename_application(app: str) -> str:
    """Shorten an application label for display in the table.

    A few labels have a dedicated short form and are returned as such. Every
    other label keeps its wording, with the long words "identification",
    "detection", "classification" and "extraction" abbreviated.
    """
    dedicated_short_forms = {
        "Safe UAV landing site identification": "UAV landing site id.",
        "Railway track fastener defect detection": "Railway defect det.",
        "Redshift estimation [regression]": "Redshift est.",
        "Oil spills feature extraction": "Oil spills mon.",
        # "Air Quality Monitoring" -> "Air quality mon." was dropped when the
        # label was changed to "Air quality classification".
        "Diverse": "Diverse det.",
    }
    if app in dedicated_short_forms:
        return dedicated_short_forms[app]

    abbreviations = (
        ("identification", "id."),
        ("detection", "det."),
        ("classification", "class."),
        ("extraction", "extr."),
    )
    for long_word, short_word in abbreviations:
        app = app.replace(long_word, short_word)
    return app
# Shorten every value of the Application column.
df["Application"] = df["Application"].apply(rename_application)
# print(df["Application"].unique())


# ----- Clean up the Dataset column -----
# Remove the task from the dataset tag
def remove_task_from_dataset_tag(datasetTag: str) -> str:
    """Return the text of *datasetTag* before its first ``{``, trimmed.

    A tag without any ``{`` is returned whole (trimmed).
    """
    dataset_name, _, _ = datasetTag.partition("{")
    return dataset_name.strip()

# Keep only the part of each Dataset value that precedes the first "{".
df["Dataset"] = df["Dataset"].apply(remove_task_from_dataset_tag)
# Rename long dataset names + add citations
# Switch: when False, the dataset labels are emitted without their \cite command.
with_citation = True


def wrap_italic(x):
    r"""Wrap *x* in a LaTeX ``\textit{...}`` command."""
    return r"\textit{" + x + r"}"


def add_citation(label, cite_key, enabled=None):
    r"""Return *label*, followed by `` \cite{cite_key}`` when citations are enabled.

    Args:
        label: Display name of the dataset.
        cite_key: Bibliography key placed inside the ``\cite`` command.
        enabled: Force citations on/off; ``None`` (default) follows the
            module-level ``with_citation`` switch.

    Returns:
        The label alone when citations are disabled, never an empty string.
    """
    if enabled is None:
        enabled = with_citation
    if not enabled:
        return label
    return label + r" \cite{" + cite_key + "}"


# Maps the raw dataset name to the label printed in the table.
# The insertion order matters: it is the order used to sort the datasets.
dataset_order = {
    "UC-Merced Land Use": add_citation(
        "UCM Land Use", "yangBagvisualwordsSpatialExtensions2010"
    ),
    "NWPU-RESISC45": add_citation("NWPU-RESISC45", "chengRemoteSensingImage2017"),
    "MASATI": add_citation("MASATI", "gallegoAutomaticShipClassification2018"),
    "Kaggle SSI": "Kaggle SSI",
    "L8 Biome": add_citation("L8 Biome", "fogaCloudDetectionAlgorithm2017"),
    "MSTAR": add_citation("MSTAR", "MSTAROverview"),
    "University of Pavia": add_citation(
        "University of Pavia", "HyperspectralRemoteSensing"
    ),
    "AVIRIS-NG": "AVIRIS-NG",
    "Airport-Beach-Urban": add_citation("ABU", "ABUDataset"),
    "Potsdam": add_citation("Potsdam", "2DSemanticLabeling"),
    "38-Cloud": add_citation(
        "38-Cloud", "mohajeraniSorourMo38CloudCloudSegmentationDataset2024"
    ),
    "DOTAv1.0": add_citation("DOTAv1.0", "xiaDOTALargeScaleDataset2018"),
    "DIOR": add_citation("DIOR", "liObjectDetectionOptical2020"),
    "NWPU VHR-10": add_citation(
        "NWPU VHR-10", "chengMulticlassGeospatialObject2014"
    ),
    "DAC 2018": "DAC 2018",
    "PennSyn2Real": add_citation(
        "PennSyn2Real", "nguyenPennSyn2RealTrainingObject2021"
    ),
    "Airbus": "Airbus",
    "SSDD": add_citation("SSDD", "liShipDetectionSAR2017"),
    # Custom and simulated datasets have no citation; they are set in italics.
    "Sentinel-2 (cust.)": wrap_italic("Sentinel-2 (cust.)"),
    "Landsat-8-OLI (cust.)": wrap_italic("Landsat-8-OLI (cust.)"),
    "ALOS-2 (cust.)": wrap_italic("ALOS-2 (cust.)"),
    "ALSAT-2A (cust.)": wrap_italic("ALSAT-2A (cust.)"),
    "FastenerDataset (cust.)": wrap_italic("FastenerDataset (cust.)"),
    "RGB (cust.)": wrap_italic("RGB (cust.)"),
    "UAV RGB (cust.)": wrap_italic("UAV RGB (cust.)"),
    "UAV RGB+MMW (cust.)": wrap_italic("UAV RGB+MMW (cust.)"),
    "SAR (sim.)": wrap_italic("SAR (sim.)"),
    "1D Signal (sim.)": wrap_italic("1D Signal (sim.)"),
    "Google Earth Studio RGB (sim.)": wrap_italic("GES RGB (sim.)"),
}
# Swap each raw dataset name for its display label (names without an entry are kept).
df.replace({"Dataset": dataset_order}, inplace=True)

# ----- Wrap citation keys with the command -----
# Each key is checked on its own, so re-running the cell never wraps a key
# twice, and nothing depends on the DataFrame having a row labelled 0.
df["Article"] = df["Article"].apply(
    lambda key: key if key.startswith(r"\cite") else f"\\cite{{{key}}}"
)

# ----- Clean up "Board" names -----
def determine_board(boardTag: str) -> str:
    """Return the text of *boardTag* before its first ``(``, trimmed.

    A tag without parentheses is returned whole (trimmed).
    """
    return boardTag.partition("(")[0].strip()
# Reduce every Board value to the text before its first "(".
df["Board"] = df["Board"].apply(determine_board)

# ----- Backslash-escape special characters in Model names -----
# Every "_" becomes "\_" so that LaTeX prints a literal underscore.
df["Model"] = df["Model"].apply(lambda x: x.replace("_", r"\_"))

#  ----- Adjust metrics units -----
def transform_score(score_val: str) -> str:
    """Rewrite ``"<number>% <metric>"`` as ``"<number> (<metric>)"``.

    Only the first ``<number>%<metric>`` occurrence is used; a value without
    one is returned unchanged.
    """
    found = re.search(r"([\d.]+)%\s*(\S+)", score_val)
    if found is None:
        return score_val
    return "{} ({})".format(found.group(1), found.group(2))
# Missing scores are replaced by "" before the transform is applied.
df["Score"] = df["Score"].fillna("").apply(transform_score)

def transform_complexity(val: str) -> str:
    """Return *val* with every ``" OP"`` occurrence removed."""
    return "".join(val.split(" OP"))
# Missing complexities are replaced by "" before the transform is applied.
df["Complexity"] = df["Complexity"].fillna("").apply(transform_complexity)

def strip_unit(val: str) -> str:
    """Remove a trailing ``MB``, ``W`` or ``GOP/s`` and the whitespace before it."""
    trailing_unit = re.compile(r"\s*(MB|W|GOP/s)$")
    return trailing_unit.sub("", val)
# The three metric columns share the same unit-stripping helper; missing
# values are replaced by "" before it is applied.
df["Footprint"] = df["Footprint"].fillna("").apply(strip_unit)
df["Throughput"] = df["Throughput"].fillna("").apply(strip_unit)
df["Power"] = df["Power"].fillna("").apply(strip_unit)

# ----- Simplify modalities -----, e.g., "RGB + infrared" -> "RGB"
def simplify_modality(modalityTag: str) -> str:
    """Return the first modality of a ``+``-separated tag, trimmed."""
    primary, _, _ = modalityTag.partition("+")
    return primary.strip()
# Keep only the first modality of each Modality value.
df["Modality"] = df["Modality"].apply(simplify_modality)

# ----- Add the model core -----
# assign_model_core (imported from utils.metadata) is called once per row (axis=1).
df["Model Core"] = df.apply(assign_model_core, axis=1)

# Acronym for long model names
# Only cells equal to the full name are replaced; the table footer spells the acronyms out.
df["Model"] = df["Model"].replace("Roller Dung Bettle Clustering", "RDBC")
df["Model"] = df["Model"].replace("Weightless Neural System", "WNS")

# ---------------------------
# 2) Sorting the data
# ---------------------------
# Each list gives the display order of one grouping column.
taskOrder = [
    "Classification",
    "Segmentation - Pixel", # previously named "Pixel classification",
    "Segmentation - Tile",  # previously named "Segmentation",
    "Object detection",
    "Regression",
]
modalityOrder = ["RGB", "RGB + infrared", "RGB + MMW", "HSI", "SAR", "1D"]
app_order = [
    "Deforestation det.",
    "UAV landing site id.",
    "Land Use Land Cover",
    "Military targets id.",
    "Ship id.",
    "Urban areas",
    "Cloud coverage class.",
    "Cloud coverage extr.",
    # "General", # changed to "Land Use Land Cover"
    "Redshift est.",
    "Air quality class.",
    "Oil spills mon.",
    "Diverse det.",
    "Flying-object det.",
    "Aircraft det.",
    "UAV obstacles det.",
    "Railway defect det.",
    "Anomaly det.",
    "Ship det.",
]

# Build a custom order_map (value -> rank) per column.
task_order_map = {val: idx for idx, val in enumerate(taskOrder)}
modality_order_map = {val: idx for idx, val in enumerate(modalityOrder)}
app_order_map = {val: idx for idx, val in enumerate(app_order)}
# The dataset ranks follow the insertion order of dataset_order. They have
# their own name so that app_order_map keeps describing the applications.
dataset_order_map = {val: idx for idx, val in enumerate(dataset_order.values())}

# Temporary rank columns. A value missing from its order map gets NaN, which
# sort_values places after all ranked values by default.
sort_key_columns = ["TaskSortKey", "ModalitySortKey", "AppSortKey", "DatasetSortKey"]
df["TaskSortKey"] = df["Task"].map(task_order_map)
df["ModalitySortKey"] = df["Modality"].map(modality_order_map)
df["AppSortKey"] = df["Application"].map(app_order_map)
df["DatasetSortKey"] = df["Dataset"].map(dataset_order_map)

# Sort by the new columns
df = df.sort_values(by=sort_key_columns)
# print_pretty_df(
#     df[
#         [
#             "Task",
#             "TaskSortKey",
#             "Modality",
#             "ModalitySortKey",
#             "AppSortKey",
#             "Application",
#             "DatasetSortKey",
#             "Dataset",
#         ]
#     ]
# )

# Drop the sort keys
df.drop(columns=sort_key_columns, inplace=True)

### Recursive mechanics


In [None]:
def check_array(tags, tdict):
    """Name the groups of *tdict* that match the first element of *tags*.

    Args:
        tags: Non-empty iterable; only its first element is inspected.
        tdict: Mapping from group name to an iterable of identifiers. A group
            matches when at least one identifier is ``in`` the first tag.

    Returns:
        The matching group names joined with ``","`` in *tdict* order, each
        name at most once. When nothing matches, the last key of *tdict* is
        returned as the fallback group.

    Raises:
        IndexError: If *tags* is empty, or if nothing matches and *tdict* is empty.
    """
    # Materialise the first tag once instead of on every inner iteration.
    first_tag = list(tags)[0]
    matched = [
        group
        for group, identifiers in tdict.items()
        if any(identifier in first_tag for identifier in identifiers)
    ]
    if not matched:
        return list(tdict)[-1]
    return ",".join(matched)


class TexNode:
    """One multirow cell of the table plus the sub-tree of cells to its right.

    A node at depth ``i`` holds a value of column ``i - 1`` and groups the
    rows of ``df`` that share it. Its children split those rows by column
    ``i``. The root (depth 0) is only a container and is never rendered.

    Attributes:
        name: Cell content (a value of the parent column).
        depth: Position in the tree; also the 1-based LaTeX column index.
        len: Number of DataFrame rows covered by this node.
        nodes: Child nodes, in order of first appearance in ``df``.
        final: True for nodes past the last column (no children).
        hook / cwidth: Copied from the column the cell belongs to; ``None`` on the root.
    """

    def __init__(self, name, df, columns, i, sdepth):
        """Build the node and, recursively, its sub-tree.

        Args:
            name: Cell content.
            df: Rows covered by this node.
            columns: Column descriptions (objects exposing ``df_key``,
                ``tags``, ``array``, ``hook`` and ``cwidth``).
            i: Depth of this node / index of the column used to split ``df``.
            sdepth: From this depth on, rows are no longer grouped: every row
                gets its own child even if values repeat.
        """
        self.name = name
        self.max_depth = len(columns)
        self.nodes = []
        self.depth = i
        self.len = df.shape[0]
        # The root has no column of its own; neutral defaults keep the
        # attributes defined on every node.
        self.hook = None
        self.cwidth = None
        if i > 0:
            self.hook = columns[i - 1].hook
            self.cwidth = columns[i - 1].cwidth

        if i >= len(columns):
            # Past the last column: leaf cell.
            self.column = None
            self.final = True
            return

        self.column = columns[i]
        self.final = False
        key = self.column.df_key

        if self.column.array:
            # Array columns: one child per row, named after the matching tag groups.
            for j in range(len(df)):
                row = df.iloc[[j]]
                child_name = check_array(row[key], self.column.tags)
                self.nodes.append(TexNode(child_name, row, columns, i + 1, sdepth))
            return

        classified = 0
        for val in df[key].unique():
            rows = df.loc[df[key] == val]
            if len(rows) == 0:
                # Happens for values that never compare equal to themselves (NaN).
                continue
            if self.depth >= sdepth:
                for j in range(len(rows)):
                    self.nodes.append(
                        TexNode(val, rows.iloc[[j]], columns, i + 1, sdepth)
                    )
            else:
                self.nodes.append(TexNode(val, rows, columns, i + 1, sdepth))
            classified += len(rows)
        if classified != self.len:
            self._report_unclassified(df, classified)

    def _report_unclassified(self, df, classified):
        """Print which values of the current column are missing from its tags."""
        key = self.column.df_key
        print("ERROR not all rows classified " + key)
        print(f"Total rows {self.len} used {classified}")
        print(df[key])
        for tar in df[key].unique():
            if not any(tar in known for known in self.column.tags.values()):
                # f-string: the value may be a number or NaN, not only a string.
                print(f"Missed key {tar}")

    def render(self, text_in, first, last, max_depth):
        """Append the LaTeX rows of this sub-tree to *text_in* and return the result.

        Args:
            text_in: LaTeX produced so far.
            first: True when this node is the first child of its parent; the
                cell then continues the row its parent started.
            last: True when this node is the last child of its parent; the
                closing ``\\cmidrule`` is then left to an ancestor.
            max_depth: Deepest level that still draws a ``\\cmidrule`` after itself.
        """
        text = text_in
        # Cells left of this one: blank padding when continuing the parent's
        # row, empty "&" cells when starting a new row.
        text += ("    " if first else "   &") * (self.depth - 2)
        if self.depth > 1:
            text += "   &"
        if self.name == "":
            self.name = "-"
        label = self.name if self.hook is None else self.hook(self.name)
        text += f"\\multirow{{{self.len}}}{{*}}{{{label}}}"
        if self.final:
            text += "\\\\\n"
            # Empty rows for the remaining lines spanned by the multirow.
            text += ("   &" * (self.depth - 1) + "\\\\\n") * (self.len - 1)
        else:
            text += "\n"
            last_index = len(self.nodes) - 1
            for index, child in enumerate(self.nodes):
                text = child.render(
                    text, index == 0, index == last_index, max_depth
                )
        if self.depth <= max_depth and not last:
            text += f"\\cmidrule{{{self.depth}-{self.max_depth}}}\n"
        return text


class TexTable:
    """Complete LaTeX table: preamble and header, the rendered row tree, footer."""

    def __init__(self, df, columns):
        """Keep the inputs and build the tree of cells from *df* and *columns*."""
        self.data = df
        self.columns = columns
        self.start = TexNode("Start", df, columns, 0, 4)

    def render(self, text_in, max_depth):
        """Return *text_in* followed by the whole table (also stored in ``self.text``)."""
        self.text = text_in + self.header()
        top_level = self.start.nodes
        final_index = len(top_level) - 1
        for position, node in enumerate(top_level):
            self.text = node.render(
                self.text, position == 0, position == final_index, max_depth
            )
        self.text += self.footer()
        return self.text

    def header(self):
        """Return everything that precedes the first data row."""
        preamble = r"""
\newcolumntype{H}{>{\setbox0=\hbox\bgroup}c<{\egroup}@{}}

\caption{RS/ML Taxonomy Table}
\label{table:rs-ml_taxonomy}

\begin{adjustbox}{totalheight=\textheight-2\baselineskip}
"""
        # Header categories spanning groups of columns
        category_row = r"""
\multicolumn{1}{c}{\textbf{CV}} & \multicolumn{3}{c}{\textbf{Remote Sensing}} & \multicolumn{2}{c}{\textbf{Article}} & \multicolumn{4}{c}{\textbf{ML Model}} & \multicolumn{1}{c}{\textbf{FPGA}} &\multicolumn{3}{c}{\textbf{Performance}}\\
\cmidrule(lr){1-1} \cmidrule(lr){2-4} \cmidrule(lr){5-6} \cmidrule(lr){7-10} \cmidrule(lr){11-11} \cmidrule(lr){12-14}
"""
        # One column spec letter per column; "H" hides the Eq. Model column
        column_spec = "".join(
            "H" if column.name == "Eq. Model" else "c" for column in self.columns
        )
        # Bold column titles separated by " &"
        title_row = " &".join(
            f"\\textbf{{{column.name}}}" for column in self.columns
        )
        pieces = (
            "\\begin{table}\n\\centering\n",
            preamble,
            "\\begin{tabular}{",
            column_spec,
            "}\n",
            category_row,
            title_row,
            "\\\\\n \\toprule \n",
        )
        return "".join(pieces)

    def footer(self):
        """Return the closing rule, the legend row and the closing environments."""
        return r"""
\bottomrule
\multicolumn{14}{c}{
\textit{Task:} \textbf{Regr.} Regression; \textit{Dataset:} \textbf{GES} Google Earth Studio, \textbf{MMW} MilliMeter-Wave radar; \textit{Model:} \textbf{RDBC} Roller Dung Bettle Clustering, \textbf{WNS} Weightless Neural System
}
\end{tabular}
\end{adjustbox}
\end{table}
"""


class TexColumn:
    """Plain description of one table column.

    Attributes:
        df_key: Name of the DataFrame column the values come from.
        tags: Mapping from group name to the values belonging to that group.
        name: Title printed in the table header.
        cwidth: Column width as a LaTeX length string (defaults to ``"4em"``).
        array: When true, rows are handled one by one and named after their matching tag groups.
        hook: Optional callable applied to a cell's content when it is rendered.
    """

    def __init__(
        self,
        df_key: str,
        tags: dict,
        name: str,
        cwidth: str = "4em",
        array=False,
        hook=None,
    ):
        # Where the data comes from
        self.df_key, self.tags = df_key, tags
        # How the column is presented
        self.name, self.cwidth = name, cwidth
        # Optional per-column behaviour
        self.array, self.hook = array, hook

In [None]:
# ---------------------------
# 3) Prepare dictionaries for the TexColumn tags
#    Each dictionary key will be how you group the data, with a list of possible values.
#    For brevity, we build them from the actual data in df.
# ---------------------------
def build_tag_dict_from_column(column_values):
    """
    Map every distinct value of *column_values* to a one-element list holding it.

    The keys follow the order of first appearance, so iterating the result
    keeps the ordering the DataFrame was sorted into.
    """
    tag_dict = OrderedDict()
    for value in pd.unique(column_values):
        tag_dict[value] = [value]
    return tag_dict


# Remote-sensing side of the table
task_tags = build_tag_dict_from_column(df["Task"])
modality_tags = build_tag_dict_from_column(df["Modality"])
application_tags = build_tag_dict_from_column(df["Application"])
dataset_tags = build_tag_dict_from_column(df["Dataset"])
# Article
article_tags = build_tag_dict_from_column(df["Article"])
year_tags = build_tag_dict_from_column(df["Year"])
# Hardware and model
board_tags = build_tag_dict_from_column(df["Board"])
model_tags = build_tag_dict_from_column(df["Model"])
core_tags = build_tag_dict_from_column(df["Model Core"])
backbone_tags = build_tag_dict_from_column(df["Backbone"])
equivmodel_tags = build_tag_dict_from_column(df["Equivalent model"])
# Performance
score_tags = build_tag_dict_from_column(df["Score"])
footprint_tags = build_tag_dict_from_column(df["Footprint"])
complex_tags = build_tag_dict_from_column(df["Complexity"])
# latency_tags    = build_tag_dict_from_column(df["Latency"])
# through_tags    = build_tag_dict_from_column(df["Throughput"])
# power_tags      = build_tag_dict_from_column(df["Power"])


# ---------------------------
# 4) Build the TexColumns array for each column we want to show
#    Each TexColumn references the .df_key and uses the dictionary we built.
# ---------------------------
def rotate_task_label(label):
    """Wrap *label* in a 90-degree ``\\rotatebox`` so it reads bottom to top."""
    return f"\\rotatebox[origin=c]{{90}}{{{label}}}"


columns = [
    TexColumn(
        df_key="Task",
        tags=task_tags,
        name="Task",
        cwidth="2.5em",
        hook=rotate_task_label,
    ),
    TexColumn(df_key="Modality", tags=modality_tags, name="Mod.", cwidth="2.5em"),
    TexColumn(
        df_key="Application",
        tags=application_tags,
        name="Application",
        cwidth="2.5em",
    ),
    TexColumn(df_key="Dataset", tags=dataset_tags, name="Dataset", cwidth="3em"),
    TexColumn(df_key="Article", tags=article_tags, name="Ref.", cwidth="2em"),
    TexColumn(df_key="Year", tags=year_tags, name="Year", cwidth="1.5em"),
    TexColumn(df_key="Model", tags=model_tags, name="Original Name", cwidth="2.5em"),
    TexColumn(df_key="Model Core", tags=core_tags, name="Core", cwidth="2em"),
    TexColumn(df_key="Backbone", tags=backbone_tags, name="Backbone", cwidth="2em"),
    TexColumn(
        df_key="Equivalent model",
        tags=equivmodel_tags,
        name="Eq. Model",
        cwidth="2em",
    ),
    TexColumn(df_key="Board", tags=board_tags, name="Family", cwidth="2.5em"),
    TexColumn(df_key="Score", tags=score_tags, name=r"Score [\%]", cwidth="2em"),
    TexColumn(df_key="Footprint", tags=footprint_tags, name="MF[MB]", cwidth="2em"),
    TexColumn(df_key="Complexity", tags=complex_tags, name="C[OP]", cwidth="2em"),
    # TexColumn("Latency", latency_tags, "Latency [ms]", "2em"),
    # TexColumn("Throughput", through_tags, "Throughput [GOP/s]", "2em"),
    # TexColumn("Power", power_tags, "Power [W]", "2em"),
]

# ---------------------------
# 5) Create the table
# max_depth=4: only the first four column levels draw a \cmidrule after their group.
tex_table = TexTable(df, columns)
latex_text = tex_table.render("", max_depth=4)

# ---------------------------
# 6) Last minute changes
# Don't rotate Regression (it's only on one line), use a manual footnote instead
latex_text = re.sub(
    r"\\rotatebox\[origin=c\]\{90\}\{Regression\}", r"Regr.", latex_text
)

# ----- Save to file -----
latexPath.parent.mkdir(parents=True, exist_ok=True)
# Explicit UTF-8: without it the platform's default encoding is used, so the
# same notebook could write different bytes on different machines.
with open(latexPath, "w", encoding="utf-8") as f:
    f.write(latex_text)
print(f"{g}LaTeX table saved to {latexPath}{e}")

[32mLaTeX table saved to ..\data\Tables\ML-RS_latex_table.tex[0m


#### A little bit more statistics


In [None]:
# Number of datapoints per task formulation
nb_classif = len(df[df["Task"] == ("Classification")])
nb_seg_pixel = len(df[df["Task"] == ("Segmentation - Pixel")])
nb_seg_tile = len(df[df["Task"] == ("Segmentation - Tile")])
nb_obj_det = len(df[df["Task"] == ("Object detection")])

nb_total = len(df)
task_counts = [
    (nb_classif, "classification"),
    (nb_seg_pixel, "segmentation (pixel)"),
    (nb_seg_tile, "segmentation (tile)"),
    (nb_obj_det, "object detection"),
]
for count, label in task_counts:
    # An empty DataFrame reports 0.00% instead of dividing by zero.
    share = count / nb_total * 100 if nb_total else 0.0
    print(f"{count} datapoints formulated as {label} or {share:.2f}%")


# Print number of model consuming more than 5W
consumption = list(df["Power"])
# Remove empty values
consumption = [x for x in consumption if x != ""]
# Remove the ' W' suffix (and a "W" written without a space), then convert
consumption = [x.replace(" W", "") for x in consumption]
consumption = [float(x.replace("W", "")) for x in consumption]
if consumption:
    print(
        f"{len(consumption)} experiments report power consumption, average: {sum(consumption) / len(consumption):.2f}W"
    )
else:
    # No value to average: skip the division.
    print("0 experiments report power consumption")
print(f"{len([x for x in consumption if x > 5])} models consume more than 5W")

24 datapoints formulated as classsification or 36.36%
14 datapoints formulated as pixel classification or 21.21%
6 datapoints formulated as segmentation or 9.09%
21 datapoints formulated as object detection or 31.82%
48 experiments report power consumption, average: 7.08W
21 models consume more than 5W
