# Messy Notebook where I try to analyze data from Zotero


## Imports, global variables and Zotero session


In [67]:
import os
from pprint import pprint

import pandas as pd
from pyzotero import zotero
from tqdm import tqdm

library_id = "8968938"  # From: https://www.zotero.org/settings/keys
library_type = "user"  # Because I access my own library, otherwise "group"
api_key = "jTbkXBSx7Yv0GyOQU3its5Gb"

# Quick ANSI color code shortcurts
r = "\033[31m"
y = "\033[33m"
g = "\033[32m"
b = "\033[34m"
e = "\033[0m"

zot = zotero.Zotero(library_id, library_type, api_key)
print()




### Methods


In [88]:
# --- Quick lambda functions ---
get_total_in_dict_of_lists = lambda d: sum([len(d[tag]) for tag in d])
get_total_in_dict = lambda d: sum([d[tag] for tag in d])


# --- Functions ---
def fetch_articles_data(zot: zotero.Zotero, keys: list[str]):
    # Prepare a list to hold each article's data
    articles_data = []
    print(f"{y}Fetching data for {len(keys)} articles...{e}")

    for key in keys:
        # Fetch the item data for each key
        item = zot.item(key)

        # Extract the relevant information
        data = item["data"]
        title = data.get("title", "")
        doi = data.get("DOI", "")
        url = data.get("url", "")
        abstract_note = data.get("abstractNote", "")
        date = data.get("date", "")
        item_type = data.get("itemType", "")

        # Extract the list of authors
        print(f"{g}{title}{e}: {data.get('creators', [])}]")
        authors = [
            f"{author['firstName']} {author['lastName']}"
            for author in data.get("creators", [])
            if author["creatorType"] == "author"
        ]

        # Extract the list of tags
        tags = [tag["tag"] for tag in data.get("tags", [])]

        # Append the data to the articles_data list
        articles_data.append(
            {
                "Key": key,
                "Title": title,
                "Authors": authors,
                "DOI": doi,
                "URL": url,
                "Tags": tags,
                "Abstract Note": abstract_note,
                "Date": date,
                "Item Type": item_type,
            }
        )

    # Create a DataFrame from the collected data
    df = pd.DataFrame(articles_data)
    df.set_index("Key", inplace=True)

    return df

## Bonus: Playing with the API


#### Print all collections


In [69]:
# collections = zot.all_collections()
# print(len(collections), "collections in your library")
# pprint(collections)

#### Get every possible item


In [70]:
# # Zotero.everything() leverages the 100 items per request limit
# all_items = zot.everything(zot.top())
# # I have ~310 items and it takes 20s to fetch them all
# print(len(all_items), "items in your library")

#### See which possible `itemTypes` are available


In [71]:
# pprint(len(zot.item_types()))

## Access to the review collection

Key of the `"PhD - DLR"`/`"On-board AI"`/`"Review ML / FPGA / RS"`/`"Merge with already read"` collection:

- `"LWR4HAWY"`


In [72]:
# Add search paprameters to select only conference papers and journal articles
# /!\ Ideally i just want to NOT select notes and attachments, but I did not find the API syntax to do so
zot.add_parameters(itemType="conferencePaper || journalArticle")
# Fetch all the items in the library (Without the limitation of 100 items per request)
review_items = zot.everything(zot.collection_items("LWR4HAWY"))
print(len(review_items), " items in the review collection")

103  items in the review collection


### Split articles into selected and excluded


In [73]:
excluded = {}
articles_selected_for_review = []
# --- For all item ---
for item in tqdm(review_items):
    # Print the item's title and type
    # print(f"{r}{item['data']['itemType']}{e}")
    # print(f" - {b}{item['data']['title']}{e}")

    # print(f'Item Type: {item["data"]["itemType"]} | Key: {item["data"]["key"]}')
    # Get the item's tags as a list
    tags = item["data"].get("tags", [])
    is_excluded = False
    for tag in tags:
        # Weird but each tag is a dictionary with a "tag" key
        tag = tag["tag"]
        # If the tag starts with "excluded: "
        if tag.startswith("Excluded: "):
            # If the tag is not yet in the excluded dictionary
            if tag not in excluded:
                # Add the tag as a key and an empty list as the value
                excluded[tag] = []

            # Add the item's key to the list of keys for the tag
            excluded[tag].append(item["data"]["key"])
            is_excluded = True
            break
    # If the item is not excluded, add its key to the list of keys for the tag "Selected for review"
    if not is_excluded:
        articles_selected_for_review.append(item["data"]["key"])

# Verify that the total of excluded items sum up to the total number of items in the review collection
total_excluded = 0
for key in excluded:
    total_excluded += len(excluded[key])

print(
    f"{r}{total_excluded}{e} excluded items + {g}{len(articles_selected_for_review)}{e} selected for review = {b}{len(review_items)}{e} total items in the review collection"
)
assert total_excluded + len(articles_selected_for_review) == len(review_items)

100%|██████████| 103/103 [00:00<00:00, 102811.35it/s]

[31m54[0m excluded items + [32m49[0m selected for review = [34m103[0m total items in the review collection





#### Print exclusion reasons


In [74]:
# Print the excluded dictionary
print(f"Total number of items excluded: {r}{total_excluded}{e}.")
for tag, keys in excluded.items():
    print(f'{r}{len(keys):>3}{e} items excluded for: {b}"{tag[10:]}"{e}')

Total number of items excluded: [31m54[0m.
[31m  5[0m items excluded for: [34m"Not using ML"[0m
[31m  5[0m items excluded for: [34m"FPGA mentionned in Abstract but no detail"[0m
[31m  3[0m items excluded for: [34m"RS just as example application in intro"[0m
[31m 15[0m items excluded for: [34m"LiDAR for autonomous driving"[0m
[31m  9[0m items excluded for: [34m"AI for Robots in the context of space/UAV"[0m
[31m  4[0m items excluded for: [34m"LiDAR point-clouds"[0m
[31m  2[0m items excluded for: [34m"Not RS but error detection and correction onboard satellites"[0m
[31m  1[0m items excluded for: [34m"Retracted (Faulty Peer review)"[0m
[31m  1[0m items excluded for: [34m"no experiment, discussion paper"[0m
[31m  2[0m items excluded for: [34m"LiDAR for distance estimation"[0m
[31m  1[0m items excluded for: [34m"LiDAR for face recognition"[0m
[31m  1[0m items excluded for: [34m"Wrong acronym, FPGA"[0m
[31m  3[0m items excluded for: [34m"Wr

### Transform the data into a pandas `DataFrame`


In [75]:
# Fetch the data and create a DataFrame
selected_articles_df = fetch_articles_data(zot, articles_selected_for_review)
pprint(selected_articles_df)

[33mFetching data for 49 articles...[0m
[32mFPGA-Based Implementation of a CNN Architecture for the On-Board Processing of Very High-Resolution Remote Sensing Images[0m: [{'creatorType': 'author', 'firstName': 'Romén', 'lastName': 'Neris'}, {'creatorType': 'author', 'firstName': 'Adrián', 'lastName': 'Rodríguez'}, {'creatorType': 'author', 'firstName': 'Raúl', 'lastName': 'Guerra'}, {'creatorType': 'author', 'firstName': 'Sebastián', 'lastName': 'López'}, {'creatorType': 'author', 'firstName': 'Roberto', 'lastName': 'Sarmiento'}]]
[32mAn FPGA-based hardware accelerator for cnns inference on board satellites: Benchmarking with myriad 2-based solution for the CloudScout case study[0m: [{'creatorType': 'author', 'firstName': 'Emilio', 'lastName': 'Rapuano'}, {'creatorType': 'author', 'firstName': 'Gabriele', 'lastName': 'Meoni'}, {'creatorType': 'author', 'firstName': 'Tommaso', 'lastName': 'Pacini'}, {'creatorType': 'author', 'firstName': 'Gianmarco', 'lastName': 'Dinelli'}, {'crea

In [76]:
print(selected_articles_df)

                                                      Title  \
Key                                                           
VCR6UN93  FPGA-Based Implementation of a CNN Architectur...   
9IP6MAEN  An FPGA-based hardware accelerator for cnns in...   
DNLYYQI2  Onboard target detection in hyperspectral imag...   
BBSELZ8H  Hardware Acceleration and Implementation of YO...   
LUVJTIUC  Algorithm/Hardware Codesign for Real-Time On-S...   
MASCY69R  Algorithm–Hardware Co-Optimization and Deploym...   
BEWIWTSJ  FPGA-based remote target classification in hyp...   
IKLUDW2H  An extremely pipelined FPGA-based accelerator ...   
M2B4YSNE  A real-time SC¡SUP¿2¡/SUP¿S-based open-set rec...   
L7Q6CSPR  An Approach to the Implementation of a Neural ...   
P9I46S6F  Artificial Neural Networks-Based Radar Remote ...   
QJTVB2BF  FPGA Accelerated Decentralized Reinforcement L...   
DLYWKUC9  Algorithm-Hardware Co-Optimization for Energy-...   
T88KJNJA  Accelerating GNN-based SAR Automatic Target R

## Analysis of included articles


In [85]:
# For all included articles, track tags starting by "Board: ", "Task: ", Model: " and "Implementation: "
# Some items may have multiple tags, so we need to track the number of items for each tag
devices_tags = {}
tasks_tags = {}
models_tags = {}
implementations_tags = {}
modalities_tags = {}
miscellaneoustags = {}

# --- Split tags into categories ---
for key in selected_articles_df.index:
    tags = selected_articles_df.loc[key, "Tags"]
    for tag in tags:
        if tag.startswith("Board: "):
            if tag not in devices_tags:
                devices_tags[tag] = []
            devices_tags[tag].append(key)
        elif tag.startswith("Task: "):
            if tag not in tasks_tags:
                tasks_tags[tag] = []
            tasks_tags[tag].append(key)
        elif tag.startswith("Model: "):
            if tag not in models_tags:
                models_tags[tag] = []
            models_tags[tag].append(key)
        elif tag.startswith("Implementation: "):
            if tag not in implementations_tags:
                implementations_tags[tag] = []
            implementations_tags[tag].append(key)
        elif tag.startswith("Modality: "):
            if tag not in modalities_tags:
                modalities_tags[tag] = []
            modalities_tags[tag].append(key)
        else:
            if tag not in miscellaneoustags:
                miscellaneoustags[tag] = []
            miscellaneoustags[tag].append(key)

# Split devices tags into 3 further dictionnaries: Family, model and board/evaluation kit
devices_families = {}
devices_models = {}
devices_boards = {}
for tag in devices_tags:
    # Each tag is formatted like "Board: family (model) {board/evaluation kit}", sometimes there is no board/evaluation kit
    # Example: "Board: Zynq 7000 (Z7020) {PYNQ-Z1}" or "Board: Kintex US (KU115)"
    family = tag.split(" (")[0].split(": ")[1]
    model = tag.split(" (")[1].split(")")[0] if "(" in tag else None
    board = tag.split("{")[1].split("}")[0] if "{" in tag else None
    if family not in devices_families:
        devices_families[family] = 0
    devices_families[family] += len(devices_tags[tag])
    if model not in devices_models:
        devices_models[model] = 0
    devices_models[model] += len(devices_tags[tag])
    if board not in devices_boards:
        devices_boards[board] = 0
    devices_boards[board] += len(devices_tags[tag])

# --- Print the number of articles for each tag ---
print("Number of articles with the corresponding tags:")
print(
    f"{r}{get_total_in_dict_of_lists(devices_tags)}{e} boards, "
    f"{r}{get_total_in_dict_of_lists(tasks_tags)}{e} tasks, "
    f"{r}{get_total_in_dict_of_lists(models_tags)}{e} models, "
    f"{r}{get_total_in_dict_of_lists(implementations_tags)}{e} implementations and "
    f"{r}{get_total_in_dict_of_lists(modalities_tags)}{e} modalities"
)

Number of articles with the corresponding tags:
[31m50[0m boards, [31m47[0m tasks, [31m45[0m models, [31m45[0m implementations and [31m43[0m modalities


In [91]:
# Statistics about the Hardware tags
print(
    f"{r}{len(devices_tags)}{e} different device tags, in total {g}{get_total_in_dict_of_lists(devices_tags)}{e} tags:"
)
# for tag, keys in devices_tags.items():
#     print(f' - {r}{len(keys):>3}{e} items for {b}"{tag[7:]}"{e}')

print(
    f"{r}{len(devices_families)}{e} different families, in total {g}{get_total_in_dict(devices_families)}{e}:"
)
for tag, keys in devices_families.items():
    print(f' - {r}{keys:>3}{e} items for {b}"{tag}"{e}')

print(
    f"{r}{len(devices_models)}{e} different models, in total {g}{get_total_in_dict(devices_models)}{e}:"
)
for tag, keys in devices_models.items():
    print(f' - {r}{keys:>3}{e} items for {b}"{tag}"{e}')

print(
    f"{r}{len(devices_boards)}{e} different boards/evaluation kits, in total {g}{get_total_in_dict(devices_boards)}{e}:"
)
for tag, keys in devices_boards.items():
    print(f' - {r}{keys:>3}{e} items for {b}"{tag}"{e}')


print("\n\n\n")

# Statistics about Implementation means
print(
    f"{r}{len(implementations_tags)}{e} different implementations, in total {g}{get_total_in_dict_of_lists(implementations_tags)}{e}:"
)
for tag, keys in implementations_tags.items():
    print(f' - {r}{len(keys):>3}{e} items for {b}"{tag[16:]}"{e}')

# Statistics about the downstream Tasks
print(
    f"{r}{len(tasks_tags)}{e} different tasks, in total {g}{get_total_in_dict_of_lists(tasks_tags)}{e}:"
)
for tag, keys in tasks_tags.items():
    print(f' - {r}{len(keys):>3}{e} items for {b}"{tag[6:]}"{e}')

# Statistics about the Models
print(
    f"{r}{len(models_tags)}{e} different models, in total {g}{get_total_in_dict_of_lists(models_tags)}{e}:"
)
for tag, keys in models_tags.items():
    print(f' - {r}{len(keys):>3}{e} items for {b}"{tag[7:]}"{e}')

# Statistics about the Modalities
print(
    f"{r}{len(modalities_tags)}{e} different modalities tags, in total {g}{get_total_in_dict_of_lists(modalities_tags)}{e}:"
)
for tag, keys in modalities_tags.items():
    print(f' - {r}{len(keys):>3}{e} items for {b}"{tag}"{e}')

[31m30[0m different device tags, in total [32m50[0m tags:
[31m15[0m different families, in total [32m50[0m:
 - [31m  1[0m items for [34m"Kintex US"[0m
 - [31m  1[0m items for [34m"Kintex US rad-hard"[0m
 - [31m 12[0m items for [34m"Zynq US+"[0m
 - [31m  7[0m items for [34m"Virtex-7"[0m
 - [31m 16[0m items for [34m"Zynq 7000"[0m
 - [31m  2[0m items for [34m"Artix-7"[0m
 - [31m  1[0m items for [34m"Alveo U280"[0m
 - [31m  2[0m items for [34m"Cyclone V"[0m
 - [31m  2[0m items for [34m"Virtex-6"[0m
 - [31m  1[0m items for [34m"Spartan-3A"[0m
 - [31m  1[0m items for [34m"Kintex-7"[0m
 - [31m  1[0m items for [34m"ASIC"[0m
 - [31m  1[0m items for [34m"Virtex US"[0m
 - [31m  1[0m items for [34m"Spartan 6"[0m
 - [31m  1[0m items for [34m"N/A"[0m
[31m19[0m different models, in total [32m50[0m:
 - [31m  1[0m items for [34m"XCKU040"[0m
 - [31m  1[0m items for [34m"XQRKU060"[0m
 - [31m  6[0m items for [34m"ZU7EV"[0m

In [38]:
pprint(zot.item(articles_selected_for_review[0]))
pprint(zot.item(articles_selected_for_review[1]))

{'data': {'DOI': '10.1109/TIE.2017.2708028',
          'ISSN': '1557-9948',
          'abstractNote': 'The Fuzzy ARTMAP is a supervised learning method, '
                          'providing high accuracy in many classifications. In '
                          'this paper, we explore the role of hardware '
                          'accelerators in remote sensing classification '
                          'missions. We focus on the designing and '
                          'implementing a massively parallel hardware '
                          'architecture on a field-programmable gate array '
                          "(FPGA) of the performance phase's algorithm. The "
                          'implementation is mapped on Xilinx Virtex 6 '
                          'XC6VLX240T FPGA chip for an embedded system using '
                          'Xilinx ISE 14.5 software. Embedded blocks dedicated '
                          'to digital signal processing (DSP) and blocks '
            