# Messy Notebook where I try to analyze data from Zotero


## Imports, global variables and Zotero session


In [1]:
import os
from pprint import pprint

import pandas as pd
from pyzotero import zotero
from tqdm import tqdm

group_library_id = "5602981"
user_library_id = "8968938"  # From: https://www.zotero.org/settings/keys
library_type = "group"  # To access the shared library, otherwise for private "user"

# The API key is a secret: it is read from the environment instead of being
# committed with the notebook. A key that was ever hard-coded in this file should
# be revoked at https://www.zotero.org/settings/keys and replaced by a new one.
api_key = os.environ.get("ZOTERO_API_KEY")
if not api_key:
    raise RuntimeError(
        "Zotero API key not found: set the ZOTERO_API_KEY environment variable "
        "before running this notebook."
    )

# Quick ANSI color code shortcuts (used inside the f-strings of the cells below)
r = "\033[31m"  # red
y = "\033[33m"  # yellow
g = "\033[32m"  # green
b = "\033[34m"  # blue
e = "\033[0m"  # reset to the default color

# Client bound to the group library; every cell below goes through it
zot = zotero.Zotero(group_library_id, library_type, api_key)
print(zot)

<pyzotero.zotero.Zotero object at 0x0000027E1931B5E0>


### Methods


In [2]:
# --- Quick helper functions ---
def get_total_in_dict_of_lists(d):
    """Return the total number of elements over all the values of ``d``.

    Args:
        d: Dictionary whose values are sized containers (lists of keys here).

    Returns:
        The sum of ``len(value)`` over every value, 0 for an empty dictionary.
    """
    return sum(len(values) for values in d.values())


def get_total_in_dict(d):
    """Return the sum of all the values of ``d``.

    Args:
        d: Dictionary whose values are numbers (occurrence counts here).

    Returns:
        The sum of the values, 0 for an empty dictionary.
    """
    return sum(d.values())


def parse_string_to_dict(input_string):
    """Turn a block of "key: value" lines into a dictionary.

    The text is cut on "\n". A line contributes an entry only when it contains
    ": "; the text before the first ": " is the key, the rest is the value, and
    both are stripped of surrounding whitespace. When a key appears several
    times, the last occurrence wins.
    """
    parsed = {}
    for raw_line in input_string.split("\n"):
        name, separator, remainder = raw_line.partition(": ")
        if not separator:
            # No ": " on this line: nothing to record
            continue
        parsed[name.strip()] = remainder.strip()
    return parsed


# --- Functions ---
def fetch_articles_data(zot: zotero.Zotero, keys: list[str]):
    """Fetch Zotero items by key and gather their metadata in a DataFrame.

    Args:
        zot: Zotero client; ``zot.item(key)`` is called once per key.
        keys: Zotero item keys to fetch. May be empty.

    Returns:
        A DataFrame indexed by "BBT Citation Key" with the columns "Title",
        "Authors" (list of names), "DOI", "URL", "Tags" (list of tag labels),
        "Abstract Note", "Date", "Item Type" and "Zotero Key". For an empty
        ``keys`` the frame is empty but keeps this index and these columns.

    Notes:
        The citation key is read from the "Citation Key" line of the item's
        "extra" field. When that line is missing, the Zotero key is used as the
        index value instead of raising ``KeyError``.
    """
    columns = [
        "BBT Citation Key",
        "Title",
        "Authors",
        "DOI",
        "URL",
        "Tags",
        "Abstract Note",
        "Date",
        "Item Type",
        "Zotero Key",
    ]

    # One dictionary per article, turned into a DataFrame in a single step at the end
    articles_data = []
    print(f"{y}Fetching data for {len(keys)} articles...{e}")

    for key in keys:
        # Fetch the item data for each key
        data = zot.item(key)["data"]

        title = data.get("title", "")
        creators = data.get("creators", [])
        extra = parse_string_to_dict(data.get("extra") or "")

        print(f"{g}{title}{e}: {creators}")

        # Keep only the authors. A creator is either split into
        # firstName/lastName or stored as a single "name" field, so neither
        # form may be assumed to be present.
        authors = []
        for creator in creators:
            if creator.get("creatorType") != "author":
                continue
            full_name = " ".join(
                part
                for part in (creator.get("firstName"), creator.get("lastName"))
                if part
            )
            authors.append(full_name or creator.get("name", ""))

        # Each tag is a dictionary holding its label under "tag"
        tags = [tag["tag"] for tag in data.get("tags", [])]

        articles_data.append(
            {
                # Fall back to the Zotero key so the index value stays unique
                "BBT Citation Key": extra.get("Citation Key", key),
                "Title": title,
                "Authors": authors,
                "DOI": data.get("DOI", ""),
                "URL": data.get("url", ""),
                "Tags": tags,
                "Abstract Note": data.get("abstractNote", ""),
                "Date": data.get("date", ""),
                "Item Type": data.get("itemType", ""),
                "Zotero Key": key,
            }
        )

    # Passing the columns explicitly keeps set_index() valid when nothing was fetched
    df = pd.DataFrame(articles_data, columns=columns)
    return df.set_index("BBT Citation Key")

## Bonus: Playing with the API


#### Print all collections


In [3]:
# collections = zot.all_collections()
# print(len(collections), "collections in your library")
# pprint(collections)

9 collections in your library
[{'data': {'key': 'QNG6R7BP',
           'name': 'Preliminary study',
           'parentCollection': False,
           'relations': {},
           'version': 156},
  'key': 'QNG6R7BP',
  'library': {'id': 5602981,
              'links': {'alternate': {'href': 'https://www.zotero.org/groups/5602981',
                                      'type': 'text/html'}},
              'name': 'Review ML - RS - FPGA',
              'type': 'group'},
  'links': {'alternate': {'href': 'https://www.zotero.org/groups/5602981/collections/QNG6R7BP',
                          'type': 'text/html'},
            'self': {'href': 'https://api.zotero.org/groups/5602981/collections/QNG6R7BP',
                     'type': 'application/json'}},
  'meta': {'numCollections': 0, 'numItems': 22},
  'version': 156},
 {'data': {'key': '4BTDBDQA',
           'name': 'Private libraries',
           'parentCollection': False,
           'relations': {},
           'version': 126},
  'key': '4

#### Get every possible item


In [5]:
# # Zotero.everything() leverages the 100 items per request limit
# all_items = zot.everything(zot.top())
# # I have ~310 items and it takes 20s to fetch them all
# print(len(all_items), "items in your library")

137 items in your library


#### See which possible `itemTypes` are available


In [3]:
# pprint(len(zot.item_types()))

38


## Access to the review collection

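Keys of the collections referenced in this section:

- `"LWR4HAWY"`: private user collection `"PhD - DLR"`/`"On-board AI"`/`"Review ML / FPGA / RS"`/`"Merge with already read"`
- `"PEWYQYGG"`: "included in review" collection of the group library (the one fetched in the cell below)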


In [4]:
# Add search parameters to select only conference papers and journal articles
# /!\ Ideally I just want to NOT select notes and attachments, but I did not find the API syntax to do so
# NOTE(review): the Zotero Web API search syntax documents a negated form such as
# `itemType=-attachment`; confirm whether it can also exclude notes before switching.
zot.add_parameters(itemType="conferencePaper || journalArticle")
# Fetch all the items of the collection (without the limitation of 100 items per request)
# Key of the group "included in review" collection: PEWYQYGG, and key of my private user collection: LWR4HAWY
review_items = zot.everything(zot.collection_items("PEWYQYGG"))
# A single formatted string: print(count, " items ...") produced a double space
print(f"{len(review_items)} items in the review collection")

41  items in the review collection


### Sort articles into selected and excluded


In [5]:
# `excluded` maps each "Excluded: ..." tag to the keys of the items carrying it;
# `articles_selected_for_review` collects the keys of all the other items.
excluded = {}
articles_selected_for_review = []

for item in tqdm(review_items):
    item_data = item["data"]
    # Each tag is a dictionary; its label is stored under "tag"
    labels = (entry["tag"] for entry in item_data.get("tags", []))
    # Only the first "Excluded: " tag of an item is taken into account
    exclusion_tag = next(
        (label for label in labels if label.startswith("Excluded: ")), None
    )
    if exclusion_tag is None:
        articles_selected_for_review.append(item_data["key"])
    else:
        excluded.setdefault(exclusion_tag, []).append(item_data["key"])

# Sanity check: excluded + selected must add up to the size of the collection
total_excluded = sum(len(item_keys) for item_keys in excluded.values())

print(
    f"{r}{total_excluded}{e} excluded items + {g}{len(articles_selected_for_review)}{e} selected for review = {b}{len(review_items)}{e} total items in the review collection"
)
assert total_excluded + len(articles_selected_for_review) == len(review_items)

100%|██████████| 41/41 [00:00<?, ?it/s]

[31m0[0m excluded items + [32m41[0m selected for review = [34m41[0m total items in the review collection





#### Print exclusion reasons


In [6]:
# Summary of the exclusion reasons: one line per "Excluded: " tag
print(f"Total number of items excluded: {r}{total_excluded}{e}.")
prefix_length = len("Excluded: ")  # the prefix is dropped from the printed reason
for exclusion_tag, item_keys in excluded.items():
    reason = exclusion_tag[prefix_length:]
    print(f'{r}{len(item_keys):>3}{e} items excluded for: {b}"{reason}"{e}')

Total number of items excluded: [31m0[0m.


### Transform the data into a pandas `DataFrame`


In [7]:
# Download the metadata of every selected article into a DataFrame, then show it
selected_articles_df = fetch_articles_data(zot=zot, keys=articles_selected_for_review)
pprint(selected_articles_df)

[33mFetching data for 41 articles...[0m
[32mParallelization of Fuzzy ARTMAP Architecture on FPGA: Multispectral Classification of ALSAT-2A Images[0m: [{'creatorType': 'author', 'firstName': 'Réda', 'lastName': 'Yahiaoui'}, {'creatorType': 'author', 'firstName': 'Farid', 'lastName': 'Alilat'}, {'creatorType': 'author', 'firstName': 'Saliha', 'lastName': 'Loumi'}]]
[32mLow-power neural networks for semantic segmentation of satellite images[0m: [{'creatorType': 'author', 'firstName': 'Gaetan', 'lastName': 'Bahl'}, {'creatorType': 'author', 'firstName': 'Lionel', 'lastName': 'Daniel'}, {'creatorType': 'author', 'firstName': 'Matthieu', 'lastName': 'Moretti'}, {'creatorType': 'author', 'firstName': 'Florent', 'lastName': 'Lafarge'}]]
[32mShip classification from SAR images based on deep learning[0m: [{'creatorType': 'author', 'firstName': 'Shintaro', 'lastName': 'Hashimoto'}, {'creatorType': 'author', 'firstName': 'Yohei', 'lastName': 'Sugimoto'}, {'creatorType': 'author', 'firstNam

In [28]:
# Inspect the raw "extra" field of one item, together with its parsed form
test = zot.item("6WLFZYDZ")["data"]["extra"]
# `test` is a string: iterating over it yields single characters, which cannot be
# unpacked into (key, value) pairs. Parse its "key: value" lines instead.
test_dict = parse_string_to_dict(test)
print(test)

Number: 282
Type: Article
tex.affiliation: Liu, WC (Corresponding Author), Tsinghua Univ, Dept Comp Sci & Technol, Beijing 100084, Peoples R China. Zhang, Ning; Wei, Xin; Chen, He, Beijing Inst Technol, Beijing Key Lab Embedded Real Time Informat Proc, Beijing 100081, Peoples R China. Liu, Wenchao, Tsinghua Univ, Dept Comp Sci & Technol, Beijing 100084, Peoples R China.
tex.author-email: 3120205375@bit.edu.cn weixin@bit.edu.cn chenhe@bitedu.cn liuwenchao@mail.tsinghua.edu.cn
tex.da: 2024-05-06
tex.eissn: 2079-9292
tex.times-cited: 39
tex.unique-id: WOS:000615007400001
Citation Key: zhangFPGAImplementationCNNbased2021a


In [8]:
# Display the DataFrame of selected articles (indexed by "BBT Citation Key") again
pprint(selected_articles_df)

                                                                                                Title  \
BBT Citation Key                                                                                        
yahiaouiParallelizationFuzzyARTMAP2017a             Parallelization of Fuzzy ARTMAP Architecture o...   
bahlLowpowerNeuralNetworks2019a                     Low-power neural networks for semantic segment...   
hashimotoShipClassificationSAR2019a                 Ship classification from SAR images based on d...   
fraczekEmbeddedVisionSystem2018                     Embedded Vision System for Automated Drone Lan...   
pitsisEfficientConvolutionalNeural2019a             Efficient convolutional neural network weight ...   
liEfficientObjectDetection2019a                     Efficient object detection framework and hardw...   
matos-carvalhoStaticDynamicAlgorithms2019           Static and Dynamic Algorithms for Terrain Clas...   
weiFPGABasedHybridTypeImplementation2019            FPG

## Analysis of included articles


### Analyze the Hardware

Which FPGA family, specific model and evaluation boards are used?


In [17]:
# NOTE: `devices_tags`, `devices_families`, `devices_models` and `devices_boards`
# are built by the tag-splitting cell that appears further down in this notebook;
# that cell has to be executed before this one.

# Raw "Board: " tags with the number of articles carrying each of them
print(
    f"{r}{len(devices_tags)}{e} different device tags, in total {g}{get_total_in_dict_of_lists(devices_tags)}{e} tags:"
)
board_prefix_length = len("Board: ")  # dropped from the printed label
for device_tag, article_keys in devices_tags.items():
    print(
        f' - {r}{len(article_keys):>3}{e} items for {b}"{device_tag[board_prefix_length:]}"{e}'
    )

# Leave out articles that have "Board: ???" or "Board: N/A"
# @TODO: Create a new dataframe per "datapoint", i.e., per experiment, so papers with several models or board will get as many datapoints

# The three breakdowns share one layout: a header line, then one line per entry.
# Each dictionary maps a name to an occurrence count.
breakdowns = (
    (
        f"{r}{len(devices_families)}{e} different families, in total {g}{get_total_in_dict(devices_families)}{e}:",
        devices_families,
    ),
    (
        f"{r}{len(devices_models)}{e} different models, in total {g}{get_total_in_dict(devices_models)}{e}:",
        devices_models,
    ),
    (
        f"{r}{len(devices_boards)}{e} different boards/evaluation kits, in total {g}{get_total_in_dict(devices_boards)}{e}:",
        devices_boards,
    ),
)
for header, counts in breakdowns:
    print(header)
    for name, count in counts.items():
        print(f' - {r}{count:>3}{e} items for {b}"{name}"{e}')

[31m33[0m different device tags, in total [32m60[0m tags:
 - [31m  1[0m items for [34m"Virtex-6"[0m
 - [31m  1[0m items for [34m"Kintex US rad-hard (XQRKU060)"[0m
 - [31m  1[0m items for [34m"Zynq US+ (ZU7EV) {ZCU106}"[0m
 - [31m  1[0m items for [34m"Zynq 7000 (Z7020) {PYNQ-Z2}"[0m
 - [31m  1[0m items for [34m"Zynq US+ (ZU3EG) {UltraZed-EG}"[0m
 - [31m  3[0m items for [34m"Virtex-7 (VX690T)"[0m
 - [31m  1[0m items for [34m"Zynq US+ (ZU3EG) {OVC3}"[0m
 - [31m  1[0m items for [34m"Zynq US+ (ZU9EG)"[0m
 - [31m  3[0m items for [34m"Zynq 7000 (Z7020)"[0m
 - [31m  3[0m items for [34m"Virtex-7 (VX690T) {VC709}"[0m
 - [31m  2[0m items for [34m"Zynq 7000 (Z7020) {Zedboard}"[0m
 - [31m  1[0m items for [34m"Artix-7 (XC7A35T) {Arty-35T}"[0m
 - [31m  1[0m items for [34m"Zynq 7000 (Z7035)"[0m
 - [31m  3[0m items for [34m"Zynq 7000 (Z7020) {PYNQ-Z1}"[0m
 - [31m  1[0m items for [34m"Kintex US (XCKU040) {KCU105}"[0m
 - [31m  5[0m items f

In [None]:
# For all included articles, track the tags starting with "Board: ", "Task: ",
# "Model: ", "Implementation: " and "Modality: "; any other tag is miscellaneous.
# An item may carry several tags, so each dictionary maps a tag to the list of
# index labels (citation keys) of the articles carrying it.
devices_tags = {}
tasks_tags = {}
models_tags = {}
implementations_tags = {}
modalities_tags = {}
miscellaneoustags = {}

# Tag prefix -> dictionary collecting the tags of that category
tags_by_prefix = {
    "Board: ": devices_tags,
    "Task: ": tasks_tags,
    "Model: ": models_tags,
    "Implementation: ": implementations_tags,
    "Modality: ": modalities_tags,
}


def _split_board_tag(tag):
    """Split a "Board: family (model) {board}" tag into (family, model, board).

    The "(model)" and "{board}" parts are both optional and are returned as
    None when absent. The "{board}" part is removed first, so a tag without a
    model still yields a clean family name, and parentheses are matched without
    requiring a space in front of them.
    Example: "Board: Zynq 7000 (Z7020) {PYNQ-Z1}" -> ("Zynq 7000", "Z7020", "PYNQ-Z1").
    """
    description = tag.split(": ", 1)[1]
    board = None
    if "{" in description:
        description, _, after_brace = description.partition("{")
        board = after_brace.split("}")[0]
    model = None
    if "(" in description:
        description, _, after_parenthesis = description.partition("(")
        model = after_parenthesis.split(")")[0]
    return description.strip(), model, board


# --- Split tags into categories ---
# Iterating over the column yields (index label, tag list) pairs row by row,
# which also behaves correctly if an index label appears more than once.
for key, tags in selected_articles_df["Tags"].items():
    for tag in tags:
        bucket = next(
            (
                target
                for prefix, target in tags_by_prefix.items()
                if tag.startswith(prefix)
            ),
            miscellaneoustags,
        )
        bucket.setdefault(tag, []).append(key)

# Split the device tags into 3 further dictionaries: family, model and
# board/evaluation kit, each mapping a name to its number of occurrences.
# A missing model or board is counted under the key None.
devices_families = {}
devices_models = {}
devices_boards = {}
for tag, article_keys in devices_tags.items():
    family, model, board = _split_board_tag(tag)
    occurrences = len(article_keys)
    devices_families[family] = devices_families.get(family, 0) + occurrences
    devices_models[model] = devices_models.get(model, 0) + occurrences
    devices_boards[board] = devices_boards.get(board, 0) + occurrences

# --- Print the number of articles for each tag ---
print("Number of articles with the corresponding tags:")
print(
    f"{r}{get_total_in_dict_of_lists(devices_tags)}{e} boards, "
    f"{r}{get_total_in_dict_of_lists(tasks_tags)}{e} tasks, "
    f"{r}{get_total_in_dict_of_lists(models_tags)}{e} models, "
    f"{r}{get_total_in_dict_of_lists(implementations_tags)}{e} implementations and "
    f"{r}{get_total_in_dict_of_lists(modalities_tags)}{e} modalities"
)

Number of articles with the corresponding tags:
[31m60[0m boards, [31m58[0m tasks, [31m72[0m models, [31m57[0m implementations and [31m53[0m modalities


In [None]:
print("\n\n\n")

# One section per tag category: a header line, then one line per tag with the
# number of articles carrying it. The third element is the number of leading
# characters dropped from the printed label (the length of the tag prefix).
# NOTE(review): modality tags are printed with their "Modality: " prefix, unlike
# the other categories; confirm whether that is intended.
sections = (
    # Statistics about Implementation means
    (
        f"{r}{len(implementations_tags)}{e} different implementations, in total {g}{get_total_in_dict_of_lists(implementations_tags)}{e}:",
        implementations_tags,
        len("Implementation: "),
    ),
    # Statistics about the downstream Tasks
    (
        f"{r}{len(tasks_tags)}{e} different tasks, in total {g}{get_total_in_dict_of_lists(tasks_tags)}{e}:",
        tasks_tags,
        len("Task: "),
    ),
    # Statistics about the Models
    (
        f"{r}{len(models_tags)}{e} different models, in total {g}{get_total_in_dict_of_lists(models_tags)}{e}:",
        models_tags,
        len("Model: "),
    ),
    # Statistics about the Modalities
    (
        f"{r}{len(modalities_tags)}{e} different modalities tags, in total {g}{get_total_in_dict_of_lists(modalities_tags)}{e}:",
        modalities_tags,
        0,
    ),
)
for header, tagged_articles, label_start in sections:
    print(header)
    for label, article_keys in tagged_articles.items():
        print(f' - {r}{len(article_keys):>3}{e} items for {b}"{label[label_start:]}"{e}')

In [38]:
# Dump the complete raw records of the first two selected articles
for position in (0, 1):
    pprint(zot.item(articles_selected_for_review[position]))

{'data': {'DOI': '10.1109/TIE.2017.2708028',
          'ISSN': '1557-9948',
          'abstractNote': 'The Fuzzy ARTMAP is a supervised learning method, '
                          'providing high accuracy in many classifications. In '
                          'this paper, we explore the role of hardware '
                          'accelerators in remote sensing classification '
                          'missions. We focus on the designing and '
                          'implementing a massively parallel hardware '
                          'architecture on a field-programmable gate array '
                          "(FPGA) of the performance phase's algorithm. The "
                          'implementation is mapped on Xilinx Virtex 6 '
                          'XC6VLX240T FPGA chip for an embedded system using '
                          'Xilinx ISE 14.5 software. Embedded blocks dedicated '
                          'to digital signal processing (DSP) and blocks '
            