# Counting results

In [None]:
import pandas as pd
from matplotlib_venn import venn3
from matplotlib import pyplot as plt

## Table in appendix

In [None]:
# Load the deduplicated journal table and preview its first rows.
journals = pd.read_csv("../data/processed/journals_deduplicated.csv")
journals.head()

In [None]:
# Appendix table: one row per journal property, with the number of journals that have it.
row_labels = [
    "Taxonomic journals with title",
    "Journals with Wikidata ID",
    "Journals with ISSN-L",
    "Journals with IPNI Publication ID in Wikidata",
    "Journals with ZooBank Publication ID in Wikidata",
    "Journals with OpenAlex ID",
    # union of OpenAlex ID and IPNI etc
]

# Counts are stored as strings; every row starts at "0" until it is filled in.
table = pd.DataFrame({"name": row_labels, "value": ["0"] * len(row_labels)})

# Row 0 counts titles; rows 1-5 count the second through sixth columns of
# `journals`, taken in file order (the labels above assume that order).
for position, column in enumerate(["title", *journals.columns[1:6]]):
    # A missing cell (NaN) is not equal to itself, so `cell == cell` skips it.
    table.loc[position, "value"] = str(sum(1 for cell in journals[column] if cell == cell))

#table = table.set_index("name")
table # ctrl + click to select cells to copy

## 1. Journals

Taxonomic journals:
- with title
- found on Wikidata (each method)
- found on OpenAlex
- total
- still publishing in the last 10 years
- dissolved (confirmed)
- with ISSN-L
- with IPNI publication ID
- with ZooBank publication ID
- with OpenAlex ID

In [None]:
# Total number of journal rows, then how many of them have a non-missing title.
print(f"Number of journals: {len(journals)}")
# NaN is not equal to itself, so the comparison drops missing titles.
known_titles = [title for title in journals["title"] if title == title]
print(f"Number of journals with known title: {len(known_titles)}")

In [None]:
# For the second through sixth columns of `journals`, report how many rows
# hold a non-missing value (NaN fails the self-equality test).
for column in journals.columns[1:6]:
    filled = sum(1 for value in journals[column] if value == value)
    print(f"Number of journals with {column}: {filled}")

In [None]:
# Frequency of each distinct value in the "dissolved" column
# (value_counts() leaves out missing values by default).
print("Journals that were dissolved (confirmed): ")
journals["dissolved"].value_counts()

In [None]:
# Load the pickled, filtered article table from the interim data folder and preview it.
articles = pd.read_pickle("../data/interim/filtered_articles.pkl")
articles.head()

In [None]:
# Distinct `source_id` values among the loaded articles.
distinct_sources = set(articles["source_id"])
print(f"Number of journals that contained relevant (i.e. taxonomic) articles: {len(distinct_sources)}")

In [None]:
# Venn diagram
ipnizoo = set(journals[journals["source"]=="IPNI or ZooBank ID"]["title"])
openalex = set(journals[journals["source"]=="OpenAlex taxonomy concept"]["title"])
wikisubjects = set(journals[journals["source"]=="Wikidata taxonomic subject"]["title"])

venn3([ipnizoo, openalex, wikisubjects], ("IPNI/ZooBank ID", "OpenAlex", "Wikidata subjects"))
plt.title("Unique journals per source", fontsize=15)

## 2. Articles

- total number
- number with at least one European author

In [None]:
# Distinct article ids in the loaded table.
distinct_article_ids = set(articles["id"])
print(f"Number of articles related to taxonomy: {len(distinct_article_ids)}")

In [None]:
# NOTE(review): this is the same len(set(articles["id"])) expression that is used
# for the total article count, so both figures are always equal. No filter on
# author country is applied here - TODO confirm whether `articles` is already
# restricted to European authors or whether a filter is missing.
print("Number of articles related to taxonomy with at least one European author: " +\
      str(len(set(articles["id"]))))

## 3. Authors

In [None]:
# Load the tab-separated author table; low_memory=False makes pandas read the
# file in one pass instead of chunk-wise dtype inference.
authors = pd.read_csv("../data/processed/country_taxonomic_authors_no_duplicates.tsv", sep="\t",low_memory=False)

In [None]:
# Row count of the author table loaded from the TSV file.
print(f"Number of European authors before disambiguation: {len(authors)}")

In [None]:
# Load the pickled table of disambiguated authors.
disamb_authors = pd.read_pickle("../data/processed/authors_disambiguated_truncated.pkl")

In [None]:
# Row count of the disambiguated author table.
print(f"Number of European authors after disambiguation: {len(disamb_authors)}")

In [None]:
# Distinct `inst_id` values among the disambiguated authors.
institution_ids = set(disamb_authors["inst_id"])
print(f"Number of European institutions publishing taxonomic articles: {len(institution_ids)}")

In [None]:
# Number of distinct institution display names among author rows whose
# institution country code is "BE".
len(set(authors[authors["inst_country_code"]=="BE"]["inst_display_name"]))

In [None]:
# Inspect the column names of the author table.
authors.columns

## 4. Taxonomy (not working)

Total number of families within Plantae:
Total number of families within Animalia:
Total number of families within Fungi:
Total number of families within Protists:
… See a similar figure here: https://data-blog.gbif.org/post/gbif-backbone-taxonomy/

In [None]:
# supply and demand order save?
# supply and demand order save?
# Reads the same pickle file that `disamb_authors` is loaded from.
authors_tree = pd.read_pickle("../data/processed/authors_disambiguated_truncated.pkl")

In [None]:
# Load the pickled author table from the interim data folder.
authors2 = pd.read_pickle("../data/interim/country_taxonomic_authors_no_duplicates.pkl")

In [None]:
# Preview the first rows of the interim author table.
authors2.head()

In [None]:
# Preview the first rows of the disambiguated author table.
authors_tree.head()

In [None]:
"""
def get_number_families(kingdom_name):
    kingdom = []

    for author in authors_tree.itertuples():
        if kingdom_name in author.kingdoms_subjects:
            kingdom.append(author)
            
    kingdom_df = pd.DataFrame(kingdom)
    
    families = []
    
    for author in kingdom_df.itertuples():
        for family in author.families_subjects:
            families.append(family)
            
    return len(set(families))
"""

In [None]:
"""
print("Number of plant families in data: " + str(get_number_families("Plantae")))
print("Number of animal families in data: " + str(get_number_families("Animalia")))
print("Number of fungal families in data: " + str(get_number_families("Fungi")))
# no protists in data
"""

## 5. Most prolific authors

In [None]:
# The ten author ids that occur most often in the author table
# (value_counts() sorts by descending frequency).
author_frequencies = authors["author_id"].value_counts()
top_authors = author_frequencies.head(10)

In [None]:
# Mean number of rows per distinct author id (presumably one row per article - verify).
rows_per_author = authors["author_id"].value_counts()
sum(rows_per_author) / len(rows_per_author)

In [None]:
# Print the display name (taken from the author's first row) and the row count
# for each of the most frequent author ids.
for author_id, n_rows in top_authors.items():
    first_row = authors.loc[authors["author_id"] == author_id].iloc[0]
    print(first_row["author_display_name"] + f" has written {n_rows} articles")

In [None]:
# Histogram of how often each author id occurs, with the x-axis limited to 1-30.
plt.xlim(1,30)
plt.hist(authors["author_id"].value_counts(), bins=150)

## 6. ORCID and Open Access percentages (not working)

In [None]:
# total percentage of authors with ORCID
disamb_authors["author_orcid"]

In [None]:
# Share of disambiguated authors whose `author_orcid` value is an ORCID URL.
orcid = 0
no_orcid = 0

for x in disamb_authors["author_orcid"]:
    # Missing values (NaN/None) and any other non-string entry count as "no ORCID".
    # An earlier experiment that also looked inside list/tuple/set values did not
    # change the result, so only plain strings are inspected.
    if isinstance(x, str) and x.startswith("https://orcid.org/"):
        orcid += 1
    else:
        no_orcid += 1

total_authors = orcid + no_orcid
if total_authors:
    print("Percentage of authors with an ORCID: " + str(orcid/total_authors*100))
else:
    # Avoid a ZeroDivisionError when the author table is empty.
    print("No authors found; cannot compute the percentage of authors with an ORCID.")

In [None]:
# NOTE: rebinds `articles` (earlier loaded from ../data/interim/filtered_articles.pkl)
# to a different file; the cells below use this table.
articles = pd.read_pickle("../data/processed/taxonomic_articles_with_subjects.pkl")

In [None]:
# Percentage of articles whose `oa_is_oa` flag is true.
vc = articles["oa_is_oa"].value_counts()

# value_counts() orders its result by frequency, so positions 0 and 1 do not
# reliably correspond to True and False. Count the flag values directly instead;
# rows with a missing flag are left out, as value_counts() does.
is_open = articles["oa_is_oa"].dropna().astype(bool)
open_access_count = int(is_open.sum())
non_open_access_count = len(is_open) - open_access_count

if open_access_count + non_open_access_count > 0:
    # Calculate the percentage
    percentage_oa = (open_access_count / (open_access_count + non_open_access_count)) * 100

    # Print the result
    print(f"Percentage of European articles that are Open Access: {percentage_oa}")
else:
    # No usable flags at all: avoid dividing by zero.
    print("Not enough data to calculate Open Access percentage.")


In [None]:
# Percentage of articles whose `oa_is_oa` flag is true.
vc = articles["oa_is_oa"].value_counts()

# value_counts() orders its result by frequency, so positions 0 and 1 do not
# reliably correspond to True and False. Count the flag values directly instead;
# rows with a missing flag are left out, as value_counts() does.
is_open = articles["oa_is_oa"].dropna().astype(bool)
open_access_count = int(is_open.sum())
non_open_access_count = len(is_open) - open_access_count

# A single distinct value (all open or all closed) is still a valid 100% / 0%
# result; only a table without any usable flag cannot be summarised.
if open_access_count + non_open_access_count > 0:
    # Calculate the percentage
    percentage_oa = (open_access_count / (open_access_count + non_open_access_count)) * 100

    # Print the result
    print(f"Percentage of European articles that are Open Access: {percentage_oa}")
else:
    print("Not enough data to calculate Open Access percentage.")


In [None]:
# Inspect the column names of the article table.
articles.columns

In [None]:
def species_to_tree(df, backbone):
    """Add the higher taxonomy of each row's known species to ``df``.

    ``backbone`` must have a ``canonicalName`` column. The last six fields of
    a backbone row are read positionally as kingdom, phylum, class, order,
    family and genus (genus last); the column names are not checked.

    ``df`` must have a ``species_subject`` column holding an iterable of
    species names per row. Names absent from the backbone are ignored.

    Seven columns are written to ``df`` in place: ``genera_subjects``,
    ``families_subjects``, ``orders_subjects``, ``classes_subjects``,
    ``phyla_subjects`` and ``kingdoms_subjects`` (a set per row) plus
    ``lineages_subjects`` (a list of six-item lists per row). The same
    ``df`` object is returned.
    """
    # Canonical name -> backbone record without the index and the first column.
    # When a name occurs more than once, the first record is kept.
    taxonomy_by_name = {}
    for record in backbone.itertuples():
        taxonomy_by_name.setdefault(record.canonicalName, list(record)[2:])

    # Output column paired with the position, counted from the end of a
    # backbone record, of the rank it collects.
    rank_offsets = (
        ("genera_subjects", -1),
        ("families_subjects", -2),
        ("orders_subjects", -3),
        ("classes_subjects", -4),
        ("phyla_subjects", -5),
        ("kingdoms_subjects", -6),
    )
    collected = {column: [] for column, _ in rank_offsets}
    all_lineages = []

    for row in df.itertuples():
        known = [
            taxonomy_by_name[name]
            for name in row.species_subject
            if name in taxonomy_by_name
        ]

        for column, offset in rank_offsets:
            collected[column].append({record[offset] for record in known})
        all_lineages.append([record[-6:] for record in known])

    for column, _ in rank_offsets:
        df[column] = collected[column]
    df["lineages_subjects"] = all_lineages

    return df

In [None]:
def preprocess_backbone(path="../data/external/backbone/Taxon.tsv", no_blanks=False):
    """Load the GBIF taxonomic backbone and keep Animalia, Plantae and Fungi rows.

    Parameters
    ----------
    path : str
        Location of the tab-separated backbone file.
    no_blanks : bool
        When True, also drop every row that lacks a genus, family, order,
        class, phylum or kingdom.

    Returns
    -------
    pandas.DataFrame
        De-duplicated rows with a fresh 0..n-1 index. Rows without a
        canonical name are dropped; no filter on taxonomic status is applied,
        so non-accepted names (synonyms etc.) are kept.

    Raises
    ------
    RuntimeError
        If the file cannot be read; the original exception is chained.
    """
    # GBIF taxonomic backbone
    try:
        backbone = pd.read_csv(
            path,
            sep="\t",
            on_bad_lines="skip",
            low_memory=False,  # read the file in one pass rather than in chunks
            dtype=str,  # treat all columns as strings to avoid mixed-type issues
        )
    except Exception as e:
        raise RuntimeError(f"Failed to load file: {e}") from e

    rank_columns = ["kingdom", "genus", "family", "order", "class", "phylum"]

    # Trim whitespace without casting to str: astype(str) would turn missing
    # values into the literal text "nan" and defeat every blank check below.
    # Cells that are empty after trimming are treated as missing as well.
    for col in ["canonicalName", *rank_columns]:
        if col in backbone.columns:
            stripped = backbone[col].str.strip()
            backbone[col] = stripped.mask(stripped == "")

    # only Eukarya
    backbone = backbone[backbone["kingdom"].isin(["Animalia", "Plantae", "Fungi"])]
    # include non-accepted species (synonyms etc), but not blank canonical names
    backbone = backbone[backbone["canonicalName"].notna()]

    if no_blanks:
        backbone = backbone.dropna(subset=rank_columns)

    # ignore_index=True renumbers the surviving rows from zero.
    return backbone.drop_duplicates(ignore_index=True)

In [None]:
import numpy as np

In [None]:
# Load the backbone from the default path with no_blanks left at False.
backbone = preprocess_backbone()

In [None]:
# species_to_tree writes the *_subjects columns into `articles` itself and
# returns that same object, so `articles2` and `articles` are one table.
articles2 = species_to_tree(articles, backbone)

In [None]:
# Tally open vs. closed articles per kingdom. An article whose
# `kingdoms_subjects` holds several kingdoms is counted once for each.
oa_tally = {
    "Plantae": {"open": 0, "closed": 0},
    "Animalia": {"open": 0, "closed": 0},
    "Fungi": {"open": 0, "closed": 0},
}

for article in articles2.itertuples():
    for kingdom_name, counts in oa_tally.items():
        if kingdom_name in article.kingdoms_subjects:
            # The flag is only evaluated for articles that match the kingdom.
            counts["open" if article.oa_is_oa else "closed"] += 1

plants_open = oa_tally["Plantae"]["open"]
plants_closed = oa_tally["Plantae"]["closed"]

animals_open = oa_tally["Animalia"]["open"]
animals_closed = oa_tally["Animalia"]["closed"]

fungi_open = oa_tally["Fungi"]["open"]
fungi_closed = oa_tally["Fungi"]["closed"]

In [None]:
# Open-access share per kingdom, computed from the open/closed tallies.
plants_share = plants_open / (plants_open + plants_closed) * 100
print(f"Percentage of articles about plants that are open access: {plants_share}")

animals_share = animals_open / (animals_open + animals_closed) * 100
print(f"Percentage of articles about animals that are open access: {animals_share}")

fungi_share = fungi_open / (fungi_open + fungi_closed) * 100
print(f"Percentage of articles about fungi that are open access: {fungi_share}")