In [None]:
# Cell 1: Imports
import os
import re
import time
import unicodedata
from itertools import combinations

import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
import pycountry
import pypandoc
import requests
from mpl_toolkits.basemap import Basemap
from tqdm import tqdm
from wordcloud import WordCloud

In [None]:
# Function to convert country codes to country names
def get_country_name(code):
    try:
        return pycountry.countries.get(alpha_2=code).name
    except AttributeError:
        return code  # Return the code if it can't find the name

# Function to convert country codes to alpha-3 format
def convert_country_code_to_alpha3(code):
    try:
        return pycountry.countries.get(alpha_2=code).alpha_3
    except AttributeError:
        return code  # Return the code if it can't find the alpha-3 code

# Function to fetch taxon name from GBIF API using taxon key
def fetch_taxon_name(taxon_key):
    url = f"https://api.gbif.org/v1/species/{taxon_key}"
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        return data.get('scientificName', '')
    return ''

# Function to preprocess Markdown content
def preprocess_markdown(content):
    # Replace problematic Unicode characters with ASCII equivalents or remove them
    replacements = {
        '\u223C': '~',  # Replace ∼ with ~
        '\u2010': '-',  # Replace ‐ with -
        '\u2011': '-',  # Replace ‑ with -
        '\u2012': '-',  # Replace ‒ with -
        '\u2013': '-',  # Replace – with -
        '\u2014': '-',  # Replace — with -
        '\u2015': '-',  # Replace ― with -
        '\u2018': "'",  # Replace ‘ with '
        '\u2019': "'",  # Replace ’ with '
        '\u201C': '"',  # Replace “ with "
        '\u201D': '"',  # Replace ” with "
        '\u2026': '...',  # Replace … with ...
        '\u2212': '-',  # Replace − with -
    }
    for old, new in replacements.items():
        content = content.replace(old, new)
    
    # Remove any remaining non-ASCII characters
    content = re.sub(r'[^\x00-\x7F]+', '', content)
    return content

In [None]:
# Cell 2: Read the CSV files and extract unique DOIs
file_path = 'KYMS.csv'  # Update this with your file path
download_summary_path = 'download_summary.csv'  # Path to download_summary.csv

df = pd.read_csv(file_path, encoding='utf-8')
download_summary_df = pd.read_csv(download_summary_path, encoding='utf-8')

# Calculate the number of GBIF IDs associated with each DOI
gbif_id_counts = df.groupby('doi')['gbifID'].nunique().to_dict()

# Create a dictionary to map gbifDownloadKey to total_records
download_summary_dict = download_summary_df.set_index('key')['total_records'].to_dict()

dois = df['doi'].unique().tolist()

In [None]:
# Cell 3: Fetch metadata using GBIF API with progress bar
def fetch_metadata_gbif(doi_list):
    metadata_list = []
    for doi in tqdm(doi_list, desc="Fetching metadata"):
        url = f"https://api.gbif.org/v1/literature/search?doi={doi}"
        response = requests.get(url)
        if response.status_code == 200:
            metadata_list.append(response.json())
        else:
            print(f"Failed to fetch metadata for DOI: {doi}")
        time.sleep(1)  # To respect API rate limits
    return metadata_list

# Fetch one literature-search response per DOI (the call sleeps between requests, so this can take a while)
metadata_list = fetch_metadata_gbif(dois)

In [None]:
# Cell 4: Extract relevant fields from metadata and fetch taxon names
def extract_relevant_fields(metadata):
    if not metadata or 'results' not in metadata or not metadata['results']:
        return None
    
    result = metadata['results'][0]
    
    try:
        authors = ", ".join([f"{author['firstName']} {author['lastName']}" for author in result.get('authors', [])])
    except KeyError:
        authors = ''

    title = result.get('title', '')
    abstract = result.get('abstract', '')
    doi = result.get('identifiers', {}).get('doi', '')
    doi_link = f"https://doi.org/{doi}"
    countries_of_coverage = ", ".join(result.get('countriesOfCoverage', []))
    countries_of_researcher = ", ".join([get_country_name(code) for code in result.get('countriesOfResearcher', [])])
    countries_of_researcher_codes = result.get('countriesOfResearcher', [])
    published = result.get('published', '')
    
    gbif_taxon_key = result.get('gbifTaxonKey', [])
    gbif_higher_taxon_key = result.get('gbifHigherTaxonKey', [])
    
    gbif_taxon_names = [fetch_taxon_name(key) for key in gbif_taxon_key]
    gbif_higher_taxon_names = [fetch_taxon_name(key) for key in gbif_higher_taxon_key]
    
    citation_type = result.get('citationType', '')
    literature_type = result.get('literatureType', '')
    open_access = result.get('openAccess', '')
    peer_review = result.get('peerReview', '')
    publisher = result.get('publisher', '')
    relevance = ", ".join(result.get('relevance', []))
    source = result.get('source', '')
    language = result.get('language', '')
    year = result.get('year', '')
    
    topics = ", ".join(result.get('topics', []))
    
    # Get the number of GBIF IDs associated with this DOI
    num_gbif_ids = gbif_id_counts.get(doi, 0)
    
    # Get the total records for gbifDownloadKey
    gbif_download_keys = result.get('gbifDownloadKey', [])
    total_records = sum(download_summary_dict.get(key, 0) for key in gbif_download_keys)
    
    return {
        'authors': authors,
        'title': title,
        'abstract': abstract,
        'doi': doi_link,
        'numGbifIds': num_gbif_ids,
        'totalRecords': total_records,
        'countriesOfCoverage': countries_of_coverage,
        'countriesOfResearcher': countries_of_researcher,
        'countriesOfResearcherCodes': countries_of_researcher_codes,
        'published': published,
        'gbifTaxonKey': ", ".join(map(str, gbif_taxon_key)),
        'gbifHigherTaxonKey': ", ".join(map(str, gbif_higher_taxon_key)),
        'gbifTaxonNames': ", ".join(gbif_taxon_names),
        'gbifHigherTaxonNames': ", ".join(gbif_higher_taxon_names),
        'citationType': citation_type,
        'literatureType': literature_type,
        'openAccess': open_access,
        'peerReview': peer_review,
        'publisher': publisher,
        'relevance': relevance,
        'source': source,
        'language': language,
        'year': year,
        'topics': topics
    }

# Flatten every response that actually contains a hit; the rest are skipped
relevant_data = []
for response_json in metadata_list:
    if response_json and 'results' in response_json and response_json['results']:
        relevant_data.append(extract_relevant_fields(response_json))

# One row per publication
df_relevant_data = pd.DataFrame(relevant_data)
print(df_relevant_data.head())

# Persist the table as a spreadsheet
output_excel_path = 'gbif_literature_data.xlsx'
df_relevant_data.to_excel(output_excel_path, index=False)
print(f"Results saved to {output_excel_path}")

In [None]:
# Cell 5: Extract keywords and topics
def extract_keywords_gbif(metadata):
    """Return the ``keywords`` list of the first result in a search response.

    Args:
        metadata: Decoded JSON response; ``metadata['results'][0]['keywords']``
            is read.

    Returns:
        The keywords list, or ``[]`` when the response is ``None``, has no
        results, lacks the key, or carries an explicit null for it.
    """
    try:
        keywords = metadata['results'][0]['keywords']
    except (KeyError, IndexError, TypeError):
        # TypeError covers metadata (or an intermediate value) being None.
        return []
    # Callers iterate and join the result, so never hand back None.
    return keywords or []

def extract_topics_gbif(metadata):
    """Return the ``topics`` list of the first result in a search response.

    Args:
        metadata: Decoded JSON response; ``metadata['results'][0]['topics']``
            is read.

    Returns:
        The topics list, or ``[]`` when the response is ``None``, has no
        results, lacks the key, or carries an explicit null for it.
    """
    try:
        topics = metadata['results'][0]['topics']
    except (KeyError, IndexError, TypeError):
        # TypeError covers metadata (or an intermediate value) being None.
        return []
    # Callers iterate and join the result, so never hand back None.
    return topics or []

# Collect keywords and topics in one pass over the responses that contain a hit
keywords_list = []
topics_list = []
for response_json in metadata_list:
    if response_json and 'results' in response_json and response_json['results']:
        keywords_list.append(extract_keywords_gbif(response_json))
        topics_list.append(extract_topics_gbif(response_json))

In [None]:
# Cell 6: Create the network
def create_network(items_list):
    G = nx.Graph()
    
    for items in items_list:
        for i, item1 in enumerate(items):
            for item2 in items[i+1:]:
                if G.has_edge(item1, item2):
                    G[item1][item2]['weight'] += 1
                else:
                    G.add_edge(item1, item2, weight=1)
    
    return G

# Build one co-occurrence graph per vocabulary: nodes are keywords/topics, and
# each edge weight counts how many times create_network saw that pair together.
keyword_network = create_network(keywords_list)
topic_network = create_network(topics_list)

In [None]:
# Cell 7: Save the networks as GraphML files
# Add topic counts as node attributes to the topic network
topic_counts = pd.Series([topic for sublist in topics_list for topic in sublist]).value_counts()
for node in topic_network.nodes():
    topic_network.nodes[node]['count'] = topic_counts[node]

nx.write_graphml(keyword_network, 'keyword_network.graphml')
nx.write_graphml(topic_network, 'topic_network.graphml')

In [None]:
# Cell 8: Plot the networks
import matplotlib.font_manager as fm

# Use DejaVu Sans font
dejavu_font_path = fm.findfont(fm.FontProperties(family='DejaVu Sans'))
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['font.sans-serif'] = ['DejaVu Sans']

# Plot the Keyword Network
plt.figure(figsize=(10, 10))
plt.title("Keyword Network", fontsize=14)
pos = nx.spring_layout(keyword_network, k=0.1)
nx.draw(keyword_network, pos, with_labels=True, node_size=50, font_size=10, font_color='blue', edge_color='gray')
plt.show()

# Update Node Sizes for Topic Network
node_sizes = [topic_counts.get(topic, 1) * 10 for topic in topic_network.nodes()]  # Ensure default size if topic not in topic_counts

# Plot the Topic Network
plt.figure(figsize=(10, 10))
plt.title("Topic Network", fontsize=14)
pos = nx.spring_layout(topic_network, k=0.1)
nx.draw(topic_network, pos, with_labels=True, node_size=node_sizes, font_size=10, font_color='green', edge_color='black')
plt.show()


In [None]:
# Cell 9: Create Markdown files for each publication
output_dir = 'publication_markdowns'
os.makedirs(output_dir, exist_ok=True)

for index, row in df_relevant_data.iterrows():
    md_content = f"""
# {row['title']}

**Authors**: {row['authors']}

**Abstract**: {row['abstract']}

**DOI**: [{row['doi']}]({row['doi']})

**Number of GBIF IDs**: {row['numGbifIds']}

**Total Records**: {row['totalRecords']}

**Countries of Coverage**: {row['countriesOfCoverage']}

**Countries of Researcher**: {row['countriesOfResearcher']}

**Published**: {row['published']}

**GBIF Taxon Key**: {row['gbifTaxonKey']}
**GBIF Taxon Names**: {row['gbifTaxonNames']}

**GBIF Higher Taxon Key**: {row['gbifHigherTaxonKey']}
**GBIF Higher Taxon Names**: {row['gbifHigherTaxonNames']}

**Citation Type**: {row['citationType']}
**Literature Type**: {row['literatureType']}
**Open Access**: {row['openAccess']}
**Peer Review**: {row['peerReview']}
**Publisher**: {row['publisher']}
**Relevance**: {row['relevance']}
**Source**: {row['source']}
**Language**: {row['language']}
**Year**: {row['year']}
**Topics**: {row['topics']}
"""
    # Preprocess the Markdown content to handle problematic characters
    md_content = preprocess_markdown(md_content)

    md_file_path = os.path.join(output_dir, f"publication_{index + 1}.md")
    with open(md_file_path, 'w', encoding='utf-8') as f:
        f.write(md_content)

    # Convert Markdown to PDF using the template
    pdf_file_path = os.path.join(output_dir, f"publication_{index + 1}.pdf")
    pypandoc.convert_file(md_file_path, 'pdf', outputfile=pdf_file_path, extra_args=['--template=template.tex'])

print(f"Markdown and PDF files created in the folder: {output_dir}")

In [None]:
# Cell 10: Create a global choropleth map for countriesOfResearcher
# Add a dictionary for country coordinates
# Keys are two-letter country codes; each value is a (longitude, latitude)
# tuple, in the order create_choropleth_map() unpacks it (`lon, lat = ...`).
# Values are presumably decimal degrees of one representative point per
# country -- TODO confirm the source of this table.
# Codes that are absent from this table are not drawn on the map.
country_coordinates = {
    'AF': (67.709953, 33.93911),
    'AL': (20.168331, 41.153332),
    'DZ': (1.659626, 28.033886),
    'AO': (17.873887, -11.202692),
    'AR': (-63.61667199999999, -38.416097),
    'AM': (45.038189, 40.069099),
    'AU': (133.775136, -25.274398),
    'AT': (14.550072, 47.516231),
    'AZ': (47.576927, 40.143105),
    'BS': (-77.39627999999999, 25.03428),
    'BH': (50.637772, 25.930414),
    'BD': (90.356331, 23.684994),
    'BY': (27.953389, 53.709807),
    'BE': (4.469936, 50.503887),
    'BZ': (-88.49765, 17.189877),
    'BJ': (2.315834, 9.30769),
    'BT': (90.433601, 27.514162),
    'BO': (-63.58865299999999, -16.290154),
    'BA': (17.679076, 43.915886),
    'BW': (24.684866, -22.328474),
    'BR': (-51.92528, -14.235004),
    'BN': (114.727669, 4.535277),
    'BG': (25.48583, 42.733883),
    'BF': (-1.561593, 12.238333),
    'BI': (29.918886, -3.373056),
    'KH': (104.990963, 12.565679),
    'CM': (12.354722, 7.369721999999999),
    'CA': (-106.346771, 56.130366),
    'CV': (-24.013197, 16.002082),
    'CF': (20.939444, 6.611110999999999),
    'TD': (18.732207, 15.454166),
    'CL': (-71.542969, -35.675147),
    'CN': (104.195397, 35.86166),
    'CO': (-74.297333, 4.570868),
    'KM': (43.872219, -11.875001),
    'CD': (21.758664, -4.038333),
    'CG': (15.827659, -0.228021),
    'CR': (-83.753428, 9.748916999999999),
    'HR': (15.2, 45.1),
    'CU': (-77.781167, 21.521757),
    'CY': (33.429859, 35.126413),
    'CZ': (15.472962, 49.817492),
    'DK': (9.501785, 56.26392),
    'DJ': (42.590275, 11.825138),
    'DM': (-61.370976, 15.414999),
    'DO': (-70.162651, 18.735693),
    'EC': (-78.18340599999999, -1.831239),
    'EG': (30.802498, 26.820553),
    'SV': (-88.89653, 13.794185),
    'GQ': (10.267895, 1.650801),
    'ER': (39.782334, 15.179384),
    'EE': (25.013607, 58.595272),
    'SZ': (31.465866, -26.522503),
    # NOTE(review): the longitude below is identical to 'ER' above -- possibly a copy/paste slip; verify.
    'ET': (39.782334, 9.145),
    'FJ': (178.065032, -17.713371),
    'FI': (25.748151, 61.92410999999999),
    'FR': (2.213749, 46.603354),
    'GA': (11.609444, -0.803689),
    'GM': (-15.310139, 13.443182),
    'GE': (43.356892, 42.315407),
    'DE': (10.451526, 51.165691),
    'GH': (-1.023194, 7.946527),
    'GR': (21.824312, 39.074208),
    'GD': (-61.604171, 12.262776),
    'GT': (-90.23075899999999, 15.783471),
    'GN': (-9.696645, 9.945587),
    'GW': (-15.180413, 11.803749),
    'GY': (-58.93018, 4.860416),
    'HT': (-72.285215, 18.971187),
    'HN': (-86.241905, 15.199999),
    'HU': (19.503304, 47.162494),
    'IS': (-19.020835, 64.963051),
    'IN': (78.96288, 20.593684),
    'ID': (113.921327, -0.789275),
    'IR': (53.688046, 32.427908),
    'IQ': (43.679291, 33.223191),
    'IE': (-8.24389, 53.41291),
    'IL': (34.851612, 31.046051),
    'IT': (12.56738, 41.87194),
    'JM': (-77.297508, 18.109581),
    'JP': (138.252924, 36.204824),
    'JO': (36.238414, 30.585164),
    'KZ': (66.923684, 48.019573),
    'KE': (37.906193, -0.023559),
    'KI': (-168.734039, -3.370417),
    'KP': (127.510093, 40.339852),
    'KR': (127.766922, 35.907757),
    'KW': (47.481766, 29.31166),
    'KG': (74.766098, 41.20438),
    'LA': (102.495496, 19.85627),
    'LV': (24.603189, 56.879635),
    'LB': (35.862285, 33.854721),
    'LS': (28.233608, -29.609988),
    'LR': (-9.429499000000002, 6.428055),
    'LY': (17.228331, 26.3351),
    'LI': (9.555373, 47.166),
    'LT': (23.881275, 55.169438),
    'LU': (6.129582999999999, 49.815273),
    'MK': (21.745275, 41.608635),
    'MG': (46.869107, -18.766947),
    'MW': (34.301525, -13.254308),
    'MY': (101.975766, 4.210484),
    'MV': (73.22068, 3.202778),
    'ML': (-3.996166, 17.570692),
    'MT': (14.375416, 35.937496),
    'MH': (171.184478, 7.131474),
    'MR': (-10.940835, 21.00789),
    'MU': (57.55215200000001, -20.348404),
    'MX': (-102.552784, 23.634501),
    'FM': (150.550812, 7.425554),
    'MD': (28.369885, 47.411631),
    'MC': (7.412841, 43.750298),
    'MN': (103.846656, 46.862496),
    'ME': (19.37439, 42.708678),
    'MA': (-7.092619999999999, 31.791702),
    'MZ': (35.529562, -18.665695),
    'MM': (95.956223, 21.913965),
    'NA': (18.49041, -22.95764),
    'NR': (166.931503, -0.522778),
    'NP': (84.12400799999999, 28.394857),
    'NL': (5.291265999999999, 52.132633),
    'NZ': (174.885971, -40.900557),
    'NI': (-85.207229, 12.865416),
    'NE': (8.081666, 17.607789),
    'NG': (8.675277, 9.081999),
    'NO': (8.468945999999999, 60.47202399999999),
    'OM': (55.923255, 21.512583),
    'PK': (69.34511599999999, 30.375321),
    'PW': (134.58252, 7.51498),
    'PA': (-80.782127, 8.537981),
    'PG': (143.95555, -6.314992999999999),
    'PY': (-58.443832, -23.442503),
    'PE': (-75.015152, -9.189967),
    'PH': (121.774017, 12.879721),
    'PL': (19.145136, 51.919438),
    'PT': (-8.224454, 39.39987199999999),
    'QA': (51.183884, 25.354826),
    'RO': (24.96676, 45.943161),
    'RU': (105.318756, 61.52401),
    'RW': (29.873888, -1.940278),
    'KN': (-62.782998, 17.357822),
    'LC': (-60.978893, 13.909444),
    'VC': (-61.287228, 12.984305),
    'WS': (-172.104629, -13.759029),
    'SM': (12.457777, 43.94236),
    'ST': (6.613081, 0.18636),
    'SA': (45.079162, 23.885942),
    'SN': (-14.452362, 14.497401),
    'RS': (21.005859, 44.016521),
    'SC': (55.491977, -4.679574),
    'SL': (-11.779889, 8.460555),
    'SG': (103.819836, 1.352083),
    'SK': (19.699024, 48.669026),
    'SI': (14.995463, 46.151241),
    'SB': (160.156194, -9.64571),
    'SO': (46.199616, 5.152149),
    'ZA': (22.937506, -30.559482),
    'SS': (31.3069788, 6.877),
    'ES': (-3.74922, 40.46366700000001),
    'LK': (80.77179699999999, 7.873053999999999),
    'SD': (30.217636, 12.862807),
    'SR': (-56.027783, 3.919305),
    # NOTE: 'SZ' already appears earlier in this literal with the same value; this repeat is redundant.
    'SZ': (31.465866, -26.522503),
    'SE': (18.643501, 60.12816100000001),
    'CH': (8.227511999999999, 46.818188),
    'SY': (38.996815, 34.80207499999999),
    'TJ': (71.276093, 38.861034),
    'TZ': (34.888822, -6.369028),
    'TH': (100.992541, 15.870032),
    'TL': (125.727539, -8.874217),
    'TG': (0.824782, 8.619543),
    'TO': (-175.198242, -21.178986),
    'TT': (-61.222503, 10.691803),
    'TN': (9.537499, 33.886917),
    'TR': (35.243322, 38.963745),
    'TM': (59.556278, 38.969719),
    'TV': (179.194, -7.109535),
    'UG': (32.290275, 1.373333),
    'UA': (31.16558, 48.379433),
    'AE': (53.847818, 23.424076),
    'GB': (-3.435973, 55.378051),
    'US': (-95.712891, 37.09024),
    'UY': (-55.765835, -32.522779),
    'UZ': (64.585262, 41.377491),
    'VU': (166.959158, -15.376706),
    'VE': (-66.58973, 6.42375),
    'VN': (108.277199, 14.058324),
    'YE': (48.516388, 15.552727),
    'ZM': (27.849332, -13.133897),
    'ZW': (29.154857, -19.015438)
}

In [None]:
# Cell 10: Create a global choropleth map for countriesOfResearcher
def create_choropleth_map(df):
    # Explode the countriesOfResearcherCodes column to get one row per country code
    df_exploded = df.explode('countriesOfResearcherCodes')
    # Group by country code and count the occurrences
    country_counts = df_exploded['countriesOfResearcherCodes'].value_counts().reset_index()
    country_counts.columns = ['country_code_2', 'count']
    # Add alpha-3 country codes
    country_counts['country_code_3'] = country_counts['country_code_2'].apply(convert_country_code_to_alpha3)
    
    # Save the country counts to a CSV file
    country_counts.to_csv('country_counts.csv', index=False)
    
    # Debug: Print the country counts
    print(country_counts.head())
    
    # Create the choropleth map using matplotlib
    plt.figure(figsize=(15, 10))
    m = Basemap(projection='moll', lon_0=0)
    m.drawcoastlines()
    m.drawcountries()
    m.drawmapboundary(fill_color='aqua')
    m.fillcontinents(color='coral', lake_color='aqua')

    # Add the data
    for _, row in country_counts.iterrows():
        try:
            country_code = row['country_code_2']
            count = row['count']
            if country_code in country_coordinates:
                lon, lat = country_coordinates[country_code]
                x, y = m(lon, lat)
                m.plot(x, y, 'bo', markersize=count/2)  # Adjusted marker size for better visibility
        except Exception as e:
            print(f"Could not plot {country_code}: {e}")

    plt.title("Number of Publications by Country of Researcher (Mollweide Projection)")
    plt.savefig("publications_by_country.png", dpi=600)
    plt.show()

# Render and save the map for the publications collected in df_relevant_data
create_choropleth_map(df_relevant_data)

In [None]:
# Cell 11: Create word clouds from abstracts and keywords
# Combine all abstracts into a single string
abstracts = " ".join(df_relevant_data['abstract'].dropna())

# Combine all keywords into a single string
keywords = " ".join([" ".join(keyword_list) for keyword_list in keywords_list])

# Generate and plot the word cloud for abstracts
wordcloud_abstracts = WordCloud(width=800, height=400, background_color='white').generate(abstracts)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_abstracts, interpolation='bilinear')
plt.axis('off')
plt.title("Word Cloud for Abstracts")
plt.savefig("wordcloud_abstracts.png", dpi=600)
plt.show()

# Generate and plot the word cloud for keywords
wordcloud_keywords = WordCloud(width=800, height=400, background_color='white').generate(keywords)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_keywords, interpolation='bilinear')
plt.axis('off')
plt.title("Word Cloud for Keywords")
plt.savefig("wordcloud_keywords.png", dpi=600)
plt.show()