# Create EndNote Import of References

This code will processes a directory of pdf files and creates a bibliography reference file for import into EndNote 21

* __EndNote Import Instructions__
    * 1. Given the Output file of this run "<some_name>.nbib"
    * 2. Open EndNote21 
            * File->Import… (Select File)
    * 3. If prompted, 
            * _Choose a Filter_
            * Select filter: __BibTex__
    * 4. The References should now be in your EndNote Library


* Before running this notebook On __Watson__
    * Start grobid server to get references
        * `cd Code/Java/grobid`
        * `./gradlew run`
        * Note: When starting the service will never reach 100% but it will still be running!
    * Start NGINX danhiggins.org to lookup pdfs
        * `cd /var/www/danhiggins.org/data`
        * copy any need pdf files into this directory
        * NGINX is not used to create the reference but if running the pdfs will be available for lookup from endnote notes in the reference



In [18]:
import requests
import json
import re
import hashlib
from urllib.parse import quote

def generate_unique_hash(file_path):
    hash_object = hashlib.sha256(file_path.encode())    
    return hash_object.hexdigest()

def process_header(file_path):
    """
    Using the pdf content use AI to find the bibliography reference data
    See: https://grobid.readthedocs.io/en/latest/
    """
    url = 'http://192.168.1.101:8070/api/processHeaderDocument'
    headers = {
        'Accept': 'application/x-bibtex'
    }
    files = {
        'input': open(file_path, 'rb')
    }

    response = requests.post(url, headers=headers, files=files)

    if response.status_code == 200:
        return response.text
    else:
        print("Failed to upload file")
        print("Status code:", response.status_code)
        return ""
        


def bibtex_to_json(bibtex_str, paper_url):
    """Convert the bibliography text from grobid into a dictionary

    Args:
        bibtex_str (str): bibliography text from grobid
        paper_url (str): pdf file name

    Raises:
        ValueError: If the Input text can not be parsed

    Returns:
        dict: The bibliography text as a dictionary
    """
    try:
        # Remove the @misc{ and } at the start and end
        if not bibtex_str.startswith('@misc{'):
            raise ValueError("Input does not start with '@misc{'")
        bibtex_str = bibtex_str.strip()[6:].strip()
        if not bibtex_str.endswith('}'):
            raise ValueError("Input does not end with '}'")
        bibtex_str = bibtex_str[:-1].strip()

        # Split the key and the fields
        try:
            key, fields_str = bibtex_str.split(',', 1)
        except ValueError:
            raise ValueError("Input does not contain a valid key and fields")

        # Extract fields
        fields = re.findall(r'\s*(\w+)\s*=\s*{(.*?)}\s*(?:,|$)', fields_str, re.DOTALL)

        if not fields:
            raise ValueError("No valid fields found in input")

        # Convert to dictionary
        encoded_paper_url = quote(paper_url)
        bibtex_dict = {"type"   : "misc", 
                       "key"    : key.strip(), 
                       "pdf_url": f"https://danhiggins.org/data/{encoded_paper_url}",
                       "uuid"   : generate_unique_hash(paper_url)}
        for field in fields:
            field_name = field[0].strip()
            field_value = field[1].strip().replace('\n', ' ')
            bibtex_dict[field_name] = field_value

        return bibtex_dict
    except Exception as e:
        return {"error": str(e)}




In [19]:
from habanero import Crossref

def cross_ref(doi):
    """Look up the bibliography given that you have a doi reference
    Note: Crossref provides additional information not available from grobid
    but not all bibliographies have a doi reference

    Args:
        doi (URL): doi URL to a document

    Returns:
        dict: The bibliography text as a dictionary
    """
    citation = {}
    chars_to_remove = "(){}[]"
    # Create a translation table that maps each chars_to_remove to None
    translation_table = str.maketrans('', '', chars_to_remove)

    # Remove the special characters from the input
    doi = doi.translate(translation_table)

    cr = Crossref()
    try:
        metadata = cr.works(ids=doi)
        # Extract citation information
        citation['title'] = metadata['message']['title'][0]
        authors_lst = [author.get('given','') + " " + author.get('family','') for author in metadata['message'].get('author','')]
        citation['authors'] = ' and '.join(authors_lst)
        container_title = metadata['message'].get('container-title',[])
        if container_title:
            citation['journal'] = container_title[0]
        citation['year'] = metadata['message']['created']['date-parts'][0][0]
        citation['volume'] = metadata['message'].get('volume', 'N/A')
        citation['issue'] = metadata['message'].get('issue', 'N/A')
        citation['page'] = metadata['message'].get('page', 'N/A')
    except Exception as e:
        print("Failed to process:", e)
    
    return citation
    # Format the citation

# Define the DOI
doi = "10.1534/genetics.117.300106" # Right
print(json.dumps(cross_ref(doi),indent=4))

{
    "title": "Lipid and Carbohydrate Metabolism in Caenorhabditis elegans",
    "authors": "",
    "journal": "Genetics",
    "year": 2023,
    "volume": "N/A",
    "issue": "N/A",
    "page": "N/A"
}


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def similarity_score(str1, str2):
    """Given two string return a score between 0 and 1 the represents how similar the strings are to eachother 
    NOTE: In this code similarity_score is used to compare the likeness of two titles  
    """
    documents = [str1, str2]

    # Convert the strings to vectors
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Compute cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return cosine_sim[0][0]


str1 = "I love programming"
str2 = "Programming is my passion"
score = similarity_score(str1,str2)
print(f"Cosine similarity: {score}")



Cosine similarity: 0.22028815056182968


In [21]:
def create_nbib(papers, out_file):
    """Give a List of papers convert to a format that can be imported into endnote

    Args:
        papers (list): A list of dictionaries representing articles
        out_file (str): the endnote import file name
    """
    bib_lex_lib=""
    for article in papers:
        bib_lex = (
            f"@Article{{\n"
            f"author ={{{article.get('author', '')}}},\n"
            f"title ={{{article.get('title', '')}}},\n"
            f"year ={{{article.get('year', '')}}},\n"
            f"doi ={{{article.get('doi', '')}}},\n"
            f"abstract ={{{article.get('abstract', '')}}},\n"
            f"keywords ={{{article.get('keywords', '')}}},\n"
            f"url ={{{article.get('pdf_url', '')}}},\n"
            f"journal ={{{article.get('journal', '')}}},\n"
            f"volume ={{{article.get('volume', '')}}},\n"
            f"pages ={{{article.get('pages', '')}}},\n"
            f"source={{{article.get('uuid', '')}}}\n"            
            f"}}\n\n"
        )
        bib_lex_lib += bib_lex
    
    with open(out_file, 'w') as file:
        file.write(bib_lex_lib)
        

In [22]:

import os

def get_last_directory_and_file(file_path):
    # Get the directory name from the file path
    directory = os.path.dirname(file_path)
    
    # Get the file name from the file path
    file_name = os.path.basename(file_path)
    
    # Get the last directory name
    last_directory = os.path.basename(directory)
    
    # Combine the last directory and the file name
    result = os.path.join(last_directory, file_name)
    
    return result

def process_pdfs_in_directory(directory_path, endnote_dict):
    unprocessed_files = []
    # Check if the directory exists
    if not os.path.isdir(directory_path):
        print(f"The directory {directory_path} does not exist.")
        return

    # List all files in the directory
    processed = 0
    file_list = os.listdir(directory_path)
    num_files = len(file_list)
    for filename in file_list:
        # Construct the full file path
        file_path = os.path.join(directory_path, filename)
        
        # Check if the file is a PDF
        if os.path.isfile(file_path) and filename.lower().endswith('.pdf'):
            # Process the PDF file
            last_directory_and_file = get_last_directory_and_file(file_path)
            uuid = generate_unique_hash(last_directory_and_file)
            if not uuid in endnote_dict:
                unprocessed_files.append(file_path)
    return unprocessed_files

def process_pdfs_in_directory1(directory_path):
    papers = []
    # Check if the directory exists
    if not os.path.isdir(directory_path):
        print(f"The directory {directory_path} does not exist.")
        return

    # List all files in the directory
    processed = 0
    file_list = os.listdir(directory_path)
    num_files = len(file_list)
    for filename in file_list:
        # Construct the full file path
        file_path = os.path.join(directory_path, filename)
        
        # Check if the file is a PDF
        if os.path.isfile(file_path) and filename.lower().endswith('.pdf'):
            # Process the PDF file
            print(file_path)
            bibtex_str = process_header(file_path)
            last_directory_and_file = get_last_directory_and_file(file_path)
            
            bibtex_json = bibtex_to_json(bibtex_str, last_directory_and_file)
            if 'doi' in bibtex_json:
                citation = cross_ref(bibtex_json['doi'])
                #print(f"{citation=}")
                if 'title' in bibtex_json and 'title' in citation:
                    score = similarity_score(citation['title'], bibtex_json['title'])
                    bibtex_json['score'] = score
                    if score > 0.9:
                        bibtex_json['title'] = citation.get('title','')
                        bibtex_json['author'] = citation.get('authors','')
                        bibtex_json['journal'] = citation.get('journal','')
                        bibtex_json['volume'] = citation.get('volume','')
                        bibtex_json['pages'] = citation.get('page','')
                        bibtex_json['year'] = citation.get('year','')
                elif not 'title' in bibtex_json and 'title' in citation:
                    # If we dont have a title but we do have a doi
                    # Check if the url and the doi are similar
                    # If they are then assume we have a match
                    print("No title but we do have a doi")
                    bibtex_doi = bibtex_json['doi']
                    bibtex_url = bibtex_json['pdf_url']
                    doi_six = bibtex_doi[-6:]
                    if len(bibtex_url) > 4:
                        url_six = bibtex_url[:-4]
                    else:
                        url_six = ''
                    url_six = url_six[-6:]
                    score = similarity_score(doi_six, url_six)
                    if score > 0.9:
                        print("Found a match for no title!!")
                        bibtex_json['title'] = citation.get('title','')
                        bibtex_json['author'] = citation.get('authors','')
                        bibtex_json['journal'] = citation.get('journal','')
                        bibtex_json['volume'] = citation.get('volume','')
                        bibtex_json['pages'] = citation.get('page','')
                        bibtex_json['year'] = citation.get('year','')
                    
            processed +=1
            print(f"{filename} {processed:03d} of {num_files:03d}")
            papers.append(bibtex_json)
            #if processed > 10:
            #    break
            
    return papers

def write_json_to_file(json_data, file_path):
    with open(file_path, 'w') as json_file:
        json.dump(json_data, json_file, indent=2)


In [23]:
import json
import uuid

def parse_reference_str(references_str):
    """
    Parse a single reference entry into a dictionary.
    """
    lines = references_str.strip().split('\n')
    reference_dict = {}
    
    for line in lines:
        if ": " in line:
            key, value = line.split(": ", 1)
            reference_dict[key.strip()] = value.strip()
    
    return reference_dict

def read_endnote_lib_file(file_path):
    """
    Note: EndNote Export using Output Style "Show All Fields"
    """
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        content = file.read()
    
    # Split content into individual references
    references_str = content.strip().split('\n\n')
    references_dict = {}
    for ref_str in references_str:
        ref_dict = parse_reference_str(ref_str)
        id = ref_dict.get('Name of Database',str(uuid.uuid4()))
        references_dict[id]=ref_dict
    
    return references_dict

# # Specify the file path
# file_path = 'My_EndNote_Library.txt'

# # Parse the file and get the JSON output
# references_dict = {}
# references_dict = read_endnote_lib_file(file_path)

# # Print the JSON output
# json_output = json.dumps(references_dict, indent=4)
# print(json_output)



In [25]:
import time
start_time = time.time()
directory_path = '/var/www/danhiggins.org/data/papers_arf_lipid_golgi/'
endnote_lib_file = 'My_EndNote_Library.txt'

endnote_dict = read_endnote_lib_file(endnote_lib_file)

# Call the function to process PDFs in the directory
unprocessed_files = process_pdfs_in_directory(directory_path, endnote_dict)

print(unprocessed_files)
#write_json_to_file(papers, "./papers.json")
#create_nbib(papers, "arf_lipid_golgi.nbib")

end_time = time.time()
time_difference = end_time - start_time
minutes = int(time_difference // 60)
seconds = int(time_difference % 60)

print(f"Time: {minutes} minutes and {seconds} seconds")

[]
Time: 0 minutes and 0 seconds
