# Extract information to generate the concentration table

## Process file

In [None]:
# Delimiter between the fields of each data line in the input file.
SEPARATOR_STR = ' | '
# Dictionary keys under which the parsed fields of each entry are stored.
ID_STR = 'id'
NAME_STR = 'name'
VALUE_STR = 'value'
REFERENCE_STR = 'ref'

In [None]:
# Load the whole input file as a list of lines (line endings are kept).
with open('./variables_initial_values_descriptive_cleaned_u0_db_refined_selected_oct2021.jl') as source_file:
  julia_file_lines = list(source_file)

In [None]:
# Inspect the third line read from the file (index 2).
julia_file_lines[2]

In [None]:
# Parse every data line of the input into a dict keyed by ID_STR, NAME_STR,
# VALUE_STR and REFERENCE_STR. Lines starting with 'variable' and blank lines
# are skipped.
processed_data = []

for line in julia_file_lines:
  if line.startswith(('variable', '\n')):
    continue

  # maxsplit=3 keeps any additional separator inside the reference text
  # instead of breaking the four-way unpacking. The local names avoid
  # shadowing the builtin `id`. The reference field is not stripped: it still
  # carries the line's trailing newline, and DOI_REGEXP needs a terminator
  # character after each DOI.
  entry_id, entry_name, entry_value, entry_ref = line.split(SEPARATOR_STR, 3)

  processed_data.append({
    ID_STR: entry_id,
    NAME_STR: entry_name,
    VALUE_STR: entry_value,
    REFERENCE_STR: entry_ref,
  })

In [None]:
# Inspect the first parsed entry.
processed_data[0]

## Find reference DOIs

In [None]:
import re
import requests
from urllib.parse import unquote

# Captures the text following 'DOI:' (at least one character, lazily) up to
# the first ';' or whitespace character, newline included.
DOI_REGEXP = r'DOI:(.+?)[;\n\s]'
# Captures the content of an HTML <title> element; '.' does not match a
# newline here, so the element has to sit on a single line.
TITLE_REGEXP = r'<title>(.+)<\/title>'
DOI_API_URL = 'https://doi.org/api/handles'
PUBMED_API = 'https://pubmed.ncbi.nlm.nih.gov'
# Raw string: in a plain literal '\/' is an invalid escape sequence, which
# recent Python versions report as a warning. The pattern text is unchanged.
REDIRECTION_DETECTION_URL = r'<meta.+?Redirect=(.+?)via.+?\/>'

class ReferenceConstants:
  """Key names used in the per-reference info dictionaries."""
  URL = 'url'
  TITLE = 'title'

# Shared instance through which the key names are read.
referenceConstants = ReferenceConstants()
# Reference info that has already been resolved, keyed by lookup URL.
cache_data = dict()

In [None]:
# Manually curated title and URL for references that the PubMed search does
# not find. Add an entry here for every DOI that cannot be resolved
# automatically. find_info_doi_api upper-cases the DOI before looking it up,
# so the keys must be written in upper case.
manual_doi_titles = {
  '10.1016/S0021-9258(18)83418-4': {
    'title': 'Effects of Changes in Brain Metabolism on Levels of Pentose Phosphate Pathway Intermediates',
    'url': 'https://linkinghub.elsevier.com/retrieve/pii/S0021925818834184',
  },
  '10.1016/0301-0082(94)90015-9': {
    'title': 'Ions and energy in mammalian brain',
    'url': 'https://linkinghub.elsevier.com/retrieve/pii/0301008294900159'
  },
  '10.1016/S0021-9258(18)91886-7': {
    'title': 'Purification and Specific Kinetic Properties of Erythrocyte Uridine Diphosphate Glucose Pyrophosphorylase',
    'url': 'https://linkinghub.elsevier.com/retrieve/pii/S0021925818918867',
  },
  '10.1016/0020-711X(80)90115-9': {
    'title': 'Kinetic models of glycogen metabolism in normal rat liver, morris Hepatom 7787 and host liver',
    'url': 'https://linkinghub.elsevier.com/retrieve/pii/0020711X80901159',
  },
  '10.1016/0301-0082(93)90022-K': {
    'title': 'Glucose and ketone body utilization by the brain of neonatal rats',
    'url': 'https://linkinghub.elsevier.com/retrieve/pii/030100829390022K',
  },
  '10.1016/S0021-9258(17)33365-3': {
    'title': 'A relation between (NAD+)/(NADH) potentials and glucose utilization in rat brain slices.',
    'url': 'https://linkinghub.elsevier.com/retrieve/pii/S0021925817333653',
  },
  '10.1016/S0021-9258(18)62854-6': {
    'title': 'The Metabolism of Rat Brain Mitochondria: PREPARATION AND CHARACTERIZATION',
    'url': 'https://linkinghub.elsevier.com/retrieve/pii/S0021925818628546',
  },
  '10.1007/978-1-4614-1788-0': {
    'title': 'Neural Metabolism In Vivo',
    'url': 'https://link.springer.com/book/10.1007%2F978-1-4614-1788-0',
  },
  '10.1088/1742-6596/1141/1/012028': {
    'title': 'Brain glutaminases: bridging the gap between the controversial enzyme localizations',
    'url': 'https://iopscience.iop.org/article/10.1088/1742-6596/1141/1/012028',
  },
}

def find_info_doi_api(doi_reference):
  """Return the manually curated ``(title, url)`` pair for a DOI.

  Despite its name, this function performs no request to the DOI handle API;
  it only reads ``manual_doi_titles``. The DOI is upper-cased before the
  lookup.

  Args:
    doi_reference: DOI string, in any letter case.

  Returns:
    A ``(title, url)`` tuple taken from the matching manual entry.

  Raises:
    KeyError: if the DOI has no entry in ``manual_doi_titles``. The message
      names the missing DOI so that the entry can be added by hand.
  """
  doi_key = doi_reference.upper()
  try:
    info = manual_doi_titles[doi_key]
  except KeyError:
    # Same exception type as a plain dict lookup, but with an actionable message.
    raise KeyError(
      f'DOI {doi_key!r} is missing from manual_doi_titles; '
      'add its title and url manually'
    ) from None

  print('Using the manual titles')
  return info['title'], info['url']

In [None]:
def find_title_from_html(url, timeout=30):
  """Download ``url`` and return the text of its HTML ``<title>`` element.

  Args:
    url: address of the page to fetch.
    timeout: seconds handed to ``requests.get`` so that an unresponsive
      server cannot block the run indefinitely.

  Returns:
    The text captured by ``TITLE_REGEXP``, or ``None`` when the response
    status is not OK or the page contains no matching ``<title>`` element.

  Exceptions raised by ``requests.get`` (including a timeout) are not caught.
  """
  response = requests.get(url, timeout=timeout)
  if not response.ok:
    print(f'Error fetching title: {response.request.url}')
    print(response.text)
    return None

  title_match = re.search(TITLE_REGEXP, response.text)
  if title_match is None:
    # Without this guard, calling .group on None raised AttributeError.
    print(f'No <title> element found: {response.request.url}')
    return None

  return title_match.group(1)

def get_reference_html_info(doi_reference):
  """Resolve the title and URL of a reference from its DOI.

  The PubMed search page for the DOI is fetched and its page title is used.
  When that title shows that PubMed returned a search-results page instead of
  an article, the title and URL come from ``find_info_doi_api``. Successful
  results are stored in ``cache_data`` under the PubMed search URL.

  Returns:
    A dict with the ``referenceConstants.TITLE`` and ``referenceConstants.URL``
    keys, or ``None`` when no page title could be obtained (nothing is cached
    in that case).
  """
  ref_url = f'{PUBMED_API}/?term={doi_reference}'

  # Serve repeated DOIs from the cache.
  try:
    return cache_data[ref_url]
  except KeyError:
    pass

  page_title = find_title_from_html(ref_url)
  if page_title is None or page_title == '':
    return None

  if ' - Search Results' in page_title:
    # Article not found in PubMed: fall back to the manual DOI entries.
    clean_title, paper_url = find_info_doi_api(doi_reference)
  else:
    clean_title = page_title.replace(' - PubMed', '')
    paper_url = ref_url

  entry = {
    referenceConstants.TITLE: clean_title,
    referenceConstants.URL: paper_url,
  }
  cache_data[ref_url] = entry
  return entry

def get_reference_info(raw_reference):
  """Resolve every DOI mentioned in a raw reference string.

  Each ``DOI_REGEXP`` capture has its spaces removed and is passed to
  ``get_reference_html_info``. The results are returned in match order, so
  the list holds one item per DOI found; an item is ``None`` when the lookup
  returned nothing.
  """
  return [
    get_reference_html_info(doi_match.replace(' ', ''))
    for doi_match in re.findall(DOI_REGEXP, raw_reference)
  ]

In [None]:
# Map each entry id to the reference info resolved from its raw reference text.
ref_info_by_id = {}

for entry in processed_data:
  entry_id = entry[ID_STR]
  ref_info_by_id[entry_id] = get_reference_info(entry[REFERENCE_STR])

In [None]:
# Inspect the reference info resolved for the entry with id 'VNeu0'.
ref_info_by_id['VNeu0']

## Save reference file

In [None]:
import json

def save_file(filename, data):
  """Serialize ``data`` as JSON and write it to ``filename``.

  The JSON text is built before the file is opened, so a serialization error
  no longer truncates an existing file at ``filename``.

  Args:
    filename: path of the file to create or overwrite.
    data: JSON-serializable object.

  Raises:
    TypeError: if ``data`` contains an object ``json.dumps`` cannot serialize.
    OSError: if the file cannot be opened or written.
  """
  serialized = json.dumps(data)
  with open(filename, 'w', encoding='utf-8') as f:
    f.write(serialized)

In [None]:
# Write the per-id reference info to a JSON file.
save_file('concentration_table_ref_titles.json', ref_info_by_id)

## Integrate all the table fields in one file

In [None]:
# One record per parsed entry: its id, name and value, the raw reference text
# and the reference info resolved for that id.
full_table_content = [
  {
    ID_STR: entry[ID_STR],
    NAME_STR: entry[NAME_STR],
    VALUE_STR: entry[VALUE_STR],
    'reference_raw': entry[REFERENCE_STR],
    'reference_info': ref_info_by_id[entry[ID_STR]],
  }
  for entry in processed_data
]

In [None]:
# Write the combined table records to a JSON file.
save_file('concentration_table_full.json', full_table_content)