# List project diffs between HCA catalogs

In [None]:
from IPython.display import Javascript

# Runs a snippet of JavaScript in the notebook front end via Colab's
# `google.colab.output` API.
# NOTE(review): presumably this raises the output iframe's height limit so the
# long report printed below is not confined to a small scrolling area —
# confirm against the Colab documentation.
display(Javascript('''google.colab.output.setIframeHeight(0, true, {maxHeight: 1000000})'''))

import json
import requests
from datetime import datetime, timezone

# Sent as the `size` query parameter when fetching a catalog's project list.
# NOTE(review): a catalog holding more projects than this would presumably be
# returned truncated, which would show up as spurious "new"/"deleted" projects
# in the diff — confirm against the service's pagination behaviour.
max_entries = 500 # Should be >= the number of projects in any catalog

def get_catalog_projects(catalog):
  """Fetch the project index of one catalog and key its hits by entry ID.

  Args:
    catalog: Catalog number; it is appended to "dcp" to form the `catalog`
      query parameter (e.g. 11 -> "dcp11").

  Returns:
    A dict mapping each hit's "entryId" to the full hit dict.

  Raises:
    Exception: with the service's own text when the JSON response contains a
      "Message" key.
    requests.exceptions.Timeout: when the service does not answer in time.
  """
  url = "https://service.azul.data.humancellatlas.org/index/projects?filters=%7B%7D&size=" + str(max_entries) + "&catalog=dcp" + str(catalog)
  # Without a timeout, requests waits forever on a stalled connection and the
  # notebook cell would hang with no feedback.
  data = requests.get(url, timeout=60).json()
  if "Message" in data:
    raise Exception(data["Message"])
  return {entry["entryId"]: entry for entry in data["hits"]}

def get_presence_diffs_from(entries, other_entries):
  """Return the values of `entries` whose keys are absent from `other_entries`.

  The result keeps the iteration order of `entries`.
  """
  return [
    entry
    for entry_id, entry in entries.items()
    if entry_id not in other_entries
  ]

def make_type_key(type_info):
  """Build the tuple used to match file type summaries between two catalogs.

  Args:
    type_info: One file type summary dict with the keys "fileType",
      "isIntermediate" and "contentDescription" (a list).

  Returns:
    A tuple (fileType, isIntermediate, first content description). The third
    item is None when the content description list is empty or None.
  """
  descriptions = type_info["contentDescription"]
  # Only the first description takes part in the key. Guard the lookup so a
  # summary without any description does not abort the whole comparison.
  first_description = descriptions[0] if descriptions else None
  return (type_info["fileType"], type_info["isIntermediate"], first_description)

def get_type_counts(entry):
  """Map each file type key of `entry` to the "count" of its summary.

  Keys come from make_type_key(); when two summaries produce the same key,
  the later one in "fileTypeSummaries" wins.
  """
  return {
    make_type_key(summary): summary["count"]
    for summary in entry["fileTypeSummaries"]
  }

def get_entry_changes(entry, entry_prev):
  """Compare the per-type file counts of a project in two catalogs.

  Counts are compared per type key (see make_type_key) and the differences
  are then summed per file type, i.e. per first element of the key.

  Returns:
    A tuple (added_files, deleted_files) of dicts mapping a file type to the
    number of files gained, respectively lost, relative to `entry_prev`.
  """
  current_counts = get_type_counts(entry)
  previous_counts = get_type_counts(entry_prev)

  added_files = {}
  deleted_files = {}

  for type_key in set(current_counts) | set(previous_counts):
    delta = current_counts.get(type_key, 0) - previous_counts.get(type_key, 0)
    if delta == 0:
      continue
    file_type = type_key[0]
    # A positive delta means files were gained, a negative one that they were lost.
    bucket = added_files if delta > 0 else deleted_files
    bucket[file_type] = bucket.get(file_type, 0) + abs(delta)

  return (added_files, deleted_files)

def get_date_timestamp(str):
  """Convert an ISO 8601 date string to a POSIX timestamp.

  Args:
    str: The date text, optionally ending in "Z" to denote UTC. (The
      parameter name shadows the builtin; it is kept so existing callers
      keep working.)

  Returns:
    The timestamp as a float, or -1 when the value is not a parseable date
    string (including None and the empty string).
  """
  try:
    # Rewrite the "Z" suffix as an explicit UTC offset instead of dropping it.
    # Dropping it yields a naive datetime, which timestamp() interprets in the
    # machine's local time zone (wrong by the local offset, DST-sensitive).
    iso_text = (str[:-1] + "+00:00") if str.endswith("Z") else str
    return datetime.fromisoformat(iso_text).timestamp()
  except (AttributeError, TypeError, ValueError, OverflowError, OSError):
    # Best effort: any unusable value is reported through the -1 sentinel,
    # without also swallowing KeyboardInterrupt/SystemExit.
    return -1

def get_updated_projects(prev_catalog_entries, catalog_entries):
  """Classify the projects present in both catalogs by what changed.

  Args:
    prev_catalog_entries: Dict of entry ID -> entry for the older catalog.
    catalog_entries: Dict of entry ID -> entry for the newer catalog.

  Returns:
    A tuple (diff_files_entries, diff_meta_entries, invalid_date_entries):
      * diff_files_entries: (entry, (added_files, deleted_files)) pairs for
        projects whose file counts differ.
      * diff_meta_entries: entries with unchanged file counts whose
        "aggregateUpdateDate" is later than in the older catalog.
      * invalid_date_entries: entries whose date could not be parsed in
        either catalog.
    Projects that exist in only one of the catalogs are ignored.
  """
  diff_files_entries = []
  diff_meta_entries = []
  invalid_date_entries = []
  for entry_id, entry in catalog_entries.items():
    if entry_id not in prev_catalog_entries:
      continue
    prev_entry = prev_catalog_entries[entry_id]
    changed_files = get_entry_changes(entry, prev_entry)
    update_timestamp = get_date_timestamp(entry["projects"][0]["aggregateUpdateDate"])
    prev_update_timestamp = get_date_timestamp(prev_entry["projects"][0]["aggregateUpdateDate"])
    dates_valid = update_timestamp != -1 and prev_update_timestamp != -1
    if not dates_valid:
      invalid_date_entries.append(entry)
    if changed_files[0] or changed_files[1]:
      diff_files_entries.append((entry, changed_files))
    elif dates_valid and update_timestamp > prev_update_timestamp:
      # The -1 "invalid date" sentinel must not take part in the comparison:
      # any valid date is greater than -1, so an unparsable previous date
      # would otherwise be reported as a metadata update.
      diff_meta_entries.append(entry)
  return (diff_files_entries, diff_meta_entries, invalid_date_entries)

def list_projects(entries):
  """Print one Markdown numbered-list line per entry.

  Each line shows the title of the entry's first project as a link to that
  project's page in the HCA data explorer.
  """
  for entry in entries:
    project = entry["projects"][0]
    pieces = [
      "1. [",
      project["projectTitle"],
      "](https://data.humancellatlas.org/explore/projects/",
      project["projectId"],
      ")",
    ]
    print("".join(pieces))

def make_file_count_list(files):
  """Format a {file type: count} dict as "<count> <type>, <count> <type>, ...".

  Items appear in the dict's iteration order; an empty dict gives "".
  """
  return ", ".join(
    " ".join((str(count), file_type))
    for file_type, count in files.items()
  )

def list_projects_with_files(entries_info):
  """Print one Markdown list line per project, followed by its file changes.

  Args:
    entries_info: Iterable of (entry, (added_files, deleted_files)) pairs,
      where both dicts map a file type to a count.

  Each line is the linked project title, " | ", and then the non-empty
  "Added files: ..." / "Deleted files: ..." sections separated by "; ".
  """
  for entry, (added_files, deleted_files) in entries_info:
    project = entry["projects"][0]
    prefix = "1. [" + project["projectTitle"] + "](https://data.humancellatlas.org/explore/projects/" + project["projectId"] + ") | "
    sections = []
    if added_files:
      sections.append("Added files: " + make_file_count_list(added_files))
    if deleted_files:
      sections.append("Deleted files: " + make_file_count_list(deleted_files))
    print(prefix + "; ".join(sections))

def print_json_summary(catalog, new_entries, updated_files_entries_info, updated_meta_entries):
  """Print the diff as a single-line JSON object.

  The object holds the catalog label ("DCP" + catalog), the current UTC time
  as "runDate", and the project IDs of the new projects, the projects with
  updated files and the projects with updated metadata.
  """
  def project_id(entry):
    # ID of the first project listed in an entry.
    return entry["projects"][0]["projectId"]

  summary = {}
  summary["catalog"] = "DCP" + str(catalog)
  summary["runDate"] = datetime.now(timezone.utc).isoformat()
  summary["new"] = [project_id(entry) for entry in new_entries]
  summary["updatedFiles"] = [project_id(entry) for entry, _changes in updated_files_entries_info]
  summary["updatedMetadata"] = [project_id(entry) for entry in updated_meta_entries]
  print(json.dumps(summary))

def list_diffs_between(prev_catalog, catalog):
  """Fetch two catalogs and print a report of how their projects differ.

  The report lists, in this order: new projects, projects with updated
  files, projects with updated metadata, deleted projects, projects whose
  aggregateUpdateDate could not be parsed, and a JSON summary.

  Args:
    prev_catalog: Number of the older catalog.
    catalog: Number of the newer catalog.
  """
  current_entries = get_catalog_projects(catalog)
  previous_entries = get_catalog_projects(prev_catalog)

  added_projects = get_presence_diffs_from(current_entries, previous_entries)
  print(f"New projects ({len(added_projects)}):")
  list_projects(added_projects)
  print()

  files_changed_info, meta_changed, bad_date_entries = get_updated_projects(previous_entries, current_entries)

  print(f"Projects with updated files ({len(files_changed_info)}):")
  list_projects_with_files(files_changed_info)
  print()

  print(f"Projects with updated metadata ({len(meta_changed)}):")
  list_projects(meta_changed)
  print()

  # Same presence check as for new projects, with the catalogs swapped.
  removed_projects = get_presence_diffs_from(previous_entries, current_entries)
  print(f"Deleted projects ({len(removed_projects)}):")
  list_projects(removed_projects)
  print()

  print("Projects with invalid aggregateUpdateDate fields (in either catalog):")
  list_projects(bad_date_entries)
  print()

  print("JSON:")
  print_json_summary(catalog, added_projects, files_changed_info, meta_changed)
  print()



# Report what changed from catalog "dcp10" (previous) to catalog "dcp11" (current).
list_diffs_between(10, 11)

<IPython.core.display.Javascript object>

New projects (14):
1. [Cellular heterogeneity of human fallopian tubes in normal and hydrosalpinx disease states identified by scRNA-seq](https://data.humancellatlas.org/explore/projects/21ea8ddb-525f-4f1f-a820-31f0360399a2)
1. [Cryopreservation and post-thaw characterization of dissociated human islet cells](https://data.humancellatlas.org/explore/projects/8559a8ed-5d8c-4fb6-bde8-ab639cebf03c)
1. [Defining human mesenchymal and epithelial heterogeneity in response to oral inflammatory disease](https://data.humancellatlas.org/explore/projects/783c9952-a4ae-4106-a6ce-56f20ce27f88)
1. [Differentiation of Human Intestinal Organoids with Endogenous Vascular Endothelial Cells](https://data.humancellatlas.org/explore/projects/5eafb94b-02d8-423e-81b8-3673da319ca0)
1. [Healthy human kidney cell type single cell RNA-seq data](https://data.humancellatlas.org/explore/projects/94023a08-611d-4f22-a8c9-90956e091b2e)
1. [Human photoreceptor cells from different macular subregions have distinct transc