In [57]:
import numpy as np
from sklearn import metrics
import Levenshtein

In [58]:
def calculuate_metrics(y_true, y_pred):
  """Summarise binary predictions as accuracy, F1 and success/failure counts.

  Args:
    y_true: Ground-truth binary labels (0 or 1).
    y_pred: Predicted binary labels, same length as ``y_true``.

  Returns:
    dict with the keys
      'accuracy'  -- accuracy score rounded to 4 decimals,
      'f1'        -- F1 score rounded to 4 decimals,
      'incorrect' -- false-negative count (label 1 predicted as 0),
      'correct'   -- true-positive count (label 1 predicted as 1).
  """
  acc = metrics.accuracy_score(y_true, y_pred)
  f1 = metrics.f1_score(y_true, y_pred)
  # Pin the label set so the matrix is always 2x2. Without it, inputs that
  # contain a single class (e.g. every prediction correct) yield a 1x1 matrix
  # and the four-way unpacking below raises ValueError.
  _, _, fn, tp = metrics.confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
  return {
    'accuracy': np.around(acc, 4),
    'f1': np.around(f1, 4),
    # Counts are already integers, so no rounding is applied to them.
    'incorrect': fn,
    'correct': tp
  }

In [59]:
# Per-question outcomes for each task: 1 is counted as correct and 0 as
# incorrect by calculuate_metrics when compared against an all-ones reference.
y_extraction = np.array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1])
y_cohort = np.array([1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1])
y_all = np.concatenate((y_extraction, y_cohort))

# Report the combined result first, then each task on its own, so the cell
# reproduces all three sections shown in its recorded output.
for title, outcomes in (
  ('Overall', y_all),
  ('Information Extraction', y_extraction),
  ('Cohort Discovery', y_cohort),
):
  print(f'{title}:')
  # The reference is all ones: every question is expected to be answered correctly.
  print(calculuate_metrics([1]*outcomes.size, outcomes))

Overall:
{'accuracy': 0.88, 'f1': 0.9362, 'incorrect': 6, 'correct': 44}
Information Extraction:
{'accuracy': 0.9, 'f1': 0.9474, 'incorrect': 2, 'correct': 18}
Cohort Discovery:
{'accuracy': 0.8667, 'f1': 0.9286, 'incorrect': 4, 'correct': 26}


In [60]:
# Queries judged incorrect. Entry i here pairs with entry i of
# `correct_queries` below (the two lists are zipped together later on).
# NOTE(review): presumably these are the model-generated queries — confirm.
incorrect_queries = [
  # Pair 1: uses COUNT(DISTINCT collection_id); the correction drops DISTINCT.
  "SELECT COUNT(DISTINCT collection_id) FROM `bigquery-public-data.idc_current.dicom_all` GROUP BY collection_id HAVING COUNT(DISTINCT PatientID) >= 500",

  # Pair 2: counts scanners via StationName; the correction uses ManufacturerModelName.
  "SELECT COUNT(DISTINCT PatientID) AS num_patients, COUNT(*) AS num_images, COUNT(DISTINCT Modality) AS num_modalities, COUNT(DISTINCT BodyPartExamined) AS num_body_parts, COUNT(DISTINCT SequenceName) AS num_sequences, COUNT(DISTINCT StationName) AS num_scanners FROM `bigquery-public-data.idc_current.dicom_all` WHERE collection_id = 'upenn_gbm'",

  # Pair 3: counts StudyInstanceUID; the correction counts collection_id.
  "SELECT COUNT(DISTINCT StudyInstanceUID) FROM `bigquery-public-data.idc_current.dicom_all` WHERE Modality='MR' AND BodyPartExamined='BRAIN'",

  # Pair 4: case-sensitive LIKE '%Lung%'; the correction lower-cases the column and pattern.
  "SELECT DISTINCT collection_id FROM `bigquery-public-data.idc_current.dicom_all` WHERE modality = 'CT' AND BodyPartExamined LIKE '%Lung%'",

  # Pair 5: sums file_size; the correction sums instance_size.
  "SELECT collection_id, ROUND(SUM(file_size)/POWER(10,9),2) AS download_size_GB FROM `bigquery-public-data.idc_current.dicom_all` GROUP BY collection_id",

  # Pair 6: filters on StudyDescription; the correction filters on LOWER(collection_cancerType).
  "SELECT DISTINCT collection_id FROM `bigquery-public-data.idc_current.dicom_all` WHERE StudyDescription LIKE '%glioblastoma%'"
]

# Corrected counterparts of `incorrect_queries`, in the same order.
correct_queries = [
  "SELECT COUNT(collection_id) FROM `bigquery-public-data.idc_current.dicom_all` GROUP BY collection_id HAVING COUNT(DISTINCT PatientID) >= 500",

  "SELECT COUNT(DISTINCT PatientID) AS num_patients, COUNT(*) AS num_images, COUNT(DISTINCT Modality) AS num_modalities, COUNT(DISTINCT BodyPartExamined) AS num_body_parts, COUNT(DISTINCT SequenceName) AS num_sequences, COUNT(DISTINCT ManufacturerModelName) AS num_scanners FROM `bigquery-public-data.idc_current.dicom_all` WHERE collection_id = 'upenn_gbm'",

  "SELECT COUNT(DISTINCT collection_id) FROM `bigquery-public-data.idc_current.dicom_all` WHERE Modality='MR' AND BodyPartExamined='BRAIN'",

  "SELECT DISTINCT collection_id FROM `bigquery-public-data.idc_current.dicom_all` WHERE modality = 'CT' AND LOWER(BodyPartExamined) LIKE '%lung%'",

  "SELECT collection_id, ROUND(SUM(instance_size)/POWER(10,9),2) AS download_size_GB FROM `bigquery-public-data.idc_current.dicom_all` GROUP BY collection_id",

  "SELECT DISTINCT collection_id FROM `bigquery-public-data.idc_current.dicom_all` WHERE LOWER(collection_cancerType) LIKE '%glioblastoma%'"
]

In [61]:
# Edit distance between each corrected query and the incorrect query it
# replaces; the two lists are paired by position.
if len(correct_queries) != len(incorrect_queries):
  # zip() would silently drop the unmatched tail and skew the statistics.
  raise ValueError('correct_queries and incorrect_queries must have the same length')
lev = [
  Levenshtein.distance(correct, incorrect)
  for correct, incorrect in zip(correct_queries, incorrect_queries)
]
# Per-pair distances, then their mean and standard deviation (np.std default, ddof=0).
print(lev, np.mean(lev), np.std(lev))

[9, 14, 15, 8, 7, 24] 12.833333333333334 5.814254514170802
