In [1]:
!coconnect run py --help

Usage: coconnect run py [OPTIONS] COMMAND [ARGS]...

  Commands for using python configurations to run the ETL transformation.

Options:
  --help  Show this message and exit.

Commands:
  list      List all the python classes there are available to run
  make      Generate a python class from the OMOP mapping json
  map       Perform OMOP Mapping given a python configuration file.
  register  Register a python class with the tool
  remove    remove a registered class


In [2]:
!coconnect run py make --name ExampleDataset ../data/rules.json

Recreating file /Users/calummacdonald/Usher/CO-CONNECT/Software/tests/demo-dataset/notebooks/ExampleDataset.py


This automatically creates a file that looks like this:

In [15]:
# %load ExampleDataset.py
from coconnect.cdm import define_person, define_condition_occurrence, define_visit_occurrence, define_measurement, define_observation, define_drug_exposure
from coconnect.cdm import CommonDataModel
import json

class ExampleDataset(CommonDataModel):
    """
    Rule set generated from the OMOP mapping json (``coconnect run py make``).

    Every decorated method fills the ``.series`` of its destination fields
    from one input file and then replaces the source terms in the two
    concept-id fields with a concept id.
    NOTE(review): assuming ``.series`` holds a pandas Series, ``.map`` turns
    any value missing from the lookup dict into NaN -- confirm.
    """

    def __init__(self, **kwargs):
        """
        Forward every keyword argument unchanged to CommonDataModel.
        """
        super().__init__(**kwargs)

    # ------------------------------------------------------------------
    # person
    # ------------------------------------------------------------------

    @define_person
    def person_0(self):
        """
        Person rule 0: "Demographics.csv", with Sex "Male" mapped to 8507.
        """
        demographics = self.inputs["Demographics.csv"]

        # --- source columns ---
        self.birth_datetime.series = demographics["Age"]
        self.gender_concept_id.series = demographics["Sex"]
        self.gender_source_concept_id.series = demographics["Sex"]
        self.gender_source_value.series = demographics["Sex"]
        self.person_id.series = demographics["ID"]

        # --- field operations ---
        # the raw "Age" series is passed through the tools helper
        age = self.birth_datetime.series
        self.birth_datetime.series = self.tools.get_datetime_from_age(age)

        # --- term mapping ---
        gender_lookup = {"Male": 8507}
        for field in (self.gender_concept_id, self.gender_source_concept_id):
            field.series = field.series.map(gender_lookup)

    @define_person
    def person_1(self):
        """
        Person rule 1: "Demographics.csv", with Sex "Female" mapped to 8532.
        """
        demographics = self.inputs["Demographics.csv"]

        # --- source columns ---
        self.birth_datetime.series = demographics["Age"]
        self.gender_concept_id.series = demographics["Sex"]
        self.gender_source_concept_id.series = demographics["Sex"]
        self.gender_source_value.series = demographics["Sex"]
        self.person_id.series = demographics["ID"]

        # --- field operations ---
        # the raw "Age" series is passed through the tools helper
        age = self.birth_datetime.series
        self.birth_datetime.series = self.tools.get_datetime_from_age(age)

        # --- term mapping ---
        gender_lookup = {"Female": 8532}
        for field in (self.gender_concept_id, self.gender_source_concept_id):
            field.series = field.series.map(gender_lookup)

    # ------------------------------------------------------------------
    # observation
    # ------------------------------------------------------------------

    @define_observation
    def observation_0(self):
        """
        Observation rule 0: "IgG" from "Serology.csv", concept fixed to 4288455.
        """
        serology = self.inputs["Serology.csv"]

        # --- source columns ---
        self.observation_concept_id.series = serology["IgG"]
        self.observation_datetime.series = serology["Date"]
        self.observation_source_concept_id.series = serology["IgG"]
        self.observation_source_value.series = serology["IgG"]
        self.person_id.series = serology["ID"]

        # --- term mapping ---
        # both concept fields go through make_scalar with the same constant
        igg_concept_id = 4288455
        for field in (self.observation_concept_id, self.observation_source_concept_id):
            field.series = self.tools.make_scalar(field.series, igg_concept_id)

    @define_observation
    def observation_1(self):
        """
        Observation rule 1: "Hospital_Visit.csv" reason "Heart Attack" -> 4059317.
        """
        visits = self.inputs["Hospital_Visit.csv"]

        # --- source columns ---
        self.observation_concept_id.series = visits["reason"]
        self.observation_datetime.series = visits["admission_date"]
        self.observation_source_concept_id.series = visits["reason"]
        self.observation_source_value.series = visits["reason"]
        self.person_id.series = visits["ID"]

        # --- term mapping ---
        reason_lookup = {"Heart Attack": 4059317}
        for field in (self.observation_concept_id, self.observation_source_concept_id):
            field.series = field.series.map(reason_lookup)

    @define_observation
    def observation_2(self):
        """
        Observation rule 2: "Hospital_Visit.csv" reason "COVID-19" -> 37311065.
        """
        visits = self.inputs["Hospital_Visit.csv"]

        # --- source columns ---
        self.observation_concept_id.series = visits["reason"]
        self.observation_datetime.series = visits["admission_date"]
        self.observation_source_concept_id.series = visits["reason"]
        self.observation_source_value.series = visits["reason"]
        self.person_id.series = visits["ID"]

        # --- term mapping ---
        reason_lookup = {"COVID-19": 37311065}
        for field in (self.observation_concept_id, self.observation_source_concept_id):
            field.series = field.series.map(reason_lookup)

    @define_observation
    def observation_3(self):
        """
        Observation rule 3: "Hospital_Visit.csv" reason "Cancer" -> 40757663.
        """
        visits = self.inputs["Hospital_Visit.csv"]

        # --- source columns ---
        self.observation_concept_id.series = visits["reason"]
        self.observation_datetime.series = visits["admission_date"]
        self.observation_source_concept_id.series = visits["reason"]
        self.observation_source_value.series = visits["reason"]
        self.person_id.series = visits["ID"]

        # --- term mapping ---
        reason_lookup = {"Cancer": 40757663}
        for field in (self.observation_concept_id, self.observation_source_concept_id):
            field.series = field.series.map(reason_lookup)

    # ------------------------------------------------------------------
    # condition_occurrence
    # (start and end datetime are always filled from the same date column)
    # ------------------------------------------------------------------

    @define_condition_occurrence
    def condition_occurrence_0(self):
        """
        Condition rule 0: "Symptoms.csv" column "Headache", "Yes" -> 378253.
        """
        symptoms = self.inputs["Symptoms.csv"]

        # --- source columns ---
        self.condition_concept_id.series = symptoms["Headache"]
        self.condition_end_datetime.series = symptoms["date_occurrence"]
        self.condition_source_concept_id.series = symptoms["Headache"]
        self.condition_source_value.series = symptoms["Headache"]
        self.condition_start_datetime.series = symptoms["date_occurrence"]
        self.person_id.series = symptoms["ID"]

        # --- term mapping ---
        symptom_lookup = {"Yes": 378253}
        for field in (self.condition_concept_id, self.condition_source_concept_id):
            field.series = field.series.map(symptom_lookup)

    @define_condition_occurrence
    def condition_occurrence_1(self):
        """
        Condition rule 1: "Symptoms.csv" column "Fatigue", "Yes" -> 4223659.
        """
        symptoms = self.inputs["Symptoms.csv"]

        # --- source columns ---
        self.condition_concept_id.series = symptoms["Fatigue"]
        self.condition_end_datetime.series = symptoms["date_occurrence"]
        self.condition_source_concept_id.series = symptoms["Fatigue"]
        self.condition_source_value.series = symptoms["Fatigue"]
        self.condition_start_datetime.series = symptoms["date_occurrence"]
        self.person_id.series = symptoms["ID"]

        # --- term mapping ---
        symptom_lookup = {"Yes": 4223659}
        for field in (self.condition_concept_id, self.condition_source_concept_id):
            field.series = field.series.map(symptom_lookup)

    @define_condition_occurrence
    def condition_occurrence_2(self):
        """
        Condition rule 2: "Symptoms.csv" column "Dizzy", "Yes" -> 4223938.
        """
        symptoms = self.inputs["Symptoms.csv"]

        # --- source columns ---
        self.condition_concept_id.series = symptoms["Dizzy"]
        self.condition_end_datetime.series = symptoms["date_occurrence"]
        self.condition_source_concept_id.series = symptoms["Dizzy"]
        self.condition_source_value.series = symptoms["Dizzy"]
        self.condition_start_datetime.series = symptoms["date_occurrence"]
        self.person_id.series = symptoms["ID"]

        # --- term mapping ---
        symptom_lookup = {"Yes": 4223938}
        for field in (self.condition_concept_id, self.condition_source_concept_id):
            field.series = field.series.map(symptom_lookup)

    @define_condition_occurrence
    def condition_occurrence_3(self):
        """
        Condition rule 3: "Symptoms.csv" column "Cough", "Yes" -> 254761.
        """
        symptoms = self.inputs["Symptoms.csv"]

        # --- source columns ---
        self.condition_concept_id.series = symptoms["Cough"]
        self.condition_end_datetime.series = symptoms["date_occurrence"]
        self.condition_source_concept_id.series = symptoms["Cough"]
        self.condition_source_value.series = symptoms["Cough"]
        self.condition_start_datetime.series = symptoms["date_occurrence"]
        self.person_id.series = symptoms["ID"]

        # --- term mapping ---
        symptom_lookup = {"Yes": 254761}
        for field in (self.condition_concept_id, self.condition_source_concept_id):
            field.series = field.series.map(symptom_lookup)

    @define_condition_occurrence
    def condition_occurrence_4(self):
        """
        Condition rule 4: "Symptoms.csv" column "Fever", "Yes" -> 437663.
        """
        symptoms = self.inputs["Symptoms.csv"]

        # --- source columns ---
        self.condition_concept_id.series = symptoms["Fever"]
        self.condition_end_datetime.series = symptoms["date_occurrence"]
        self.condition_source_concept_id.series = symptoms["Fever"]
        self.condition_source_value.series = symptoms["Fever"]
        self.condition_start_datetime.series = symptoms["date_occurrence"]
        self.person_id.series = symptoms["ID"]

        # --- term mapping ---
        symptom_lookup = {"Yes": 437663}
        for field in (self.condition_concept_id, self.condition_source_concept_id):
            field.series = field.series.map(symptom_lookup)

    @define_condition_occurrence
    def condition_occurrence_5(self):
        """
        Condition rule 5: "Symptoms.csv" column "Muscle_Pain", "Yes" -> 442752.
        """
        symptoms = self.inputs["Symptoms.csv"]

        # --- source columns ---
        self.condition_concept_id.series = symptoms["Muscle_Pain"]
        self.condition_end_datetime.series = symptoms["date_occurrence"]
        self.condition_source_concept_id.series = symptoms["Muscle_Pain"]
        self.condition_source_value.series = symptoms["Muscle_Pain"]
        self.condition_start_datetime.series = symptoms["date_occurrence"]
        self.person_id.series = symptoms["ID"]

        # --- term mapping ---
        symptom_lookup = {"Yes": 442752}
        for field in (self.condition_concept_id, self.condition_source_concept_id):
            field.series = field.series.map(symptom_lookup)

    @define_condition_occurrence
    def condition_occurrence_6(self):
        """
        Condition rule 6: "Hospital_Visit.csv" reason "Pneumonia" -> 255848.
        """
        visits = self.inputs["Hospital_Visit.csv"]

        # --- source columns ---
        self.condition_concept_id.series = visits["reason"]
        self.condition_end_datetime.series = visits["admission_date"]
        self.condition_source_concept_id.series = visits["reason"]
        self.condition_source_value.series = visits["reason"]
        self.condition_start_datetime.series = visits["admission_date"]
        self.person_id.series = visits["ID"]

        # --- term mapping ---
        reason_lookup = {"Pneumonia": 255848}
        for field in (self.condition_concept_id, self.condition_source_concept_id):
            field.series = field.series.map(reason_lookup)

    @define_condition_occurrence
    def condition_occurrence_7(self):
        """
        Condition rule 7: "GP_Records.csv" comorbidity "Mental Health" -> 4131548.
        """
        gp_records = self.inputs["GP_Records.csv"]

        # --- source columns ---
        self.condition_concept_id.series = gp_records["comorbidity"]
        self.condition_end_datetime.series = gp_records["date_of_visit"]
        self.condition_source_concept_id.series = gp_records["comorbidity"]
        self.condition_source_value.series = gp_records["comorbidity"]
        self.condition_start_datetime.series = gp_records["date_of_visit"]
        self.person_id.series = gp_records["ID"]

        # --- term mapping ---
        comorbidity_lookup = {"Mental Health": 4131548}
        for field in (self.condition_concept_id, self.condition_source_concept_id):
            field.series = field.series.map(comorbidity_lookup)

    @define_condition_occurrence
    def condition_occurrence_8(self):
        """
        Condition rule 8: "GP_Records.csv" comorbidity "Mental Health" -> 432586.
        """
        gp_records = self.inputs["GP_Records.csv"]

        # --- source columns ---
        self.condition_concept_id.series = gp_records["comorbidity"]
        self.condition_end_datetime.series = gp_records["date_of_visit"]
        self.condition_source_concept_id.series = gp_records["comorbidity"]
        self.condition_source_value.series = gp_records["comorbidity"]
        self.condition_start_datetime.series = gp_records["date_of_visit"]
        self.person_id.series = gp_records["ID"]

        # --- term mapping ---
        comorbidity_lookup = {"Mental Health": 432586}
        for field in (self.condition_concept_id, self.condition_source_concept_id):
            field.series = field.series.map(comorbidity_lookup)

    @define_condition_occurrence
    def condition_occurrence_9(self):
        """
        Condition rule 9: "GP_Records.csv" comorbidity "Diabetes Type-II" -> 201826.
        """
        gp_records = self.inputs["GP_Records.csv"]

        # --- source columns ---
        self.condition_concept_id.series = gp_records["comorbidity"]
        self.condition_end_datetime.series = gp_records["date_of_visit"]
        self.condition_source_concept_id.series = gp_records["comorbidity"]
        self.condition_source_value.series = gp_records["comorbidity"]
        self.condition_start_datetime.series = gp_records["date_of_visit"]
        self.person_id.series = gp_records["ID"]

        # --- term mapping ---
        comorbidity_lookup = {"Diabetes Type-II": 201826}
        for field in (self.condition_concept_id, self.condition_source_concept_id):
            field.series = field.series.map(comorbidity_lookup)

    @define_condition_occurrence
    def condition_occurrence_10(self):
        """
        Condition rule 10: "GP_Records.csv" comorbidity "Heart Condition" -> 4185932.
        """
        gp_records = self.inputs["GP_Records.csv"]

        # --- source columns ---
        self.condition_concept_id.series = gp_records["comorbidity"]
        self.condition_end_datetime.series = gp_records["date_of_visit"]
        self.condition_source_concept_id.series = gp_records["comorbidity"]
        self.condition_source_value.series = gp_records["comorbidity"]
        self.condition_start_datetime.series = gp_records["date_of_visit"]
        self.person_id.series = gp_records["ID"]

        # --- term mapping ---
        comorbidity_lookup = {"Heart Condition": 4185932}
        for field in (self.condition_concept_id, self.condition_source_concept_id):
            field.series = field.series.map(comorbidity_lookup)

    @define_condition_occurrence
    def condition_occurrence_11(self):
        """
        Condition rule 11: "GP_Records.csv" comorbidity "High Blood Pressure" -> 316866.
        """
        gp_records = self.inputs["GP_Records.csv"]

        # --- source columns ---
        self.condition_concept_id.series = gp_records["comorbidity"]
        self.condition_end_datetime.series = gp_records["date_of_visit"]
        self.condition_source_concept_id.series = gp_records["comorbidity"]
        self.condition_source_value.series = gp_records["comorbidity"]
        self.condition_start_datetime.series = gp_records["date_of_visit"]
        self.person_id.series = gp_records["ID"]

        # --- term mapping ---
        comorbidity_lookup = {"High Blood Pressure": 316866}
        for field in (self.condition_concept_id, self.condition_source_concept_id):
            field.series = field.series.map(comorbidity_lookup)

    # ------------------------------------------------------------------
    # drug_exposure
    # (start and end datetime are both filled from "date_of_vaccination")
    # ------------------------------------------------------------------

    @define_drug_exposure
    def drug_exposure_0(self):
        """
        Drug rule 0: "Vaccinations.csv" type "Moderna" -> 35894915.
        """
        vaccinations = self.inputs["Vaccinations.csv"]

        # --- source columns ---
        self.drug_concept_id.series = vaccinations["type"]
        self.drug_exposure_end_datetime.series = vaccinations["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = vaccinations["date_of_vaccination"]
        self.drug_source_concept_id.series = vaccinations["type"]
        self.drug_source_value.series = vaccinations["type"]
        self.person_id.series = vaccinations["ID"]

        # --- term mapping ---
        vaccine_lookup = {"Moderna": 35894915}
        for field in (self.drug_concept_id, self.drug_source_concept_id):
            field.series = field.series.map(vaccine_lookup)

    @define_drug_exposure
    def drug_exposure_1(self):
        """
        Drug rule 1: "Vaccinations.csv" type "AstraZenica" -> 35894915.
        """
        vaccinations = self.inputs["Vaccinations.csv"]

        # --- source columns ---
        self.drug_concept_id.series = vaccinations["type"]
        self.drug_exposure_end_datetime.series = vaccinations["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = vaccinations["date_of_vaccination"]
        self.drug_source_concept_id.series = vaccinations["type"]
        self.drug_source_value.series = vaccinations["type"]
        self.person_id.series = vaccinations["ID"]

        # --- term mapping ---
        vaccine_lookup = {"AstraZenica": 35894915}
        for field in (self.drug_concept_id, self.drug_source_concept_id):
            field.series = field.series.map(vaccine_lookup)

    @define_drug_exposure
    def drug_exposure_2(self):
        """
        Drug rule 2: "Vaccinations.csv" type "Pfizer" -> 35894915.
        """
        vaccinations = self.inputs["Vaccinations.csv"]

        # --- source columns ---
        self.drug_concept_id.series = vaccinations["type"]
        self.drug_exposure_end_datetime.series = vaccinations["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = vaccinations["date_of_vaccination"]
        self.drug_source_concept_id.series = vaccinations["type"]
        self.drug_source_value.series = vaccinations["type"]
        self.person_id.series = vaccinations["ID"]

        # --- term mapping ---
        vaccine_lookup = {"Pfizer": 35894915}
        for field in (self.drug_concept_id, self.drug_source_concept_id):
            field.series = field.series.map(vaccine_lookup)

    @define_drug_exposure
    def drug_exposure_3(self):
        """
        Drug rule 3: "Vaccinations.csv" type "Moderna" -> 37003518.
        """
        vaccinations = self.inputs["Vaccinations.csv"]

        # --- source columns ---
        self.drug_concept_id.series = vaccinations["type"]
        self.drug_exposure_end_datetime.series = vaccinations["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = vaccinations["date_of_vaccination"]
        self.drug_source_concept_id.series = vaccinations["type"]
        self.drug_source_value.series = vaccinations["type"]
        self.person_id.series = vaccinations["ID"]

        # --- term mapping ---
        vaccine_lookup = {"Moderna": 37003518}
        for field in (self.drug_concept_id, self.drug_source_concept_id):
            field.series = field.series.map(vaccine_lookup)

    @define_drug_exposure
    def drug_exposure_4(self):
        """
        Drug rule 4: "Vaccinations.csv" type "Pfizer" -> 37003436.
        """
        vaccinations = self.inputs["Vaccinations.csv"]

        # --- source columns ---
        self.drug_concept_id.series = vaccinations["type"]
        self.drug_exposure_end_datetime.series = vaccinations["date_of_vaccination"]
        self.drug_exposure_start_datetime.series = vaccinations["date_of_vaccination"]
        self.drug_source_concept_id.series = vaccinations["type"]
        self.drug_source_value.series = vaccinations["type"]
        self.person_id.series = vaccinations["ID"]

        # --- term mapping ---
        vaccine_lookup = {"Pfizer": 37003436}
        for field in (self.drug_concept_id, self.drug_source_concept_id):
            field.series = field.series.map(vaccine_lookup)



Load some inputs:

In [5]:
import coconnect
import glob
# Hand every path matching ../data/part1/* to coconnect's CSV loader; the bare
# `inputs` expression on the last line displays the returned collection object.
inputs = coconnect.tools.load_csv(glob.glob('../data/part1/*'))
inputs

[32m2022-03-16 14:45:48[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - DataCollection Object Created
[32m2022-03-16 14:45:48[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Symptoms.csv [<coconnect.io.common.DataBrick object at 0x109534430>]
[32m2022-03-16 14:45:48[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Blood_Test.csv [<coconnect.io.common.DataBrick object at 0x1093bfdf0>]
[32m2022-03-16 14:45:48[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Serology.csv [<coconnect.io.common.DataBrick object at 0x1093bfa30>]
[32m2022-03-16 14:45:49[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  GP_Records.csv [<coconnect.io.common.DataBrick object at 0x1093bf220>]
[32m2022-03-16 14:45:49[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Vaccinations.csv [<coconnect.io.common.DataBrick object at 0x1093bf280>]
[32m2022-03-16 14:45:49[0m - [34mLocalDataCollection[0m - [1;37mINF

<coconnect.io.plugins.local.LocalDataCollection at 0x1095341c0>

A new instance can be created from the generated Python class:

In [16]:
# Instantiate the generated class with the `inputs` collection loaded earlier;
# the bare `instance` expression displays the object's repr.
instance = ExampleDataset(inputs=inputs)
instance

[32m2022-03-17 10:52:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - CommonDataModel (5.3.1) created with co-connect-tools version 0.0.0
[32m2022-03-17 10:52:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Running with an DataCollection object
[32m2022-03-17 10:52:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Turning on automatic cdm column filling
[32m2022-03-17 10:52:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Added condition_occurrence_0 of type condition_occurrence
[32m2022-03-17 10:52:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Added condition_occurrence_1 of type condition_occurrence
[32m2022-03-17 10:52:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Added condition_occurrence_10 of type condition_occurrence
[32m2022-03-17 10:52:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Added condition_occurrence_11 of type condition_occurrence
[32m2022-03-17 10:52:54[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Added condition_occu

<__main__.ExampleDataset at 0x13394cee0>

In [17]:
# Run the transformation; the log output reports the table processing order
# and the number of objects processed for each table.
instance.process()

[32m2022-03-17 10:52:58[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Starting processing in order: ['person', 'condition_occurrence', 'drug_exposure', 'observation']
[32m2022-03-17 10:52:58[0m - [34mExampleDataset[0m - [1;37mINFO[0m - Number of objects to process for each table...
{
      "condition_occurrence": 12,
      "drug_exposure": 5,
      "observation": 4,
      "person": 2
}
[32m2022-03-17 10:52:58[0m - [34mExampleDataset[0m - [1;37mINFO[0m - for person: found 2 objects
[32m2022-03-17 10:52:58[0m - [34mExampleDataset[0m - [1;37mINFO[0m - working on person
[32m2022-03-17 10:52:58[0m - [34mExampleDataset[0m - [1;37mINFO[0m - starting on person_0
[32m2022-03-17 10:52:58[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Retrieving initial dataframe for 'Demographics.csv' for the first time
[32m2022-03-17 10:52:59[0m - [34mPerson[0m - [1;37mINFO[0m - Automatically formatting data columns.
[32m2022-03-17 10:53:00[0m - [34mPerson[0m - [

[32m2022-03-17 10:53:09[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mEither they are not in the original data, or while creating the person table, [0m
[32m2022-03-17 10:53:09[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mstudies have been removed due to lack of required fields, such as birthdate.[0m
[32m2022-03-17 10:53:09[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31m12773/12846 were good, 73 studies are removed.[0m
[32m2022-03-17 10:53:09[0m - [34mExampleDataset[0m - [1;37mINFO[0m - starting on condition_occurrence_2
[32m2022-03-17 10:53:09[0m - [34mConditionOccurrence[0m - [1;37mINFO[0m - Automatically formatting data columns.
[32m2022-03-17 10:53:09[0m - [34mConditionOccurrence[0m - [1;37mINFO[0m - created df (0x13662d6a0)[condition_occurrence_2]
[32m2022-03-17 10:53:09[0m - [34mExampleDataset[0m - [1;37mINFO[0m - finished condition_occurrence_2 (0x13662d6a0) ... 5/12 completed, 13046 rows
[32m2022-03-17 10:53:09[0m - 

[32m2022-03-17 10:53:15[0m - [34mConditionOccurrence[0m - [1;37mINFO[0m - created df (0x137c1e190)[condition_occurrence_7]
[32m2022-03-17 10:53:15[0m - [34mExampleDataset[0m - [1;37mINFO[0m - finished condition_occurrence_7 (0x137c1e190) ... 10/12 completed, 46141 rows
[32m2022-03-17 10:53:16[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mThere are person_ids in this table that are not in the output person table![0m
[32m2022-03-17 10:53:16[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mEither they are not in the original data, or while creating the person table, [0m
[32m2022-03-17 10:53:16[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mstudies have been removed due to lack of required fields, such as birthdate.[0m
[32m2022-03-17 10:53:16[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31m45893/46141 were good, 248 studies are removed.[0m
[32m2022-03-17 10:53:16[0m - [34mExampleDataset[0m - [1;37mINFO[0m - starting on condition_o

[32m2022-03-17 10:53:20[0m - [34mExampleDataset[0m - [1;37mINFO[0m - starting on drug_exposure_0
[32m2022-03-17 10:53:20[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Retrieving initial dataframe for 'Vaccinations.csv' for the first time
[32m2022-03-17 10:53:20[0m - [34mDrugExposure[0m - [1;37mINFO[0m - Automatically formatting data columns.
[32m2022-03-17 10:53:21[0m - [34mDrugExposure[0m - [1;37mINFO[0m - created df (0x13812b520)[drug_exposure_0]
[32m2022-03-17 10:53:21[0m - [34mExampleDataset[0m - [1;37mINFO[0m - finished drug_exposure_0 (0x13812b520) ... 1/5 completed, 24484 rows
[32m2022-03-17 10:53:22[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mThere are person_ids in this table that are not in the output person table![0m
[32m2022-03-17 10:53:22[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mEither they are not in the original data, or while creating the person table, [0m
[32m2022-03-17 10:53:22[0m - [34mExampleDataset

[32m2022-03-17 10:53:29[0m - [34mExampleDataset[0m - [1;37mINFO[0m - starting on observation_0
[32m2022-03-17 10:53:29[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Retrieving initial dataframe for 'Serology.csv' for the first time
[32m2022-03-17 10:53:29[0m - [34mObservation[0m - [1;37mINFO[0m - Automatically formatting data columns.
[32m2022-03-17 10:53:30[0m - [34mObservation[0m - [1;37mINFO[0m - created df (0x13bd0bdf0)[observation_0]
[32m2022-03-17 10:53:30[0m - [34mExampleDataset[0m - [1;37mINFO[0m - finished observation_0 (0x13bd0bdf0) ... 1/4 completed, 41140 rows
[32m2022-03-17 10:53:30[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mThere are person_ids in this table that are not in the output person table![0m
[32m2022-03-17 10:53:30[0m - [34mExampleDataset[0m - [1;37mERROR[0m - [31mEither they are not in the original data, or while creating the person table, [0m
[32m2022-03-17 10:53:30[0m - [34mExampleDataset[0m - [1;3

In [18]:
# List the names of the tables available on the instance after processing.
instance.keys()

dict_keys(['person', 'condition_occurrence', 'drug_exposure', 'observation'])

In [19]:
# Display the observation table, leaving out every column that contains a
# missing value (dropna with axis=1 drops columns, not rows).
instance['observation'].dropna(axis=1)

Unnamed: 0_level_0,person_id,observation_concept_id,observation_date,observation_datetime,observation_source_value,observation_source_concept_id
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,36796.0,4288455,2021-10-22,2021-10-22 00:00:00.000000,57.527969992111224,4288455
2,17453.0,4288455,2020-10-26,2020-10-26 00:00:00.000000,1.2722186060345093,4288455
3,58448.0,4288455,2021-02-03,2021-02-03 00:00:00.000000,88.1524834152799,4288455
4,83263.0,4288455,2020-04-05,2020-04-05 00:00:00.000000,54.55584411002554,4288455
5,6544.0,4288455,2022-11-19,2022-11-19 00:00:00.000000,29.283913796854904,4288455
...,...,...,...,...,...,...
118869,54967.0,40757663,2019-05-10,2019-05-10 00:00:00.000000,Cancer,40757663
118870,54969.0,40757663,2020-01-09,2020-01-09 00:00:00.000000,Cancer,40757663
118871,54969.0,40757663,2021-04-28,2021-04-28 00:00:00.000000,Cancer,40757663
118872,54973.0,40757663,2019-01-29,2019-01-29 00:00:00.000000,Cancer,40757663


## Manually edited 

Once a Python class has been generated from the rules file, you can edit it by hand, for example to set up the inputs and outputs inside the class or to adjust how individual tables are built. The edited file can then simply be run as a Python script:
```
python  ExampleDatasetModified.py
```


In [11]:
# %load ExampleDatasetModified.py
from coconnect.cdm import define_person, define_condition_occurrence, define_visit_occurrence, define_measurement, define_observation, define_drug_exposure
from coconnect.cdm import CommonDataModel
from coconnect.tools import load_csv,create_csv_store
import json
import glob
import pandas as pd

class ExampleDatasetModified(CommonDataModel):
    """
    Hand-edited variant of the class generated from the rules file.

    Compared with the generated class, this one:
      * wires up its own inputs (CSV files under ../data/part1/) and
        outputs (tab-separated files under ./data_tests/),
      * runs the transformation as soon as it is constructed,
      * rescales the IgG measurement with a custom, date-dependent function
        and fills the extra value_as_number / unit_source_value columns.
    """

    def __init__(self,**kwargs):
        """
        Set up the inputs and outputs, initialise the model and run it.

        Args:
            **kwargs: forwarded unchanged to CommonDataModel.__init__
                      (``inputs`` and ``outputs`` are supplied here, so they
                      must not be passed in again).
        """
        inputs = load_csv(glob.glob('../data/part1/*'))
        outputs = create_csv_store(output_folder="./data_tests/",
                                   sep="\t",
                                   write_separate=True,
                                   write_mode='w')

        super().__init__(inputs=inputs,outputs=outputs,**kwargs)
        # the transformation is executed on construction, so creating an
        # instance is all that is needed to run this file as a script
        self.process()

    @define_person
    def person_0(self):
        """
        Create CDM object for person
        """
        demographics = self.inputs["Demographics.csv"]

        self.birth_datetime.series = demographics["Age"]
        self.gender_concept_id.series = demographics["Sex"]
        self.gender_source_concept_id.series = demographics["Sex"]
        self.gender_source_value.series = demographics["Sex"]
        self.person_id.series = demographics["ID"]

        # --- insert field operations ---
        # the source only holds an age, which is converted via the tools helper
        self.birth_datetime.series = self.tools.get_datetime_from_age(self.birth_datetime.series)

        # --- insert term mapping ---
        # values that are not in the dictionary become NaN (pandas Series.map)
        self.gender_concept_id.series = self.gender_concept_id.series.map(
            {
                "Male": 8507,
                "Female": 8532
            }
        )

    @define_observation
    def observation_0(self):
        """
        Create CDM object for observation

        The raw IgG value is kept as the source value, while a rescaled
        value (see convert_igg) is stored in value_as_number together with
        its unit.
        """
        # readings taken before this year get the recalibration factor
        recalibration_year = 2021
        recalibration_factor = 1.2
        # factor applied to every reading to express it in g/L
        to_g_per_l = 10
        # concept id used for both the observation and its source concept
        igg_concept_id = 4288455

        def convert_igg(x):
            """
            A custom function to convert the IgG into g/L

            Expects a row with a datetime 'Date' and an 'IgG' value.
            """
            #example of a dataset where the assay has been recalibrated after a certain date
            #therefore you might need to do some conversion based upon the date
            factor = recalibration_factor if x['Date'].year < recalibration_year else 1

            #apply a factor to convert to g/L
            factor = factor * to_g_per_l

            #return the modified IgG value
            return float(x['IgG'])*factor

        # Work on a private copy: the columns below are overwritten, and doing
        # that on the frame returned by self.inputs would leave already-scaled
        # IgG values behind for any later read of "Serology.csv"
        # (NOTE(review): assumes self.inputs[...] hands back the same frame on
        # every access - confirm against the coconnect version in use).
        serology = self.inputs["Serology.csv"].copy()

        #save the source value of the IgG (before it is rescaled)
        self.observation_source_value.series = serology["IgG"]

        #convert the date into a datetime object
        serology["Date"] = pd.to_datetime(serology["Date"])

        #recalculate the IgG based upon a custom function
        serology["IgG"] = serology.apply(convert_igg,axis=1)
        #set the output units
        serology["Units"] = 'g/L'

        #set additional columns we did not have before...
        self.unit_source_value.series = serology["Units"]
        self.value_as_number.series = serology["IgG"]

        self.observation_concept_id.series = serology["IgG"]
        self.observation_datetime.series = serology["Date"]
        self.observation_source_concept_id.series = serology["IgG"]
        self.person_id.series = serology["ID"]

        # --- insert term mapping ---
        # both concept columns are passed through make_scalar with the same id
        self.observation_concept_id.series = self.tools.make_scalar(self.observation_concept_id.series,igg_concept_id)
        self.observation_source_concept_id.series = self.tools.make_scalar(self.observation_source_concept_id.series,igg_concept_id)
        


In [12]:
# Creating the instance is enough to run the transformation: __init__ calls self.process()
instance = ExampleDatasetModified()
instance

[32m2022-03-16 14:46:24[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - DataCollection Object Created
[32m2022-03-16 14:46:24[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Symptoms.csv [<coconnect.io.common.DataBrick object at 0x12e49d8b0>]
[32m2022-03-16 14:46:24[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Blood_Test.csv [<coconnect.io.common.DataBrick object at 0x12e49da30>]
[32m2022-03-16 14:46:24[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Serology.csv [<coconnect.io.common.DataBrick object at 0x12e49dca0>]
[32m2022-03-16 14:46:24[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  GP_Records.csv [<coconnect.io.common.DataBrick object at 0x12e49d7f0>]
[32m2022-03-16 14:46:24[0m - [34mLocalDataCollection[0m - [1;37mINFO[0m - Registering  Vaccinations.csv [<coconnect.io.common.DataBrick object at 0x12e49d820>]
[32m2022-03-16 14:46:25[0m - [34mLocalDataCollection[0m - [1;37mINF

<__main__.ExampleDatasetModified at 0x12e49d400>

In [13]:
# List the names of the CDM tables held by the modified instance
instance.keys()

dict_keys(['person', 'observation'])

In [14]:
# Show the observation table; dropna(axis=1) removes every column that contains a missing value
instance['observation'].dropna(axis=1)

Unnamed: 0_level_0,person_id,observation_concept_id,observation_date,observation_datetime,value_as_number,observation_source_value,observation_source_concept_id,unit_source_value
observation_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,66578,4288455,2021-10-22,2021-10-22 00:00:00.000000,57.52797,57.527969992111224,4288455,g/L
2,31668,4288455,2020-10-26,2020-10-26 00:00:00.000000,1.526662,1.2722186060345093,4288455,g/L
3,7717,4288455,2021-02-03,2021-02-03 00:00:00.000000,88.152483,88.1524834152799,4288455,g/L
4,63190,4288455,2020-04-05,2020-04-05 00:00:00.000000,65.467013,54.55584411002554,4288455,g/L
5,11876,4288455,2022-11-19,2022-11-19 00:00:00.000000,29.283914,29.283913796854904,4288455,g/L
...,...,...,...,...,...,...,...,...
41136,34692,4288455,2021-04-10,2021-04-10 00:00:00.000000,44.055308,44.055307552316364,4288455,g/L
41137,14922,4288455,2021-03-16,2021-03-16 00:00:00.000000,68.023008,68.02300798516276,4288455,g/L
41138,61035,4288455,2021-05-25,2021-05-25 00:00:00.000000,20.857946,20.857945874128763,4288455,g/L
41139,12653,4288455,2021-01-22,2021-01-22 00:00:00.000000,22.646181,22.64618089045576,4288455,g/L
