Improve metadata (#12)
* Improves metadata about batches.

* Supports multiple batches per downloader/processor job.

* Adds organisms model for retrieving NCBI taxonomy IDs.
kurtwheeler committed Jun 2, 2017
1 parent 629709f commit ce380e5
Showing 45 changed files with 1,683 additions and 597 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,5 +1,6 @@
# Project specific files
workers/volume
foreman/volume

# Byte-compiled / optimized / DLL files
__pycache__/
16 changes: 15 additions & 1 deletion README.md
@@ -5,7 +5,9 @@ supported by Greene Lab.

## Getting Started

- Note: The following steps assume you have already installed PostgreSQL (>=9.4) and Python (Most versions should work, but this has been tested with Python 3.5) on Ubuntu (Tested with 16.04. It should be possible to use other versions or even a Mac though).
+ Note: The following steps assume you have already installed PostgreSQL (>=9.4)
+ and Python (>=3.5) on Ubuntu (Tested with 16.04. It should be possible to use
+ other versions or even a Mac though).

Run `./install.sh` to set up the virtualenv. It will activate the `dr_env`
for you the first time. This virtualenv is valid for the entire data_refinery
@@ -18,6 +20,18 @@ instructions on doing so.

## Development

R files in this repo follow
[Google's R Style Guide](https://google.github.io/styleguide/Rguide.xml).
Python files in this repo follow
[PEP 8](https://www.python.org/dev/peps/pep-0008/). All files (including
Python and R) have a line limit of 100 characters.

A `setup.cfg` file has been included in the root of this repo which specifies
the line-length limit for the autopep8 and flake8 linters. If you run either
of those programs from anywhere within the project's directory tree, they will
enforce a limit of 100 characters instead of the default 80. This also holds
for editors that rely on them.
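For reference, such a configuration might look like the sketch below. This is
an illustrative guess rather than the repo's actual file: the `[flake8]`
section name is standard, while autopep8 historically reads its defaults from
a `[pep8]` section, so the exact sections can vary with tool versions.

```ini
# Hypothetical sketch of the relevant setup.cfg sections; the real file
# in this repo may differ.
[flake8]
max-line-length = 100

[pep8]
max-line-length = 100
```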

It can be useful to have an interactive python interpreter running within the
context of the Docker container. The `run_shell.sh` script has been provided
for this purpose. It is in the top level directory so that if you wish to
58 changes: 51 additions & 7 deletions data_models/data_refinery_models/migrations/0001_initial.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
- # Generated by Django 1.11 on 2017-05-02 18:50
+ # Generated by Django 1.10.6 on 2017-05-26 15:12
from __future__ import unicode_literals

from django.db import migrations, models
@@ -22,14 +22,20 @@ class Migration(migrations.Migration):
('updated_at', models.DateTimeField()),
('source_type', models.CharField(max_length=256)),
('size_in_bytes', models.IntegerField()),
- ('download_url', models.CharField(max_length=2048)),
+ ('download_url', models.CharField(max_length=4096)),
('raw_format', models.CharField(max_length=256, null=True)),
('processed_format', models.CharField(max_length=256, null=True)),
('pipeline_required', models.CharField(max_length=256)),
- ('accession_code', models.CharField(max_length=32)),
+ ('platform_accession_code', models.CharField(max_length=32)),
+ ('experiment_accession_code', models.CharField(max_length=32)),
+ ('experiment_title', models.CharField(max_length=256)),
('status', models.CharField(max_length=20)),
+ ('release_date', models.DateField()),
+ ('last_uploaded_date', models.DateField()),
+ ('name', models.CharField(max_length=1024)),
('internal_location', models.CharField(max_length=256, null=True)),
- ('organism', models.IntegerField()),
+ ('organism_id', models.IntegerField()),
+ ('organism_name', models.CharField(max_length=256)),
],
options={
'db_table': 'batches',
@@ -60,12 +66,38 @@ class Migration(migrations.Migration):
('success', models.NullBooleanField()),
('num_retries', models.IntegerField(default=0)),
('worker_id', models.CharField(max_length=256, null=True)),
- ('batch', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='data_refinery_models.Batch')),
],
options={
'db_table': 'downloader_jobs',
},
),
+ migrations.CreateModel(
+ name='DownloaderJobsToBatches',
+ fields=[
+ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('created_at', models.DateTimeField(editable=False)),
+ ('updated_at', models.DateTimeField()),
+ ('batch', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='data_refinery_models.Batch')),
+ ('downloader_job', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='data_refinery_models.DownloaderJob')),
+ ],
+ options={
+ 'db_table': 'downloader_jobs_to_batches',
+ },
+ ),
+ migrations.CreateModel(
+ name='Organism',
+ fields=[
+ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('created_at', models.DateTimeField(editable=False)),
+ ('updated_at', models.DateTimeField()),
+ ('name', models.CharField(max_length=256)),
+ ('taxonomy_id', models.IntegerField()),
+ ('is_scientific_name', models.BooleanField(default=False)),
+ ],
+ options={
+ 'db_table': 'organisms',
+ },
+ ),
migrations.CreateModel(
name='ProcessorJob',
fields=[
@@ -78,12 +110,24 @@ class Migration(migrations.Migration):
('pipeline_applied', models.CharField(max_length=256)),
('num_retries', models.IntegerField(default=0)),
('worker_id', models.CharField(max_length=256, null=True)),
- ('batch', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='data_refinery_models.Batch')),
],
options={
'db_table': 'processor_jobs',
},
),
+ migrations.CreateModel(
+ name='ProcessorJobsToBatches',
+ fields=[
+ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('created_at', models.DateTimeField(editable=False)),
+ ('updated_at', models.DateTimeField()),
+ ('batch', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='data_refinery_models.Batch')),
+ ('processor_job', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='data_refinery_models.ProcessorJob')),
+ ],
+ options={
+ 'db_table': 'processor_jobs_to_batches',
+ },
+ ),
migrations.CreateModel(
name='SurveyJob',
fields=[
@@ -118,6 +162,6 @@ class Migration(migrations.Migration):
migrations.AddField(
model_name='batch',
name='survey_job',
- field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='data_refinery_models.SurveyJob'),
+ field=models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, to='data_refinery_models.SurveyJob'),
),
]
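Taken together, the two join tables above are what let a single downloader or
processor job span several batches. A minimal sketch of the intended usage,
assuming a configured Django environment; the helper functions here are
illustrative, not part of this commit:

```python
from data_refinery_models.models import (
    Batch,
    DownloaderJob,
    DownloaderJobsToBatches,
)


def queue_batches_for_job(job: DownloaderJob, batches) -> None:
    """Attach each batch to the job through the new join table."""
    for batch in batches:
        DownloaderJobsToBatches.objects.create(downloader_job=job, batch=batch)


def batches_for_job(job: DownloaderJob):
    """Recover a job's batches via Django's default reverse lookup name."""
    return Batch.objects.filter(downloaderjobstobatches__downloader_job=job)
```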
138 changes: 0 additions & 138 deletions data_models/data_refinery_models/models.py

This file was deleted.

13 changes: 13 additions & 0 deletions data_models/data_refinery_models/models/__init__.py
@@ -0,0 +1,13 @@
from data_refinery_models.models.surveys import SurveyJob, SurveyJobKeyValue
from data_refinery_models.models.batches import (
BatchStatuses,
Batch,
BatchKeyValue
)
from data_refinery_models.models.jobs import (
DownloaderJob,
ProcessorJob,
DownloaderJobsToBatches,
ProcessorJobsToBatches
)
from data_refinery_models.models.organism import Organism
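The `Organism` model itself is defined in a new `organism.py` module that is
not expanded in this view. Based only on the fields visible in the migration
above, a hedged guess at the kind of lookup it supports:

```python
from data_refinery_models.models import Organism


def taxonomy_id_for(organism_name: str) -> int:
    """Illustrative only: map an organism name to its NCBI taxonomy ID."""
    return Organism.objects.get(name=organism_name).taxonomy_id
```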
20 changes: 20 additions & 0 deletions data_models/data_refinery_models/models/base_models.py
@@ -0,0 +1,20 @@
from django.db import models
from django.utils import timezone


class TimeTrackedModel(models.Model):
"""Base model with auto created_at and updated_at fields."""

created_at = models.DateTimeField(editable=False)
updated_at = models.DateTimeField()

def save(self, *args, **kwargs):
""" On save, update timestamps """
current_time = timezone.now()
if not self.id:
self.created_at = current_time
self.updated_at = current_time
return super(TimeTrackedModel, self).save(*args, **kwargs)

class Meta:
abstract = True
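The effect of this override, sketched with a hypothetical subclass that is
not part of the repo:

```python
from django.db import models

from data_refinery_models.models.base_models import TimeTrackedModel


class Note(TimeTrackedModel):
    """Hypothetical model, for illustration only."""
    text = models.CharField(max_length=256)


note = Note(text="first save")
note.save()   # both created_at and updated_at are set to now
note.text = "second save"
note.save()   # created_at is preserved; only updated_at advances
```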
64 changes: 64 additions & 0 deletions data_models/data_refinery_models/models/batches.py
@@ -0,0 +1,64 @@
from enum import Enum
from django.db import models
from data_refinery_models.models.base_models import TimeTrackedModel
from data_refinery_models.models.surveys import SurveyJob


class BatchStatuses(Enum):
"""Valid values for the status field of the Batch model."""

NEW = "NEW"
DOWNLOADED = "DOWNLOADED"
PROCESSED = "PROCESSED"


class Batch(TimeTrackedModel):
"""Represents a batch of data.
The definition of a Batch is intentionally that vague. What a batch
is will vary from source to source. It could be a single file, or
a group of files with some kind of logical grouping such as an
experiment.
"""

survey_job = models.ForeignKey(SurveyJob, on_delete=models.PROTECT)
source_type = models.CharField(max_length=256)
size_in_bytes = models.IntegerField()
download_url = models.CharField(max_length=4096)
raw_format = models.CharField(max_length=256, null=True)
processed_format = models.CharField(max_length=256, null=True)
pipeline_required = models.CharField(max_length=256)
platform_accession_code = models.CharField(max_length=32)
experiment_accession_code = models.CharField(max_length=32)
experiment_title = models.CharField(max_length=256)
status = models.CharField(max_length=20)
release_date = models.DateField()
last_uploaded_date = models.DateField()
name = models.CharField(max_length=1024)

# This field will denote where in our system the file can be found.
internal_location = models.CharField(max_length=256, null=True)

# This corresponds to the organism taxonomy ID from NCBI.
organism_id = models.IntegerField()
# This is the organism name as it appeared in the experiment.
organism_name = models.CharField(max_length=256)

class Meta:
db_table = "batches"


class BatchKeyValue(TimeTrackedModel):
"""Tracks additional fields for Batches.
Useful for fields that would be sparsely populated if they were
their own columns. I.e. one source may have an extra field or two
that are worth tracking but are specific to that source.
"""

batch = models.ForeignKey(Batch, on_delete=models.CASCADE)
key = models.CharField(max_length=256)
value = models.CharField(max_length=256)

class Meta:
db_table = "batch_key_values"
