Improve metadata (#12)
* Improves metadata about batches.

* Supports multiple batches per downloader/processor job.

* Adds organisms model for retrieving NCBI taxonomy IDs.
kurtwheeler committed Jun 2, 2017
1 parent 629709f commit ce380e5
Showing 45 changed files with 1,683 additions and 597 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,5 +1,6 @@
# Project specific files
workers/volume
foreman/volume

# Byte-compiled / optimized / DLL files
__pycache__/
16 changes: 15 additions & 1 deletion README.md
@@ -5,7 +5,9 @@ supported by Greene Lab.

## Getting Started

- Note: The following steps assume you have already installed PostgreSQL (>=9.4) and Python (Most versions should work, but this has been tested with Python 3.5) on Ubuntu (Tested with 16.04. It should be possible to use other versions or even a Mac though).
+ Note: The following steps assume you have already installed PostgreSQL (>=9.4)
+ and Python (>=3.5) on Ubuntu (Tested with 16.04. It should be possible to use
+ other versions or even a Mac though).

Run `./install.sh` to set up the virtualenv. It will activate the `dr_env`
for you the first time. This virtualenv is valid for the entire data_refinery
@@ -18,6 +20,18 @@ instructions on doing so.

## Development

R files in this repo follow
[Google's R Style Guide](https://google.github.io/styleguide/Rguide.xml).
Python files in this repo follow
[PEP 8](https://www.python.org/dev/peps/pep-0008/). All files (including
Python and R) have a line limit of 100 characters.

A `setup.cfg` file has been included in the root of this repo which specifies
the line-length limit for the autopep8 and flake8 linters. If you run either
of those programs from anywhere within the project's directory tree, they will
enforce a limit of 100 characters instead of the default 80. This also holds
for editors that rely on them.
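For reference, such a configuration might look like the sketch below. This is
an illustrative guess rather than the repo's actual file: the `[flake8]`
section name is standard, while autopep8 historically reads its defaults from
a `[pep8]` section, so the exact sections can vary with tool versions.

```ini
# Hypothetical sketch of the relevant setup.cfg sections; the real file
# in this repo may differ.
[flake8]
max-line-length = 100

[pep8]
max-line-length = 100
```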

It can be useful to have an interactive python interpreter running within the
context of the Docker container. The `run_shell.sh` script has been provided
for this purpose. It is in the top level directory so that if you wish to
58 changes: 51 additions & 7 deletions data_models/data_refinery_models/migrations/0001_initial.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
- # Generated by Django 1.11 on 2017-05-02 18:50
+ # Generated by Django 1.10.6 on 2017-05-26 15:12
from __future__ import unicode_literals

from django.db import migrations, models
@@ -22,14 +22,20 @@ class Migration(migrations.Migration):
('updated_at', models.DateTimeField()),
('source_type', models.CharField(max_length=256)),
('size_in_bytes', models.IntegerField()),
- ('download_url', models.CharField(max_length=2048)),
+ ('download_url', models.CharField(max_length=4096)),
('raw_format', models.CharField(max_length=256, null=True)),
('processed_format', models.CharField(max_length=256, null=True)),
('pipeline_required', models.CharField(max_length=256)),
- ('accession_code', models.CharField(max_length=32)),
+ ('platform_accession_code', models.CharField(max_length=32)),
+ ('experiment_accession_code', models.CharField(max_length=32)),
+ ('experiment_title', models.CharField(max_length=256)),
('status', models.CharField(max_length=20)),
+ ('release_date', models.DateField()),
+ ('last_uploaded_date', models.DateField()),
+ ('name', models.CharField(max_length=1024)),
('internal_location', models.CharField(max_length=256, null=True)),
- ('organism', models.IntegerField()),
+ ('organism_id', models.IntegerField()),
+ ('organism_name', models.CharField(max_length=256)),
],
options={
'db_table': 'batches',
@@ -60,12 +66,38 @@ class Migration(migrations.Migration):
('success', models.NullBooleanField()),
('num_retries', models.IntegerField(default=0)),
('worker_id', models.CharField(max_length=256, null=True)),
- ('batch', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='data_refinery_models.Batch')),
],
options={
'db_table': 'downloader_jobs',
},
),
+ migrations.CreateModel(
+ name='DownloaderJobsToBatches',
+ fields=[
+ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('created_at', models.DateTimeField(editable=False)),
+ ('updated_at', models.DateTimeField()),
+ ('batch', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='data_refinery_models.Batch')),
+ ('downloader_job', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='data_refinery_models.DownloaderJob')),
+ ],
+ options={
+ 'db_table': 'downloader_jobs_to_batches',
+ },
+ ),
+ migrations.CreateModel(
+ name='Organism',
+ fields=[
+ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('created_at', models.DateTimeField(editable=False)),
+ ('updated_at', models.DateTimeField()),
+ ('name', models.CharField(max_length=256)),
+ ('taxonomy_id', models.IntegerField()),
+ ('is_scientific_name', models.BooleanField(default=False)),
+ ],
+ options={
+ 'db_table': 'organisms',
+ },
+ ),
migrations.CreateModel(
name='ProcessorJob',
fields=[
@@ -78,12 +110,24 @@ class Migration(migrations.Migration):
('pipeline_applied', models.CharField(max_length=256)),
('num_retries', models.IntegerField(default=0)),
('worker_id', models.CharField(max_length=256, null=True)),
- ('batch', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='data_refinery_models.Batch')),
],
options={
'db_table': 'processor_jobs',
},
),
+ migrations.CreateModel(
+ name='ProcessorJobsToBatches',
+ fields=[
+ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('created_at', models.DateTimeField(editable=False)),
+ ('updated_at', models.DateTimeField()),
+ ('batch', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='data_refinery_models.Batch')),
+ ('processor_job', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='data_refinery_models.ProcessorJob')),
+ ],
+ options={
+ 'db_table': 'processor_jobs_to_batches',
+ },
+ ),
migrations.CreateModel(
name='SurveyJob',
fields=[
@@ -118,6 +162,6 @@ class Migration(migrations.Migration):
migrations.AddField(
model_name='batch',
name='survey_job',
- field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='data_refinery_models.SurveyJob'),
+ field=models.ForeignKey(on_delete=django.db.models.deletion.PROTECT, to='data_refinery_models.SurveyJob'),
),
]
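Taken together, the two join tables above are what let a single downloader or
processor job span several batches. A minimal sketch of the intended usage,
assuming a configured Django environment; the helper functions here are
illustrative, not part of this commit:

```python
from data_refinery_models.models import (
    Batch,
    DownloaderJob,
    DownloaderJobsToBatches,
)


def queue_batches_for_job(job: DownloaderJob, batches) -> None:
    """Attach each batch to the job through the new join table."""
    for batch in batches:
        DownloaderJobsToBatches.objects.create(downloader_job=job, batch=batch)


def batches_for_job(job: DownloaderJob):
    """Recover a job's batches via Django's default reverse lookup name."""
    return Batch.objects.filter(downloaderjobstobatches__downloader_job=job)
```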
138 changes: 0 additions & 138 deletions data_models/data_refinery_models/models.py

This file was deleted.

13 changes: 13 additions & 0 deletions data_models/data_refinery_models/models/__init__.py
@@ -0,0 +1,13 @@
from data_refinery_models.models.surveys import SurveyJob, SurveyJobKeyValue
from data_refinery_models.models.batches import (
BatchStatuses,
Batch,
BatchKeyValue
)
from data_refinery_models.models.jobs import (
DownloaderJob,
ProcessorJob,
DownloaderJobsToBatches,
ProcessorJobsToBatches
)
from data_refinery_models.models.organism import Organism
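The `Organism` model itself is defined in a new `organism.py` module that is
not expanded in this view. Based only on the fields visible in the migration
above, a hedged guess at the kind of lookup it supports:

```python
from data_refinery_models.models import Organism


def taxonomy_id_for(organism_name: str) -> int:
    """Illustrative only: map an organism name to its NCBI taxonomy ID."""
    return Organism.objects.get(name=organism_name).taxonomy_id
```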
20 changes: 20 additions & 0 deletions data_models/data_refinery_models/models/base_models.py
@@ -0,0 +1,20 @@
from django.db import models
from django.utils import timezone


class TimeTrackedModel(models.Model):
"""Base model with auto created_at and updated_at fields."""

created_at = models.DateTimeField(editable=False)
updated_at = models.DateTimeField()

def save(self, *args, **kwargs):
""" On save, update timestamps """
current_time = timezone.now()
if not self.id:
self.created_at = current_time
self.updated_at = current_time
return super(TimeTrackedModel, self).save(*args, **kwargs)

class Meta:
abstract = True
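The effect of this override, sketched with a hypothetical subclass that is
not part of the repo:

```python
from django.db import models

from data_refinery_models.models.base_models import TimeTrackedModel


class Note(TimeTrackedModel):
    """Hypothetical model, for illustration only."""
    text = models.CharField(max_length=256)


note = Note(text="first save")
note.save()   # both created_at and updated_at are set to now
note.text = "second save"
note.save()   # created_at is preserved; only updated_at advances
```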
64 changes: 64 additions & 0 deletions data_models/data_refinery_models/models/batches.py
@@ -0,0 +1,64 @@
from enum import Enum
from django.db import models
from data_refinery_models.models.base_models import TimeTrackedModel
from data_refinery_models.models.surveys import SurveyJob


class BatchStatuses(Enum):
"""Valid values for the status field of the Batch model."""

NEW = "NEW"
DOWNLOADED = "DOWNLOADED"
PROCESSED = "PROCESSED"


class Batch(TimeTrackedModel):
"""Represents a batch of data.
The definition of a Batch is intentionally that vague. What a batch
is will vary from source to source. It could be a single file, or
a group of files with some kind of logical grouping such as an
experiment.
"""

survey_job = models.ForeignKey(SurveyJob, on_delete=models.PROTECT)
source_type = models.CharField(max_length=256)
size_in_bytes = models.IntegerField()
download_url = models.CharField(max_length=4096)
raw_format = models.CharField(max_length=256, null=True)
processed_format = models.CharField(max_length=256, null=True)
pipeline_required = models.CharField(max_length=256)
platform_accession_code = models.CharField(max_length=32)
experiment_accession_code = models.CharField(max_length=32)
experiment_title = models.CharField(max_length=256)
status = models.CharField(max_length=20)
release_date = models.DateField()
last_uploaded_date = models.DateField()
name = models.CharField(max_length=1024)

# This field will denote where in our system the file can be found.
internal_location = models.CharField(max_length=256, null=True)

# This corresponds to the organism taxonomy ID from NCBI.
organism_id = models.IntegerField()
# This is the organism name as it appeared in the experiment.
organism_name = models.CharField(max_length=256)

class Meta:
db_table = "batches"


class BatchKeyValue(TimeTrackedModel):
"""Tracks additional fields for Batches.
Useful for fields that would be sparsely populated if they were
their own columns. I.e. one source may have an extra field or two
that are worth tracking but are specific to that source.
"""

batch = models.ForeignKey(Batch, on_delete=models.CASCADE)
key = models.CharField(max_length=256)
value = models.CharField(max_length=256)

class Meta:
db_table = "batch_key_values"
