Skip to content

Commit

Permalink
Improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
AaronKalair committed Jul 25, 2015
1 parent 841bfa9 commit 2008c3c
Show file tree
Hide file tree
Showing 7 changed files with 170 additions and 54 deletions.
Binary file modified .DS_Store
Binary file not shown.
4 changes: 4 additions & 0 deletions .gitignore
@@ -1 +1,5 @@
all_forms_PROACT.txt
*.pyc
*.swp
.DS_Store
all_forms_validate_spike.txt
174 changes: 130 additions & 44 deletions ImportData.py
@@ -1,56 +1,142 @@
import csv
import peewee
from peewee import *
from Models import PatientMeasurements, Features
from functools import partial
import datetime
from math import ceil

TRAINING_DATA_FILE_NAME = "all_forms_PROACT.txt"
from peewee import MySQLDatabase

from Models import PatientMeasurement, Feature
import settings


# Input file read by the import script; it is opened and parsed as a
# pipe-delimited file. The commented-out alternatives are other input files.
#TRAINING_DATA_FILE_NAME = "all_forms_PROACT.txt"
#TRAINING_DATA_FILE_NAME = "all_forms_validate_spike.txt"
TRAINING_DATA_FILE_NAME = "test_data.txt"
# Zero-based column positions within each pipe-delimited row, in the order
# SubjectID|form_name|feature_name|feature_value|feature_unit|feature_delta.
SUBJECT_ID = 0
FORM_NAME = 1
FEATURE_NAME = 2
FEATURE_VALUE = 3
# FEATURE_UNIT is not referenced by any of the visible code.
FEATURE_UNIT = 4
FEATURE_DELTA = 5
# NOTE(review): these four values are the same as those in settings.py, and
# create_database_connection() reads settings.* rather than these names --
# confirm whether these module-level copies are still needed.
HOST = "localhost"
USERNAME = "root"
PASSWORD = ""
DATABASE = "als"
# Passed as the ``safe`` argument of db.create_tables() a few lines below.
DONT_RECREATE_TABLES = False

# NOTE(review): module-level connect / drop / create using the
# PatientMeasurements and Features names. This looks like the setup that the
# drop_tables() / create_tables() helpers were written to replace -- confirm
# before keeping both.
db = MySQLDatabase(DATABASE, user=USERNAME, passwd=PASSWORD)
db.connect()
PatientMeasurements.drop_table()
Features.drop_table()
db.create_tables([PatientMeasurements, Features], safe=DONT_RECREATE_TABLES)
# Script switches: whether drop_tables() and create_tables() are called.
DROP_EXISTING_TABLES = True
CREATE_TABLES = True
# Number of rows requested from yield_x_rows() for each insert_many() call.
BATCH_SIZE = 1000
# Only referenced from the commented-out per-row import code in this file.
seen_features = set()


def validate_feature_value(value):
    """Return ``value`` if it can be stored as-is, otherwise ``None``.

    Some missing values are stored as the literal string "NA". "NA" cannot be
    inserted into a numeric field, so it is mapped to ``None``, which ends up
    being stored as NULL. Other fields hold a semicolon separated list of
    numbers (some of which may be "NA"); those are also mapped to ``None``
    for now.

    :param value: raw field text from the input file, or ``None``.
    :returns: ``value`` unchanged, or ``None`` when it is missing, "NA" or a
        semicolon separated list.
    """
    # A value that is already None is simply "missing"; without this guard
    # the ``in`` test below would raise TypeError.
    if value is None or value == "NA":
        return None
    if ";" in value:
        # TODO: decide how semicolon separated lists should be stored; until
        # then they are dropped.
        return None
    return value


def drop_tables():
    """Drop both model tables so an import can start from a clean slate.

    Any failure is reported on stdout and then re-raised to the caller.
    """
    try:
        # PatientMeasurement is dropped before Feature.
        for model in (PatientMeasurement, Feature):
            model.drop_table()
    except Exception as error:
        # Single-argument print() produces the same output whether print is
        # a statement or a function.
        print("%s %s" % ("Failed to drop tables", error))
        raise


def create_database_connection():
    """Create and open a connection to the MySQL database.

    The database name, user and password are read from the settings module.

    :returns: the connected ``MySQLDatabase`` instance.
    :raises Exception: whatever the database layer raised; the failure is
        printed and then re-raised.
    """
    try:
        db = MySQLDatabase(
            settings.DATABASE,
            user=settings.USERNAME,
            passwd=settings.PASSWORD
        )
        db.connect()
    except Exception as e:
        print("%s %s" % ("Failed to connect to database", e))
        # Re-raise, as drop_tables() and create_tables() do. Falling through
        # to ``return db`` would either raise UnboundLocalError (constructor
        # failed) or hand back a handle that never connected.
        raise
    return db


def create_tables():
    """Create the PatientMeasurement and Feature tables.

    Relies on a module-level database connection named ``db``. The tables are
    created with ``safe=True``. Any failure is reported on stdout and then
    re-raised to the caller.
    """
    try:
        models = [PatientMeasurement, Feature]
        db.create_tables(models, safe=True)
    except Exception as error:
        # Single-argument print() produces the same output whether print is
        # a statement or a function.
        print("%s %s" % ("Failed to create tables", error))
        raise


def yield_x_rows(rows, start_index, x):
    """Yield up to ``x`` insert-ready dicts from ``rows``.

    Each yielded dict has the keys ``subject_id``, ``feature_name``,
    ``delta`` and ``value``; delta and value are passed through
    validate_feature_value() first.

    :param rows: indexable sequence of parsed input rows.
    :param start_index: index of the first row to yield.
    :param x: maximum number of rows to yield.
    """
    # Slicing stops at the end of ``rows``, so a final batch that is shorter
    # than ``x`` yields what is left instead of raising IndexError.
    for row in rows[start_index:start_index + x]:
        yield {
            "subject_id": row[SUBJECT_ID],
            "feature_name": row[FEATURE_NAME],
            "delta": validate_feature_value(row[FEATURE_DELTA]),
            "value": validate_feature_value(row[FEATURE_VALUE])
        }


# Script setup: open the database connection, then reset and/or create the
# tables depending on the module-level switches.
print("Creating database connection")
db = create_database_connection()

# Drop first so that the create step below starts from nothing.
if DROP_EXISTING_TABLES:
    drop_tables()

if CREATE_TABLES:
    create_tables()


# Read every data row of the pipe-delimited input file into memory.
with open(TRAINING_DATA_FILE_NAME, "rb") as csvfile:
    reader = csv.reader(csvfile, delimiter="|")
    # The first line is the column header, not data.
    next(reader, None)
    row_list = list(reader)

number_of_rows = len(row_list)
print("Inserting %s rows" % number_of_rows)

start_time = datetime.datetime.now()
# Step through the rows BATCH_SIZE at a time. Stepping replaces
# ceil(number_of_rows / BATCH_SIZE): with integer operands the division
# truncates before ceil() sees it, so the final partial batch was never
# inserted.
for start_index in xrange(0, number_of_rows, BATCH_SIZE):
    # The last batch may be shorter than BATCH_SIZE; ask for exactly the
    # rows that remain.
    rows_in_batch = min(BATCH_SIZE, number_of_rows - start_index)
    batch_start_time = datetime.datetime.now()
    PatientMeasurement.insert_many(
        yield_x_rows(row_list, start_index, rows_in_batch)
    ).execute()
    batch_end_time = datetime.datetime.now()
    batch_time_taken = \
        (batch_end_time - batch_start_time).total_seconds()
    print(
        "Inserted %s rows in %s seconds" % (rows_in_batch, batch_time_taken)
    )
end_time = datetime.datetime.now()
time_taken = (end_time - start_time).total_seconds()
print("Inserted %s rows in %s seconds" % (number_of_rows, time_taken))


#for count, row in enumerate(reader):
# try:
# PatientMeasurement(
# subject_id=row[SUBJECT_ID],
# feature_name=row[FEATURE_NAME],
# delta=validate_feature_value(row[FEATURE_DELTA]),
# value=validate_feature_value(row[FEATURE_VALUE])
# ).save()
# except ValueError:
# print row[SUBJECT_ID]
# print row[FEATURE_NAME]
# print row[FEATURE_DELTA]
# print row[FEATURE_VALUE]
# raise
# if not row[FEATURE_NAME] in seen_features:
# Feature(
# feature_name=row[FEATURE_NAME],
# form_name=row[FORM_NAME]
# ).save()
# seen_features.add(row[FEATURE_NAME])
31 changes: 21 additions & 10 deletions Models.py
@@ -1,14 +1,25 @@
import peewee
from peewee import *
from peewee import Model, TextField, IntegerField, MySQLDatabase
import settings

# Database handle for the models in this module, configured from the
# settings module. BaseModel.Meta points at this object.
database = MySQLDatabase(
settings.DATABASE,
user=settings.USERNAME,
passwd=settings.PASSWORD
)

# NOTE(review): declares the same four fields as PatientMeasurement, but
# derives from peewee.Model directly rather than BaseModel. This looks like
# the earlier (plural) spelling of that model -- confirm whether it is still
# needed.
class PatientMeasurements(peewee.Model):
    subject_id = peewee.IntegerField()
    feature_name = peewee.TextField()
    # Both of these are nullable.
    value = peewee.TextField(null=True)
    delta = peewee.IntegerField(null=True)

class BaseModel(Model):
    """Common base class that attaches models to the module-level database."""

    class Meta:
        # The right-hand side is the module-level ``database`` object.
        database = database

# NOTE(review): declares the same two fields as Feature, but derives from
# peewee.Model directly rather than BaseModel. This looks like the earlier
# (plural) spelling of that model -- confirm whether it is still needed.
class Features(peewee.Model):
    feature_name = peewee.TextField()
    form_name = peewee.TextField()

class PatientMeasurement(BaseModel):
    """One measurement row: a single feature reading for a single subject."""

    subject_id = IntegerField()
    feature_name = TextField()
    # Stored as text: the sample data has non-numeric values such as "Limb"
    # and "Yes". Nullable because the importer maps "NA" to None.
    value = TextField(null=True)
    # Taken from the feature_delta column of the input. Nullable for the same
    # reason as ``value``.
    delta = IntegerField(null=True)


class Feature(BaseModel):
    """A feature name together with the name of the form it belongs to."""

    feature_name = TextField()
    form_name = TextField()
Binary file removed Models.pyc
Binary file not shown.
4 changes: 4 additions & 0 deletions settings.py
@@ -0,0 +1,4 @@
# MySQL connection settings shared by the import script and the models.
# NOTE(review): the visible MySQLDatabase(...) calls pass only DATABASE,
# USERNAME and PASSWORD; HOST is not passed anywhere in the visible code.
HOST = "localhost"
USERNAME = "root"
# Empty password.
PASSWORD = ""
DATABASE = "als"
11 changes: 11 additions & 0 deletions test_data.txt
@@ -0,0 +1,11 @@
SubjectID|form_name|feature_name|feature_value|feature_unit|feature_delta
4205|Lab Test|Gamma-glutamyltransferase|58|U/L|92
4205|Lab Test|AST(SGOT)|39|U/L|92
4205|ALSHX|onset_site|Limb|NA|0
4205|Lab Test|Alkaline phosphatase|64|U/L|92
4205|Vitals|bp_systolic|118|mmHg|92
4205|Lab Test|ALT(SGPT)|27|U/L|92
4205|Riluzole|if_use_Riluzole|Yes|NA|0
4205|Vitals|weight|63.8|kg|365
4205|Vitals|weight|69.1|kg|183
4205|Vitals|bp_diastolic|66|mmHg|365

0 comments on commit 2008c3c

Please sign in to comment.