Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
841bfa9
commit 2008c3c
Showing
7 changed files
with
170 additions
and
54 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,5 @@ | ||
all_forms_PROACT.txt | ||
*.pyc | ||
*.swp | ||
.DS_Store | ||
all_forms_validate_spike.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,56 +1,142 @@ | ||
import csv | ||
import peewee | ||
from peewee import * | ||
from Models import PatientMeasurements, Features | ||
from functools import partial | ||
import datetime | ||
from math import ceil | ||
|
||
TRAINING_DATA_FILE_NAME = "all_forms_PROACT.txt" | ||
from peewee import MySQLDatabase | ||
|
||
from Models import PatientMeasurement, Feature | ||
import settings | ||
|
||
|
||
#TRAINING_DATA_FILE_NAME = "all_forms_PROACT.txt" | ||
#TRAINING_DATA_FILE_NAME = "all_forms_validate_spike.txt" | ||
# Input file read by the loader. It is parsed with "|" as the delimiter and
# its first row is skipped as a header.
TRAINING_DATA_FILE_NAME = "test_data.txt"
# Zero-based column positions within each row of the data file.
SUBJECT_ID = 0
FORM_NAME = 1
FEATURE_NAME = 2
FEATURE_VALUE = 3
FEATURE_UNIT = 4
FEATURE_DELTA = 5
HOST = "localhost" | ||
USERNAME = "root" | ||
PASSWORD = "" | ||
DATABASE = "als" | ||
DONT_RECREATE_TABLES = False | ||
|
||
db = MySQLDatabase(DATABASE, user=USERNAME, passwd=PASSWORD) | ||
db.connect() | ||
PatientMeasurements.drop_table() | ||
Features.drop_table() | ||
db.create_tables([PatientMeasurements, Features], safe=DONT_RECREATE_TABLES) | ||
# When True, drop_tables() is called before the data is loaded.
DROP_EXISTING_TABLES = True
# When True, create_tables() is called before the data is loaded.
CREATE_TABLES = True
# Number of rows handed to each insert_many() call.
BATCH_SIZE = 1000
# NOTE(review): only referenced by the commented-out per-row loader in the
# visible code - confirm whether it is still needed.
seen_features = set()
|
||
|
||
def validate_feature_value(value):
    """Normalise a raw feature value before it is written to the database.

    Missing values are stored in the data as the literal string "NA", and
    some fields hold a semicolon separated list of numbers (some of which
    may be NA). Neither can be inserted into a numeric field, so both are
    mapped to None, which ends up being converted to null. Any other value
    is returned unchanged.
    """
    # Semicolon separated lists are dropped for now; what to do with them is
    # still undecided.
    return None if value == "NA" or ";" in value else value
|
||
|
||
def drop_tables(): | ||
"""Drops the tables so you can start again fresh""" | ||
try: | ||
PatientMeasurement.drop_table() | ||
Feature.drop_table() | ||
except Exception as e: | ||
print "Failed to drop tables", e | ||
raise | ||
|
||
|
||
def create_database_connection(): | ||
"""Creates a connection to a MySQLDatabase using the parmeters specified | ||
in the setting file""" | ||
try: | ||
db = MySQLDatabase( | ||
settings.DATABASE, | ||
user=settings.USERNAME, | ||
passwd=settings.PASSWORD | ||
) | ||
db.connect() | ||
except Exception as e: | ||
print "Failed to connect to database", e | ||
return db | ||
|
||
|
||
def create_tables(): | ||
"""Creates the required tables assuming they dont exist and a database | ||
connection db exists as a global object""" | ||
try: | ||
db.create_tables([PatientMeasurement, Feature], safe=True) | ||
except Exception as e: | ||
print "Failed to create tables", e | ||
raise | ||
|
||
|
||
def yield_x_rows(rows, start_index, x):
    """Yield up to x insert-ready dicts from rows, beginning at start_index.

    Args:
        rows: list of parsed data-file rows (each a sequence of columns).
        start_index: index in rows of the first row to yield.
        x: maximum number of rows to yield.

    Yields:
        One dict per row with the keys "subject_id", "feature_name",
        "delta" and "value"; delta and value are passed through
        validate_feature_value() first.

    Fewer than x dicts are yielded when rows runs out, so a short final
    batch no longer raises IndexError.
    """
    # Slicing clamps at the end of the list, unlike indexing row by row.
    for row in rows[start_index:start_index + x]:
        yield {
            "subject_id": row[SUBJECT_ID],
            "feature_name": row[FEATURE_NAME],
            "delta": validate_feature_value(row[FEATURE_DELTA]),
            "value": validate_feature_value(row[FEATURE_VALUE])
        }
|
||
|
||
# Script setup: open the connection first, because create_tables() uses the
# module-level ``db`` assigned here.
print "Creating database connection"
db = create_database_connection()
# Optionally drop and/or (re)create the tables, as selected by the flags
# defined near the top of the file.
if DROP_EXISTING_TABLES:
    drop_tables()
if CREATE_TABLES:
    create_tables()
|
||
|
||
with open(TRAINING_DATA_FILE_NAME, "rb") as csvfile: | ||
reader = csv.reader(csvfile, delimiter="|") | ||
next(reader, None) | ||
for count, row in enumerate(reader): | ||
print count | ||
insert_fn = partial( | ||
PatientMeasurements, | ||
subject_id=row[SUBJECT_ID], | ||
feature_name=row[FEATURE_NAME], | ||
) | ||
try: | ||
insert_fn( | ||
delta=row[FEATURE_DELTA], | ||
value=row[FEATURE_VALUE] | ||
).save() | ||
except ValueError: | ||
print row[SUBJECT_ID] | ||
print row[FEATURE_NAME] | ||
print row[FEATURE_DELTA] | ||
print row[FEATURE_VALUE] | ||
insert_fn( | ||
delta=None, | ||
value=None | ||
).save() | ||
try: | ||
Features.get(feature_name=row[FEATURE_NAME]) | ||
except Features.DoesNotExist: | ||
Features( | ||
feature_name=row[FEATURE_NAME], | ||
form_name=row[FORM_NAME] | ||
).save() | ||
row_list = list(reader) | ||
number_of_rows = len(row_list) | ||
start_index = 0 | ||
print "Inserting %s rows" % number_of_rows | ||
# xrange does not include the final number but its fine because we index | ||
# from 0 | ||
start_time = datetime.datetime.now() | ||
for _ in xrange(int(ceil(number_of_rows / BATCH_SIZE))): | ||
batch_start_time = datetime.datetime.now() | ||
PatientMeasurement.insert_many( | ||
yield_x_rows(row_list, start_index, BATCH_SIZE) | ||
).execute() | ||
batch_end_time = datetime.datetime.now() | ||
batch_time_taken = \ | ||
(batch_end_time - batch_start_time).total_seconds() | ||
print "Inserted %s rows in %s seconds" % (BATCH_SIZE, batch_time_taken) | ||
start_index += BATCH_SIZE | ||
end_time = datetime.datetime.now() | ||
time_taken = \ | ||
(end_time - start_time).total_seconds() | ||
print "Inserted %s rows in %s seconds" % (number_of_rows, time_taken) | ||
|
||
|
||
#for count, row in enumerate(reader): | ||
# try: | ||
# PatientMeasurement( | ||
# subject_id=row[SUBJECT_ID], | ||
# feature_name=row[FEATURE_NAME], | ||
# delta=validate_feature_value(row[FEATURE_DELTA]), | ||
# value=validate_feature_value(row[FEATURE_VALUE]) | ||
# ).save() | ||
# except ValueError: | ||
# print row[SUBJECT_ID] | ||
# print row[FEATURE_NAME] | ||
# print row[FEATURE_DELTA] | ||
# print row[FEATURE_VALUE] | ||
# raise | ||
# if not row[FEATURE_NAME] in seen_features: | ||
# Feature( | ||
# feature_name=row[FEATURE_NAME], | ||
# form_name=row[FORM_NAME] | ||
# ).save() | ||
# seen_features.add(row[FEATURE_NAME]) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,25 @@ | ||
import peewee | ||
from peewee import * | ||
from peewee import Model, TextField, IntegerField, MySQLDatabase | ||
import settings | ||
|
||
# Database handle the models are bound to (through BaseModel.Meta); the
# name and credentials come from the settings module.
database = MySQLDatabase(
    settings.DATABASE,
    user=settings.USERNAME,
    passwd=settings.PASSWORD
)
|
||
class PatientMeasurements(peewee.Model): | ||
subject_id = peewee.IntegerField() | ||
feature_name = peewee.TextField() | ||
value = peewee.TextField(null=True) | ||
delta = peewee.IntegerField(null=True) | ||
|
||
class BaseModel(Model):
    """Common base class that binds its subclasses to ``database``."""

    class Meta:
        # Module-level MySQLDatabase instance defined above.
        database = database
|
||
class Features(peewee.Model): | ||
feature_name = peewee.TextField() | ||
form_name = peewee.TextField() | ||
|
||
class PatientMeasurement(BaseModel):
    """One value of one named feature for one subject."""

    subject_id = IntegerField()
    feature_name = TextField()
    # Stored as text; nullable.
    value = TextField(null=True)
    # Nullable integer. NOTE(review): presumably a time offset for the
    # measurement - confirm the unit against the data documentation.
    delta = IntegerField(null=True)
|
||
|
||
class Feature(BaseModel):
    """A feature name together with the name of the form it belongs to."""

    feature_name = TextField()
    form_name = TextField()
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Connection settings for the MySQL database.
# NOTE(review): HOST is not passed to MySQLDatabase() anywhere in the
# visible code - confirm whether it is meant to be used.
HOST = "localhost"
USERNAME = "root"
PASSWORD = ""
DATABASE = "als"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
SubjectID|form_name|feature_name|feature_value|feature_unit|feature_delta | ||
4205|Lab Test|Gamma-glutamyltransferase|58|U/L|92 | ||
4205|Lab Test|AST(SGOT)|39|U/L|92 | ||
4205|ALSHX|onset_site|Limb|NA|0 | ||
4205|Lab Test|Alkaline phosphatase|64|U/L|92 | ||
4205|Vitals|bp_systolic|118|mmHg|92 | ||
4205|Lab Test|ALT(SGPT)|27|U/L|92 | ||
4205|Riluzole|if_use_Riluzole|Yes|NA|0 | ||
4205|Vitals|weight|63.8|kg|365 | ||
4205|Vitals|weight|69.1|kg|183 | ||
4205|Vitals|bp_diastolic|66|mmHg|365 |