import datetime
import json
import os
import tarfile
import tempfile

import boto3

from featuretools.utils.gen_utils import use_s3fs_es, use_smartopen_es
from featuretools.utils.wrangle import _is_s3, _is_url

FORMATS = ['csv', 'pickle', 'parquet']
SCHEMA_VERSION = "1.0.0"


def entity_to_description(entity):
    '''Serialize entity to data description.

    Args:
        entity (Entity) : Instance of :class:`.Entity`.

    Returns:
        description (dict) : Description of :class:`.Entity`.
    '''
    # Record the dtype of each dataframe column that backs a variable, so the
    # data can be reloaded with the same types.
    index = entity.df.columns.isin([variable.id for variable in entity.variables])
    dtypes = entity.df[entity.df.columns[index]].dtypes.astype(str).to_dict()

    description = {
        "id": entity.id,
        "index": entity.index,
        "time_index": entity.time_index,
        "properties": {
            'secondary_time_index': entity.secondary_time_index,
            'last_time_index': entity.last_time_index is not None,
        },
        "variables": [variable.to_data_description() for variable in entity.variables],
        "loading_info": {
            'params': {},
            'properties': {
                'dtypes': dtypes
            }
        }
    }

    return description
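

# Example (a sketch, not part of the original module; assumes the demo
# EntitySet bundled with featuretools):
#
#     import featuretools as ft
#     es = ft.demo.load_mock_customer(return_entityset=True)
#     desc = entity_to_description(es['customers'])
#     desc['id']       # 'customers'
#     sorted(desc)     # ['id', 'index', 'loading_info', 'properties',
#                      #  'time_index', 'variables']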


def entityset_to_description(entityset):
    '''Serialize entityset to data description.

    Args:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.

    Returns:
        description (dict) : Description of :class:`.EntitySet`.
    '''
    entities = {entity.id: entity_to_description(entity) for entity in entityset.entities}
    relationships = [relationship.to_dictionary() for relationship in entityset.relationships]

    data_description = {
        'schema_version': SCHEMA_VERSION,
        'id': entityset.id,
        'entities': entities,
        'relationships': relationships,
    }

    return data_description
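

# Example (a sketch; continues from the demo EntitySet above):
#
#     desc = entityset_to_description(es)
#     desc['schema_version']    # '1.0.0'
#     list(desc['entities'])    # one entry per entity, keyed by entity id
#
# Note that 'loading_info' in each entity description is only completed once
# write_entity_data (below) records where and how the data was written.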


def write_entity_data(entity, path, format='csv', **kwargs):
    '''Write entity data to disk or S3 path.

    Args:
        entity (Entity) : Instance of :class:`.Entity`.
        path (str) : Location on disk to write entity data.
        format (str) : Format to use for writing entity data. Defaults to csv.
        kwargs (keywords) : Additional keyword arguments to pass to the underlying serialization method.

    Returns:
        loading_info (dict) : Information on storage location and format of entity data.
    '''
    format = format.lower()
    basename = '.'.join([entity.id, format])
    location = os.path.join('data', basename)
    file = os.path.join(path, location)

    if format == 'csv':
        # The csv writer reads its options directly from kwargs, so callers
        # must supply index, sep, encoding, and compression.
        entity.df.to_csv(
            file,
            index=kwargs['index'],
            sep=kwargs['sep'],
            encoding=kwargs['encoding'],
            compression=kwargs['compression'],
        )
    elif format == 'parquet':
        # Serializing to parquet format raises an error when columns contain tuples.
        # Columns containing tuples are mapped as dtype object.
        # The issue is resolved by casting columns of dtype object to string.
        df = entity.df.copy()
        columns = df.select_dtypes('object').columns
        df[columns] = df[columns].astype('unicode')
        df.columns = df.columns.astype('unicode')  # ensures string column names for python 2.7
        df.to_parquet(file, **kwargs)
    elif format == 'pickle':
        entity.df.to_pickle(file, **kwargs)
    else:
        error = 'must be one of the following formats: {}'
        raise ValueError(error.format(', '.join(FORMATS)))

    return {'location': location, 'type': format, 'params': kwargs}
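

# Example (a sketch; the 'data' subdirectory and the csv kwargs mirror what
# write_data_description sets up below):
#
#     os.makedirs(os.path.join('es_out', 'data'), exist_ok=True)
#     info = write_entity_data(es['customers'], 'es_out', format='csv',
#                              index=False, sep=',', encoding='utf-8',
#                              compression=None)
#     info['location']    # 'data/customers.csv'
#     info['type']        # 'csv'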


def write_data_description(entityset, path, profile_name=None, **kwargs):
    '''Serialize entityset to data description and write to disk or S3 path.

    Args:
        entityset (EntitySet) : Instance of :class:`.EntitySet`.
        path (str) : Location on disk or S3 path to write `data_description.json` and entity data.
        profile_name (str, bool) : The AWS profile to use when writing to S3. Defaults to None, which searches for AWS credentials.
            Set to False to use an anonymous profile.
        kwargs (keywords) : Additional keyword arguments to pass to the underlying serialization method or to specify the AWS profile.
    '''
    if _is_s3(path):
        # Serialize to a temporary directory, archive it, then upload the
        # archive to S3 using credentials resolved from profile_name.
        with tempfile.TemporaryDirectory() as tmpdir:
            os.makedirs(os.path.join(tmpdir, 'data'))
            dump_data_description(entityset, tmpdir, **kwargs)
            file_path = create_archive(tmpdir)

            transport_params = {}
            session = boto3.Session()

            if isinstance(profile_name, str):
                transport_params = {'session': boto3.Session(profile_name=profile_name)}
                use_smartopen_es(file_path, path, transport_params, read=False)
            elif profile_name is False:
                use_s3fs_es(file_path, path, read=False)
            elif session.get_credentials() is not None:
                use_smartopen_es(file_path, path, read=False)
            else:
                use_s3fs_es(file_path, path, read=False)
    elif _is_url(path):
        raise ValueError("Writing to URLs is not supported")
    else:
        path = os.path.abspath(path)
        os.makedirs(os.path.join(path, 'data'), exist_ok=True)
        dump_data_description(entityset, path, **kwargs)
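

# Example (a sketch; the S3 bucket name is hypothetical, and the csv kwargs
# are required by write_entity_data above):
#
#     # local disk
#     write_data_description(es, 'es_out', format='csv', index=False,
#                            sep=',', encoding='utf-8', compression=None)
#
#     # S3, with a named AWS profile
#     write_data_description(es, 's3://my-bucket/es.tar', profile_name='dev',
#                            format='csv', index=False, sep=',',
#                            encoding='utf-8', compression=None)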


def dump_data_description(entityset, path, **kwargs):
    # Write each entity's data, record where and how it was stored, then dump
    # the combined description to 'data_description.json'.
    description = entityset_to_description(entityset)

    for entity in entityset.entities:
        loading_info = write_entity_data(entity, path, **kwargs)
        description['entities'][entity.id]['loading_info'].update(loading_info)

    file = os.path.join(path, 'data_description.json')
    with open(file, 'w') as f:
        json.dump(description, f)


def create_archive(tmpdir):
    # Timestamped archive name; avoid ':' so the file name is valid on Windows.
    file_name = "es-{date:%Y-%m-%d_%H%M%S}.tar".format(date=datetime.datetime.now())
    file_path = os.path.join(tmpdir, file_name)

    with tarfile.open(str(file_path), 'w') as tar:
        tar.add(os.path.join(tmpdir, 'data_description.json'), arcname='/data_description.json')
        tar.add(os.path.join(tmpdir, 'data'), arcname='/data')

    return file_path
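

# Resulting archive layout (a sketch):
#
#     es-<timestamp>.tar
#         data_description.json
#         data/
#             <entity_id>.<format>
#             ...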