-
Notifications
You must be signed in to change notification settings - Fork 6
/
dynamo_pandas.py
223 lines (178 loc) · 7.92 KB
/
dynamo_pandas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
import pandas as pd
from .transactions import get_all_items
from .transactions import get_items
from .transactions import put_items
def get_df(*, table, keys=None, attributes=None, dtype=None):
    """Read items from a DynamoDB table and return them as a dataframe.

    When ``keys`` is given, only the items matching those keys are fetched.
    When ``keys`` is None, the whole table is read.

    Parameters
    ----------
    table : str
        Name of the DynamoDB table.

    keys : list[dict]
        Keys of the items to fetch, one dictionary per key. If None (default), all
        the items of the table are returned.

    attributes : list[str]
        Item attributes to return as dataframe columns. If None (default), every
        attribute is returned.

    dtype : data type or dict of column names -> data type
        Either a single numpy.dtype / Python type applied to the entire dataframe, or
        a mapping {col: dtype, ...} where col is a column label and dtype is the
        numpy.dtype / Python type to cast that specific column to.

    Returns
    -------
    pandas.DataFrame
        One row per item matching the requested keys, with the item attributes as
        columns.

    Examples
    --------
    >>> df = get_df(
    ...     table="players",
    ...     keys=[{"player_id": "player_three"}, {"player_id": "player_one"}]
    ... )
    >>> print(df)
       bonus_points     player_id            last_play  rating        play_time
    0             4  player_three  2021-01-21 10:22:43     2.5  1 days 14:01:19
    1             3    player_one  2021-01-18 22:47:23     4.3  2 days 17:41:55

    Unless told otherwise, the returned dataframe uses basic pandas/numpy data types:

    >>> df.info()
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 2 entries, 0 to 1
    Data columns (total 5 columns):
     #   Column        Non-Null Count  Dtype
    ---  ------        --------------  -----
     0   bonus_points  1 non-null      float64
     1   player_id     2 non-null      object
     2   last_play     2 non-null      object
     3   rating        2 non-null      float64
     4   play_time     2 non-null      object
    dtypes: float64(2), object(3)
    memory usage: 208.0+ bytes

    Pass ``dtype`` to choose the data types of individual columns:

    >>> df = get_df(
    ...     table="players",
    ...     keys=keys(player_id=["player_two", "player_four"]),
    ...     dtype={
    ...         "bonus_points": "Int8",
    ...         "last_play": "datetime64[ns, UTC]",
    ...         # "play_time": "timedelta64[ns]"  # See note below.
    ...     }
    ... )
    >>> df.info()
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 2 entries, 0 to 1
    Data columns (total 5 columns):
     #   Column        Non-Null Count  Dtype
    ---  ------        --------------  -----
     0   bonus_points  1 non-null      Int8
     1   player_id     2 non-null      object
     2   last_play     2 non-null      datetime64[ns, UTC]
     3   rating        2 non-null      float64
     4   play_time     2 non-null      object
    dtypes: Int8(1), datetime64[ns, UTC](1), float64(1), object(2)
    memory usage: 196.0+ bytes

    .. note:: Due to a
        `known bug in pandas <https://github.com/pandas-dev/pandas/issues/38509>`_,
        timedelta strings cannot currently be converted back to timedelta64 type via
        the ``dtype`` parameter. Use the ``pandas.to_timedelta`` function instead:

    >>> df.play_time = pd.to_timedelta(df.play_time)
    >>> df.info()
    <class 'pandas.core.frame.DataFrame'>
    RangeIndex: 2 entries, 0 to 1
    Data columns (total 5 columns):
     #   Column        Non-Null Count  Dtype
    ---  ------        --------------  -----
     0   bonus_points  1 non-null      Int8
     1   player_id     2 non-null      object
     2   last_play     2 non-null      datetime64[ns, UTC]
     3   rating        2 non-null      float64
     4   play_time     2 non-null      timedelta64[ns]
    dtypes: Int8(1), datetime64[ns, UTC](1), float64(1), object(1), timedelta64[ns](1)
    memory usage: 196.0+ bytes

    Leaving out ``keys`` performs a scan of the table and returns all the items:

    >>> df = get_df(table="players")
    >>> print(df)
       bonus_points     player_id            last_play  rating        play_time
    0           4.0  player_three  2021-01-21 10:22:43     2.5  1 days 14:01:19
    1           NaN   player_four  2021-01-22 13:51:12     4.8  0 days 03:45:49
    2           3.0    player_one  2021-01-18 22:47:23     4.3  2 days 17:41:55
    3           1.0    player_two  2021-01-19 19:07:54     3.8  0 days 22:07:34

    Listing attributes in ``attributes`` limits the columns to those attributes:

    >>> df = get_df(table="players", attributes=["player_id", "rating"])
    >>> print(df)
          player_id  rating
    0  player_three     2.5
    1   player_four     4.8
    2    player_one     4.3
    3    player_two     3.8
    """  # noqa: E501
    if keys is None:
        # No keys requested: read every item of the table.
        fetched = get_all_items(table=table, attributes=attributes)
    else:
        # Fetch only the items identified by the given keys.
        fetched = get_items(keys=keys, table=table, attributes=attributes)

    return _to_df(items=fetched, dtype=dtype)
def keys(**kwargs):
    """Generate a list of key dictionaries from the partition key attribute name and a
    list of values. This can simplify the generation of keys to use with the ``get_df``
    function when only a partition key is used.

    Parameters
    ----------
    **kwargs
        A single keyword argument corresponding to the partition key name with a value
        corresponding to the list of key values to return.

    Returns
    -------
    list[dict]
        A list of key dictionaries, one per value, in the order the values were given.

    Raises
    ------
    ValueError
        If no keyword argument, or more than one keyword argument, is provided.

    Examples
    --------
    Assuming we have a table with ``player_id`` as the partition key, we can generate
    the list of keys from the list of players:

    >>> key_list = keys(player_id=["player_two", "player_three", "player_four"])
    >>> print(key_list)
    [{'player_id': 'player_two'}, {'player_id': 'player_three'}, {'player_id': 'player_four'}]
    """  # noqa: E501
    # Without this guard an empty call would fail with an opaque IndexError below.
    if not kwargs:
        raise ValueError(
            "A key attribute (partition key) and its values must be provided."
        )
    if len(kwargs) > 1:
        raise ValueError("Only one key attribute (partition key) is supported.")

    # Exactly one (name, values) pair remains at this point.
    ((name, values),) = kwargs.items()
    return [{name: value} for value in values]
def put_df(df, *, table):
    """Write the rows of a dataframe to a table, one item per row. Items that are not
    yet in the table are created; items that already exist are replaced by the new ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe holding the items to add/update in the table. At a minimum, it must
        contain the columns corresponding to the table's primary key attribute(s).

    table : str
        Name of the DynamoDB table.

    Examples
    --------
    Assume we have the following dataframe:

    >>> print(players_df)
          player_id            last_play        play_time  rating  bonus_points
    0    player_one  2021-01-18 22:47:23  2 days 17:41:55     4.3             3
    1    player_two  2021-01-19 19:07:54  0 days 22:07:34     3.8             1
    2  player_three  2021-01-21 10:22:43  1 days 14:01:19     2.5             4
    3   player_four  2021-01-22 13:51:12  0 days 03:45:49     4.8          <NA>

    The call below adds or updates the corresponding items in the table named
    ``players``:

    >>> put_df(players_df, table="players")
    """
    # Convert the dataframe rows to item dictionaries, then write them to the table.
    records = _to_items(df)
    put_items(items=records, table=table)
def _to_df(items, *, dtype=None):
    """Convert an item dictionary or list of item dictionaries into a pandas DataFrame.

    Parameters
    ----------
    items : dict or list[dict]
        A single item dictionary, or a list of item dictionaries. A single dictionary
        is treated as a one-item list.

    dtype : data type or dict of column names -> data type, optional
        Data type(s) to cast the dataframe to, passed to ``DataFrame.astype``. When a
        dict is given, entries naming columns that are absent from the dataframe are
        ignored.

    Returns
    -------
    pandas.DataFrame
        A dataframe with one row per item and one column per attribute.
    """
    if isinstance(items, dict):
        items = [items]

    df = pd.DataFrame(items)

    if dtype is None:
        return df

    if isinstance(dtype, dict):
        # DataFrame.astype raises KeyError for mapping keys that are not columns.
        # Items need not share the same attributes, so a requested column can be
        # missing (or the frame can be empty); cast only the columns present.
        dtype = {col: typ for col, typ in dtype.items() if col in df.columns}
        if not dtype:
            return df

    return df.astype(dtype)
def _to_items(df):
    """Convert a pandas dataframe into a list of item dictionaries, one per row.

    Raises a TypeError if ``df`` is not a pandas DataFrame.
    """
    if isinstance(df, pd.DataFrame):
        # Each row becomes a {column: value} dictionary.
        return df.to_dict(orient="records")

    raise TypeError("df must be a pandas DataFrame")