/
wrangle.py
165 lines (148 loc) · 5.61 KB
/
wrangle.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import re
from datetime import datetime
import numpy as np
import pandas as pd
from featuretools import variable_types
from featuretools.entityset.timedelta import Timedelta
def _check_timedelta(td):
"""
Convert strings to Timedelta objects
Allows for both shortform and longform units, as well as any form of capitalization
'2 Minutes'
'2 minutes'
'2 m'
'1 Minute'
'1 minute'
'1 m'
'1 units'
'1 Units'
'1 u'
Shortform is fine if space is dropped
'2m'
'1u"
If a pd.Timedelta object is passed, units will be converted to seconds due to the underlying representation
of pd.Timedelta.
If a pd.DateOffset object is passed, it will be converted to a Featuretools Timedelta if it has one
temporal parameter. Otherwise, it will remain a pd.DateOffset.
"""
if td is None:
return td
if isinstance(td, Timedelta):
return td
elif not isinstance(td, (int, float, str, pd.DateOffset, pd.Timedelta)):
raise ValueError("Unable to parse timedelta: {}".format(td))
if isinstance(td, pd.Timedelta):
unit = 's'
value = td.total_seconds()
times = {unit: value}
return Timedelta(times, delta_obj=td)
elif isinstance(td, pd.DateOffset):
# DateOffsets
if td.__class__.__name__ == "DateOffset":
times = dict()
for td_unit, td_value in td.kwds.items():
times[td_unit] = td_value
return Timedelta(times, delta_obj=td)
# Special offsets (such as BDay)
else:
unit = td.__class__.__name__
value = td.__dict__['n']
times = dict([(unit, value)])
return Timedelta(times, delta_obj=td)
else:
pattern = '([0-9]+) *([a-zA-Z]+)$'
match = re.match(pattern, td)
value, unit = match.groups()
try:
value = int(value)
except Exception:
try:
value = float(value)
except Exception:
raise ValueError("Unable to parse value {} from ".format(value) +
"timedelta string: {}".format(td))
times = {unit: value}
return Timedelta(times)
def _check_time_against_column(time, time_column):
'''
Check to make sure that time is compatible with time_column,
where time could be a timestamp, or a Timedelta, number, or None,
and time_column is a Variable. Compatibility means that
arithmetic can be performed between time and elements of time_columnj
If time is None, then we don't care if arithmetic can be performed
(presumably it won't ever be performed)
'''
if time is None:
return True
elif isinstance(time, (int, float)):
return isinstance(time_column,
variable_types.Numeric)
elif isinstance(time, (pd.Timestamp, datetime, pd.DateOffset)):
return isinstance(time_column,
variable_types.Datetime)
elif isinstance(time, Timedelta):
return (isinstance(time_column, (variable_types.Datetime, variable_types.DatetimeTimeIndex)) or
(isinstance(time_column, (variable_types.Ordinal, variable_types.Numeric, variable_types.TimeIndex)) and
time.unit not in Timedelta._time_units))
else:
return False
def _check_time_type(time):
'''
Checks if `time` is an instance of common int, float, or datetime types.
Returns "numeric", "datetime", or "unknown" based on results
'''
time_type = None
if isinstance(time, (datetime, np.datetime64)):
time_type = variable_types.DatetimeTimeIndex
elif isinstance(time, (int, float)) or np.issubdtype(time, np.integer) or np.issubdtype(time, np.floating):
time_type = variable_types.NumericTimeIndex
return time_type
def _dataframes_equal(df1, df2):
# ^ means XOR
if df1.empty ^ df2.empty:
return False
elif not df1.empty and not df2.empty:
if not set(df1.columns) == set(df2.columns):
return False
for c in df1:
df1c = df1[c]
df2c = df2[c]
if df1c.dtype == object:
df1c = df1c.astype('unicode')
if df2c.dtype == object:
df2c = df2c.astype('unicode')
normal_compare = True
if df1c.dtype == object:
dropped = df1c.dropna()
if not dropped.empty:
if isinstance(dropped.iloc[0], tuple):
dropped2 = df2[c].dropna()
normal_compare = False
for i in range(len(dropped.iloc[0])):
try:
equal = dropped.apply(lambda x: x[i]).equals(
dropped2.apply(lambda x: x[i]))
except IndexError:
raise IndexError("If column data are tuples, they must all be the same length")
if not equal:
return False
if normal_compare:
# handle nan equality correctly
# This way is much faster than df1.equals(df2)
result = df1c == df2c
result[pd.isnull(df1c) == pd.isnull(df2c)] = True
if not result.all():
return False
return True
def _is_s3(string):
'''
Checks if the given string is a s3 path.
Returns a boolean.
'''
return "s3://" in string
def _is_url(string):
'''
Checks if the given string is an url path.
Returns a boolean.
'''
return 'http' in string