# formats.rb
require 'date'
module ValidatesTimeliness
# A date and time format regular expression generator. Allows you to
# construct a date, time or datetime format using predefined tokens in
# a string. This makes it much easier to catalogue and customize the formats
# rather than dealing directly with regular expressions. The formats are then
# compiled into regular expressions for use validating date or time strings.
#
# Formats can be added or removed to customize the set of valid date or time
# string values.
#
class Formats
# Class-level accessors for the format string lists, the expression sets
# compiled from them, the token definitions and the format proc arg map.
# NOTE(review): cattr_accessor is not defined in this file; presumably it is
# provided by ActiveSupport -- confirm it is loaded before this file.
cattr_accessor :time_formats,
:date_formats,
:datetime_formats,
:time_expressions,
:date_expressions,
:datetime_expressions,
:format_tokens,
:format_proc_args
# Format tokens:
# y = year
# m = month
# d = day
# h = hour
# n = minute
# s = second
# u = micro-seconds
# ampm = meridian (am or pm) with or without dots (e.g. am, a.m, or a.m.)
# _ = optional space
# tz = Timezone abbreviation (e.g. UTC, GMT, PST, EST)
# zo = Timezone offset (e.g. +10:00, -08:00, +1000)
#
# All other characters are considered literal. You can embed regexp in the
# format but there are no guarantees that it will remain intact. If you avoid
# the use of any token characters, and of regexp dots or backslashes as
# special characters in the regexp, it may well work as expected. For special
# characters use POSIX character classes for safety.
#
# Repeating tokens:
# x = 1 or 2 digits for unit (e.g. 'h' means an hour can be '9' or '09')
# xx = 2 digits exactly for unit (e.g. 'hh' means an hour can only be '09')
#
# Special Cases:
# yy = 2 or 4 digit year
# yyyy = exactly 4 digit year
# mmm = month name, abbreviated or full (e.g. 'Jul' or 'July')
# ddd = Day name of 3 to 9 letters (e.g. Wed or Wednesday)
# u = microseconds matches 1 to 6 digits
#
# Any other invalid combination of repeating tokens will be swallowed up
# by the next lowest length valid repeating token (e.g. yyy will be
# replaced with yy)
# Default time-only formats. When parsing without a :format option the
# formats are tried in the order listed and the first match wins.
@@time_formats = [
'hh:nn:ss',
'hh-nn-ss',
'h:nn',
'h.nn',
'h nn',
'h-nn',
'h:nn_ampm',
'h.nn_ampm',
'h nn_ampm',
'h-nn_ampm',
'h_ampm'
]
# Default date-only formats, tried in the order listed. The month-first
# forms ('m/d/yy', 'm\d\yy') come before their day-first counterparts, so an
# ambiguous string matches month-first unless those formats are removed
# (remove_us_formats does this).
@@date_formats = [
'yyyy-mm-dd',
'yyyy/mm/dd',
'yyyy.mm.dd',
'm/d/yy',
'd/m/yy',
'm\d\yy',
'd\m\yy',
'd-m-yy',
'd.m.yy',
'd mmm yy'
]
# Default combined date and time formats, tried in the order listed.
# NOTE(review): the parentheses in '(zo|tz)' are not tokens, so they reach
# the compiled regexp as a capturing group with no matching format proc
# arg -- confirm this is intended ('(?:Z|zo)' below is non-capturing).
@@datetime_formats = [
'yyyy-mm-dd hh:nn:ss',
'yyyy-mm-dd h:nn',
'yyyy-mm-dd hh:nn:ss.u',
'm/d/yy h:nn:ss',
'm/d/yy h:nn_ampm',
'm/d/yy h:nn',
'd/m/yy hh:nn:ss',
'd/m/yy h:nn_ampm',
'd/m/yy h:nn',
'ddd, dd mmm yyyy hh:nn:ss (zo|tz)', # RFC 822
'ddd mmm d hh:nn:ss zo yyyy', # Ruby time string
'yyyy-mm-ddThh:nn:ss(?:Z|zo)' # iso 8601
]
# All tokens available for format construction. Each entry maps the token
# name to an array of: the token regexp (matched against the format string),
# the validation regexp string (substituted into the compiled expression)
# and, if any, the key used to look up the format proc arg.
#
# If the token needs no format proc arg then the validation regexp must
# not have a capturing group, as all captured groups are passed to the
# format proc. The 'ddd' (day name) and 'tz' tokens are such tokens, so
# their validation regexps are deliberately non-capturing.
#
# The token regexp should only use a capture group if a 'look-behind' anchor
# is required. The first capture group will be considered a literal and put
# into the validation regexp string as-is. This is a hack.
#
# Tokens are applied in the order listed, so longer or more specific tokens
# must not be consumed by an earlier, shorter one. The single 'd' token uses
# a negative look-ahead so that it also matches a 'd' at the very end of a
# format while still ignoring the 'd's inside 'dd' and 'ddd'.
@@format_tokens = [
  { 'd' => [ /(\A|[^d])d{1}(?!d)/, '(\d{1,2})', :day ] }, #/
  { 'ddd' => [ /d{3,}/, '\w{3,9}' ] },
  { 'dd' => [ /d{2,}/, '(\d{2})', :day ] },
  { 'mmm' => [ /m{3,}/, '(\w{3,9})', :month ] },
  { 'mm' => [ /m{2}/, '(\d{2})', :month ] },
  { 'm' => [ /(\A|[^ap])m{1}/, '(\d{1,2})', :month ] },
  { 'yyyy' => [ /y{4,}/, '(\d{4})', :year ] },
  { 'yy' => [ /y{2,}/, '(\d{4}|\d{2})', :year ] },
  { 'hh' => [ /h{2,}/, '(\d{2})', :hour ] },
  { 'h' => [ /h{1}/, '(\d{1,2})', :hour ] },
  { 'nn' => [ /n{2,}/, '(\d{2})', :min ] },
  { 'n' => [ /n{1}/, '(\d{1,2})', :min ] },
  { 'ss' => [ /s{2,}/, '(\d{2})', :sec ] },
  { 's' => [ /s{1}/, '(\d{1,2})', :sec ] },
  { 'u' => [ /u{1,}/, '(\d{1,6})', :usec ] },
  { 'ampm' => [ /ampm/, '((?:[aApP])\.?[mM]\.?)', :meridian ] },
  { 'zo' => [ /zo/, '([+-]\d{2}:?\d{2})', :offset ] },
  { 'tz' => [ /tz/, '(?:[A-Z]{1,4})' ] },
  { '_' => [ /_/, '\s?' ] }
]
# Arguments which will be passed to the format proc if matched in the
# time string. The key must be the key from the format tokens. The array
# consists of the array position of the arg, the arg name, and the code to
# place in the time array slot. The position can be nil which means the arg
# won't be placed in the array.
#
# The code can be used to manipulate the arg value if required, otherwise
# should just be the arg name.
#
# The arg name becomes a parameter of the generated lambda and the code is
# evaluated inside that lambda, so a code snippet may refer to another arg
# by name (the :hour code reads the meridian arg md).
#
@@format_proc_args = {
:year => [0, 'y', 'unambiguous_year(y)'],
:month => [1, 'm', 'month_index(m)'],
:day => [2, 'd', 'd'],
:hour => [3, 'h', 'full_hour(h,md)'],
:min => [4, 'n', 'n'],
:sec => [5, 's', 's'],
:usec => [6, 'u', 'microseconds(u)'],
:offset => [7, 'z', 'offset_in_seconds(z)'],
:meridian => [nil, 'md', nil]
}
class << self
# Rebuilds the three compiled expression sets (time, date and datetime)
# from the current format string lists. Must be re-run whenever a format
# list changes. Returns the freshly compiled datetime expressions.
def compile_format_expressions
  @@time_expressions = compile_formats(time_formats)
  @@date_expressions = compile_formats(date_formats)
  @@datetime_expressions = compile_formats(datetime_formats)
end
# Parses a date, time or datetime string into a time array by trying each
# compiled format expression for the type and calling the format proc of
# the first one that matches.
#
# string  - the value to parse. Anything that is not a String is returned
#           untouched. Surrounding whitespace is ignored.
# type    - :date, :time or :datetime.
# options - :strict (default true) wraps the expression in start and end
#           anchors. When false, a date may be followed by extra text and a
#           time may be preceded by extra text.
#           :format restricts matching to that single format string; an
#           unknown format simply matches nothing.
#           :include_offset passes up to 8 captured values to the format
#           proc instead of 7.
#
# Returns the time array if a format matches, nil otherwise.
def parse(string, type, options={})
  return string unless string.is_a?(String)
  # Merge into a fresh hash so the caller's options are never modified.
  options = { :strict => true }.merge(options)
  sets = if options[:format]
    # assoc returns nil for an unknown format; drop it rather than iterate nil
    [ send("#{type}_expressions").assoc(options[:format]) ].compact
  else
    expression_set(type, string)
  end
  value = string.strip
  matches = nil
  processor = nil
  sets.each do |_format, regexp, handler|
    full = if options[:strict]
      /\A#{regexp}\Z/
    else
      case type
      when :date then /\A#{regexp}/
      when :time then /#{regexp}\Z/
      when :datetime then /\A#{regexp}\Z/
      end
    end
    matches = full.match(value)
    if matches
      processor = handler
      break
    end
  end
  return nil unless matches
  last = options[:include_offset] ? 8 : 7
  processor.call(*matches[1..last])
end
# Removes each given format string from the format list of the given type
# and then recompiles all expression sets. Raises as soon as a format is
# not present in the list; formats removed before that point stay removed.
def remove_formats(type, *remove_formats)
  remove_formats.each do |unwanted|
    next if self.send("#{type}_formats").delete(unwanted)
    raise "Format #{unwanted} not found in #{type} formats"
  end
  compile_format_expressions
end
# Adds new formats to the format list of the given type and recompiles all
# expression sets.
#
# type        - :time, :date or :datetime.
# add_formats - one or more format strings, optionally followed by an
#               options hash. The :before option names an existing format;
#               the new formats are inserted in front of it so they take
#               higher precedence. Without :before they are appended.
#
# Raises if the :before format is not found, or if a format being added is
# already in the list.
def add_formats(type, *add_formats)
  formats = self.send("#{type}_formats")
  options = add_formats.last.is_a?(Hash) ? add_formats.pop : {}
  before = options[:before]
  if before && !formats.include?(before)
    raise "Format for :before option #{before} was not found."
  end
  add_formats.each do |format|
    raise "Format #{format} is already included in #{type} formats" if formats.include?(format)
    # Recomputed on every pass: each insert shifts the :before format along,
    # which keeps the new formats in the order they were given.
    index = before ? formats.index(before) : -1
    formats.insert(index, format)
  end
  compile_format_expressions
end
# Drops every date and datetime format that starts with a 1 or 2 digit
# month token, as those are ambiguous with the European day-then-month
# style, and then recompiles the expression sets. Formats starting with the
# mmm token are kept because a month name is not ambiguous.
def remove_us_formats
  month_first = /\Am{1,2}[^m]/
  [date_formats, datetime_formats].each do |formats|
    formats.reject! { |candidate| month_first =~ candidate }
  end
  compile_format_expressions
end
private
# Compiles a format string into a validation regexp and a format proc.
#
# string_format - a format made of tokens from format_tokens plus literals.
#
# Returns a two element array: the compiled Regexp and the proc built by
# format_proc from the order in which the arg-bearing tokens appear.
# Raises a RuntimeError naming the format if anything fails to compile.
#
# Tokens are first swapped for indexed placeholders of the form %<n> and
# only expanded once every token has been found. Token positions are then
# all measured in the same string, so the recorded order always matches the
# order of the capture groups; measuring them while the string was still
# growing could place an earlier token after a later one. It also stops a
# later token from matching text inside an already substituted regexp.
# A format must therefore not contain a literal %<n> sequence itself.
def format_expression_generator(string_format)
  regexp = string_format.dup
  regexp.gsub!(/([\.\\])/, '\\\\\1') # escapes dots and backslashes
  substitutions = []
  format_tokens.each do |token|
    token_regexp, regexp_str, arg_key = *token.values.first
    # hack for lack of look-behinds. If has a capture group then is
    # considered an anchor to put straight back in the regexp string.
    regexp.gsub!(token_regexp) do
      substitutions << [regexp_str, arg_key]
      "#{$1}%<#{substitutions.size - 1}>"
    end
  end
  order = {}
  regexp.gsub!(/%<(\d+)>/) do
    placeholder = Regexp.last_match
    regexp_str, arg_key = substitutions[placeholder[1].to_i]
    order[arg_key] = placeholder.begin(0) unless arg_key.nil?
    regexp_str
  end
  return Regexp.new(regexp), format_proc(order)
rescue
  raise "The following format regular expression failed to compile: #{regexp}\n from format #{string_format}."
end
# Generates a proc which when executed maps the regexp capture groups to a
# proc argument based on order captured. A time array is built using the proc
# argument in the position indicated by the first element of the proc arg
# array.
#
# order - hash of format proc arg key to the position at which its token was
#         found; only the relative ordering of the positions is used.
#
# Returns a lambda taking one argument per key in order. It returns the time
# array with every element that is not a Float converted with to_i, so
# unfilled (nil) slots become 0.
#
def format_proc(order)
arg_map = format_proc_args
# Lambda parameter names, sorted by the position recorded for each key
args = order.invert.sort.map {|p| arg_map[p[1]][1] }
# Seven slots to begin with; assigning slot 7 (the offset) extends the array
arr = [nil] * 7
# Put each key's code snippet in its slot; keys whose slot is nil are only lambda args
order.keys.each {|k| i = arg_map[k][0]; arr[i] = arg_map[k][2] unless i.nil? }
# md||=nil defines md when the format has no meridian arg, as the hour code always reads it
proc_string = "lambda {|#{args.join(',')}| md||=nil; [#{arr.map {|i| i.nil? ? 'nil' : i }.join(',')}].map {|i| i.is_a?(Float) ? i : i.to_i } }"
# The evaluated source is assembled only from the names and code in format_proc_args
eval proc_string
end
# Compiles each format string into a [format, regexp, proc] triple,
# keeping the order of the given list.
def compile_formats(formats)
  formats.map do |string_format|
    regexp, processor = format_expression_generator(string_format)
    [string_format, regexp, processor]
  end
end
# Chooses the compiled expressions to try for a type. Datetime values are
# also matched against the date expressions so a plain date string is
# accepted; strings shorter than 11 characters try the date expressions
# first as a speed-up. Returns nil for any other type.
def expression_set(type, string)
  return date_expressions if type == :date
  return time_expressions if type == :time
  return nil unless type == :datetime
  if string.length < 11
    date_expressions + datetime_expressions
  else
    datetime_expressions + date_expressions
  end
end
# Converts an hour to its 24 hour value using the meridian, which may be
# written with dots and in any case (am, A.M., p.m). The hour is returned
# as an integer unchanged when no meridian is given. Any meridian other
# than am is treated as pm.
def full_hour(hour, meridian)
  hour = hour.to_i
  return hour if meridian.nil?
  morning = meridian.delete('.').downcase == 'am'
  # 12am is midnight (0) and 12pm is midday (12)
  return (morning ? 0 : 12) if hour == 12
  morning ? hour : hour + 12
end
# Expands a 2 digit year string to a 4 digit integer year: values below the
# threshold are placed in the 2000s, the rest in the 1900s. A year string
# of any other length is converted to an integer as given.
def unambiguous_year(year, threshold=30)
  return year.to_i unless year.length == 2
  century = year.to_i < threshold ? '20' : '19'
  "#{century}#{year}".to_i
end
# Returns the month number for a month string. A non-zero numeric string
# is returned as an integer; otherwise the capitalized value is looked up
# in the abbreviated month names and then the full month names. Returns
# nil when nothing matches.
def month_index(month)
  number = month.to_i
  return number unless number.zero?
  name = month.capitalize
  abbr_month_names.index(name) || month_names.index(name)
end
# Full month names: the I18n 'date.month_names' translation when I18n is
# defined, Date::MONTHNAMES otherwise.
def month_names
  return Date::MONTHNAMES unless defined?(I18n)
  I18n.t('date.month_names')
end
# Abbreviated month names: the I18n 'date.abbr_month_names' translation
# when I18n is defined, Date::ABBR_MONTHNAMES otherwise.
def abbr_month_names
  return Date::ABBR_MONTHNAMES unless defined?(I18n)
  I18n.t('date.abbr_month_names')
end
# Converts the digits of a fractional second (as matched by the u token)
# into whole microseconds, e.g. '5' is 500000 and '000123' is 123.
#
# usec - string of fractional digits; nil is treated as no fraction. Only
#        the first six digits are significant.
#
# Returns an Integer.
def microseconds(usec)
  # Pad to six places and read the digits as an integer. Multiplying a float
  # by 1_000_000 and truncating can land just below the intended value and
  # so return one microsecond too few.
  usec.to_s.ljust(6, '0')[0, 6].to_i
end
# Converts a timezone offset string such as '+10:00', '-08:00' or '+1000'
# into seconds.
#
# offset - the matched offset string, or nil when the offset part of the
#          format did not take part in the match.
#
# Returns the offset in seconds as a Float, negative when the string starts
# with a minus sign. Returns nil when there is no offset to convert (nil
# input, or a string without a two digit hour).
def offset_in_seconds(offset)
  return nil if offset.nil?
  hours, minutes = offset.scan(/\d\d/).map { |pair| pair.to_f }
  return nil if hours.nil?
  sign = offset =~ /\A-/ ? -1 : 1
  # minutes is nil when only the hour pair is present; nil.to_f is 0.0
  (hours + minutes.to_f / 60) * sign * 3600
end
end
end
end
# Compile the default formats when the file is loaded so that the
# expression sets are populated before the first parse.
ValidatesTimeliness::Formats.compile_format_expressions