Skip to content

Commit

Permalink
Restructured the parser a bit, made it a bit faster
Browse files Browse the repository at this point in the history
  • Loading branch information
EnTeQuAk committed Feb 14, 2012
1 parent 0453ebb commit 61e8e22
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 120 deletions.
6 changes: 3 additions & 3 deletions period/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
#-*- coding: utf-8 -*-
from period.parser import Parser

#TODO: for now only an alias for iso8601 parsing, will be expanded later.
from period.iso8601 import parse_date as parse_iso8601

def parse(string):
    """Parse ``string`` and wrap the outcome in a ``Period``.

    The text is handed to a fresh ``Parser``; the original string and
    whatever ``Parser.parse`` returned for it are then passed to ``Period``.

    :param string: the text to parse.
    :return: ``Period(string, <result of Parser.parse>)``.
    """
    parser = Parser()
    return Period(string, parser.parse(string))


Expand Down
82 changes: 82 additions & 0 deletions period/constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import re
import itertools
from datetime import date, timedelta

#: Tagged regular expressions as ``(name, pattern)`` pairs, sorted by
#: priority (highest first).
#:
#: The patterns describe ISO 8601 date notations.  Strict parsing would
#: require fixing the number of year digits up front; that is deliberately
#: not done here, so the matching is a bit fuzzy and accepts a wider range
#: of inputs.  The price is that the individual ISO date formats cannot
#: always be told apart -- they are merely all "supported".
#: ISO 8601 expanded date formats allow an arbitrary number of year digits
#: with a leading +/- sign.
EXPRESSIONS = [
    # --- 1. complete calendar dates ---
    # Extended format: YYYY-MM-DD or +-YYYYYY-MM-DD
    ("complete_date",
     r"(?P<sign>[+-]){0,1}(?P<year>[0-9]{4,6})"
     r"-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})"),

    # Basic format, unsigned with a four-digit year: YYYYMMDD
    ("basic_date",
     r"(?P<sign>[+-]){0}(?P<year>[0-9]{4})"
     r"(?P<month>[0-9]{2})(?P<day>[0-9]{2})"),

    # Basic format, signed with a six-digit year: +-YYYYYYMMDD
    ("basic_date",
     r"(?P<sign>[+-]){1}(?P<year>[0-9]{6})"
     r"(?P<month>[0-9]{2})(?P<day>[0-9]{2})"),

    # --- 2. complete week dates ---
    # Extended format: YYYY-Www-D or +-YYYYYY-Www-D
    ("complete_week_date",
     r"(?P<sign>[+-]){0,1}(?P<year>[0-9]{4,6})"
     r"-W(?P<week>[0-9]{2})-(?P<day>[0-9]{1})"),

    # Basic format: YYYYWwwD or +-YYYYYYWwwD
    ("basic_week_date",
     r"(?P<sign>[+-]){0,1}(?P<year>[0-9]{4,6})W"
     r"(?P<week>[0-9]{2})(?P<day>[0-9]{1})"),

    # --- 3. ordinal dates ---
    # Extended format: YYYY-DDD or +-YYYYYY-DDD
    ("ordinal_date",
     r"(?P<sign>[+-]){0,1}(?P<year>[0-9]{4,6})"
     r"-(?P<day>[0-9]{3})"),

    # Basic format: YYYYDDD or +-YYYYYYDDD
    ("basic_date_format",
     r"(?P<sign>[+-]){0,1}(?P<year>[0-9]{4,6})"
     r"(?P<day>[0-9]{3})"),

    # --- 4. week dates with reduced accuracy (no weekday) ---
    # Extended format: YYYY-Www or +-YYYYYY-Www
    ("week_date",
     r"(?P<sign>[+-]){0,1}(?P<year>[0-9]{4,6})"
     r"-W(?P<week>[0-9]{2})"),

    # Basic format: YYYYWww or +-YYYYYYWww
    ("basic_reduced_accuracy_week_date",
     r"(?P<sign>[+-]){0,1}(?P<year>[0-9]{4,6})W"
     r"(?P<week>[0-9]{2})"),

    # --- 5. specific month, reduced accuracy ---
    # YYYY-MM or +-YYYYYY-MM
    ("month_date",
     r"(?P<sign>[+-]){0,1}(?P<year>[0-9]{4,6})"
     r"-(?P<month>[0-9]{2})"),

    # --- 7. specific century, reduced accuracy ---
    # Both patterns are anchored with ``$`` and are listed ahead of the
    # year pattern below.
    # Signed, four digits: +-YYYY
    ("century_date",
     r"(?P<sign>[+-]){1}"
     r"(?P<century>[0-9]{4})$"),

    # Unsigned, two digits: YY
    ("century_date",
     r"(?P<sign>[+-]){0}"
     r"(?P<century>[0-9]{2})$"),

    # --- 6. specific year, reduced accuracy ---
    # YYYY or +-YYYYYY
    ("year_date",
     r"(?P<sign>[+-]){0,1}(?P<year>[0-9]{4,6})"),
]


# Swap every raw pattern string for its compiled form, in place, so the
# list object itself stays the same.  Compiling up front eases debugging
# and boosts performance.
EXPRESSIONS[:] = [
    (name, re.compile(pattern)) for name, pattern in EXPRESSIONS
]
117 changes: 0 additions & 117 deletions period/iso8601.py

This file was deleted.

61 changes: 61 additions & 0 deletions period/parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#-*- coding: utf-8 -*-
import re
import itertools
from datetime import date, timedelta
from period.constants import EXPRESSIONS


class Parser(object):
    """Turn date strings into :class:`datetime.date` objects.

    The entries of ``EXPRESSIONS`` are tried in order; the first expression
    that matches the start of the input is passed to the handler registered
    for its name in :attr:`rules`.
    """

    def __init__(self):
        #: Maps each handler to the names of the expressions it is
        #: responsible for.  These rules are named after predefined
        #: constants in period.constants.
        self.rules = {
            self.handle_date: (
                'complete_date', 'basic_date', 'basic_week_date',
                'ordinal_date', 'basic_date_format', 'week_date',
                'basic_reduced_accuracy_week_date', 'month_date',
                'year_date', 'century_date', 'complete_week_date',
            ),
        }

    def parse(self, string):
        """Parse ``string`` with the first matching expression.

        :param string: the text to parse.
        :return: whatever the responsible handler returns for the match,
                 or ``None`` if ``string`` is empty or nothing matches.
        :raises KeyError: if an expression matches whose name has no
                          handler in :attr:`rules`.
        """
        if not string:
            return None

        # Invert ``self.rules`` into an expression-name -> handler table.
        # Done after the guard above so empty input costs nothing.
        handlers = {}
        for handler, names in self.rules.items():
            for name in names:
                handlers[name] = handler

        for name, expr in EXPRESSIONS:
            match = expr.match(string)
            if match:
                return handlers[name](match)
        return None

    def handle_date(self, match):
        """Build a :class:`datetime.date` from a date-expression match.

        The named groups *defined by the matched pattern* decide how the
        match is read:

        * ``century``: January 1st of year ``century * 100 + 1``.
        * ``month``: a calendar date; a missing day, or a month of ``0``,
          falls back to ``1``.
        * ``week`` (no ``month``): an ISO week date; a missing weekday
          falls back to ``1``.
        * ``day`` only: an ordinal date (day of the year).
        * none of these: January 1st of the year.

        :param match: a regex match object providing at least the ``sign``
                      group plus ``century`` or ``year``.
        :return: the resulting :class:`datetime.date`.
        :raises ValueError: raised by ``date`` for values it cannot
                            represent, which includes every negative year.
        """
        groups = match.groupdict()
        # FIXME: negative dates not possible with python standard types
        sign = -1 if groups['sign'] == '-' else 1

        if 'century' in groups:
            return date(sign * (int(groups['century']) * 100 + 1), 1, 1)

        year = sign * int(groups['year'])

        if 'month' in groups:
            # Year-month or complete calendar date.
            day = groups.get('day')
            return date(year, int(groups['month']) or 1,
                        1 if day is None else int(day))

        first_day = date(year, 1, 1)

        if 'week' in groups:
            iso_week, iso_weekday = first_day.isocalendar()[1:3]
            weekday = int(groups.get('day') or 1)
            # When January 1st already lies in ISO week 1, that week must
            # not be counted a second time.
            weeks = int(groups['week']) - (1 if iso_week == 1 else 0)
            # Step back to the Monday of January 1st's week, then forward
            # to the requested weekday.
            return first_day + timedelta(weeks=weeks,
                                         days=weekday - iso_weekday)

        if 'day' in groups:
            # Ordinal date: day 1 is January 1st itself.
            return first_day + timedelta(days=int(groups['day']) - 1)

        # Plain year.
        return first_day

0 comments on commit 61e8e22

Please sign in to comment.