In [1]:
import dateparser
from utils.myutils import load_object, save_object

In [2]:
language = 'en'

In [3]:
def get_date_form(date, date_str):
    """Classify how much of a date the text ``date_str`` spells out.

    Returns one of:
        'y'   -- only a year
        'ym'  -- year and month
        'ymd' -- full date

    ``date`` is accepted for interface compatibility but is not used here;
    the decision is based on ``date_str`` alone.

    Reads the module-level ``language`` for the dateparser call.
    """
    # Anything of at most four characters is treated as a bare year.
    if len(date_str) <= 4:
        return 'y'

    # With STRICT_PARSING, dateparser is documented to return no result when
    # day, month or year is missing, so None is taken to mean "no day given".
    # NOTE(review): a longer year-only string (e.g. "in 1986") would also end
    # up as 'ym' via this branch -- confirm whether such inputs occur.
    strict_result = dateparser.parse(
        date_str,
        languages=[language],
        settings={'STRICT_PARSING': True},
    )
    return 'ym' if strict_result is None else 'ymd'

In [4]:
def get_date_unified(date_str):
    """Parse ``date_str`` and return it in a unified textual form.

    Returns:
        A tuple ``(date_unified, form)`` where ``form`` is the value returned
        by ``get_date_form`` ('y', 'ym' or 'ymd') and ``date_unified`` is
        'YYYY', 'YYYY-MM' or 'YYYY-MM-DD' accordingly; or ``None`` when the
        text is rejected or cannot be parsed.

    The text is rejected (``None``) when:
        * dateparser yields no result;
        * the parsed year does not literally occur in ``date_str``
          (presumably the year was inferred rather than written -- confirm);
        * the parsed time of day is not exactly midnight (presumably a
          relative or time-of-day expression -- confirm);
        * parsing raises TypeError, ValueError, OverflowError or OSError.
          dateparser does raise these on some inputs (e.g. "2300 years",
          "42, 1980"), so the handler covers the parse calls themselves and
          not only the string formatting. The failure is printed and the
          function returns ``None`` instead of propagating.

    Reads the module-level ``language`` for the dateparser call.
    """
    try:
        date = dateparser.parse(date_str, languages=[language])

        if date is None:
            return None

        if str(date.year) not in date_str:
            return None

        if date.hour != 0 or date.minute != 0 or date.second != 0:
            return None

        # get_date_form runs a second (strict) parse, which can raise as well.
        form = get_date_form(date, date_str)

        # Emit only as many components as the detected form covers.
        date_unified = str(date.year).zfill(4)
        if form != 'y':
            date_unified += '-' + str(date.month).zfill(2)
            if form != 'ym':
                date_unified += '-' + str(date.day).zfill(2)
    except (TypeError, ValueError, OverflowError, OSError) as e:
        # Report the offending input text; the parsed value may not exist yet.
        print(str(type(e)) + ': ' + str(e) + ' / ' + str(date_str))
        return None

    return (date_unified, form)

In [5]:
# Previously stored inputs, loaded by name through the project helper.
# As used by the extraction loop: spacy_date_dict[entity] is iterated for
# index pairs into the abstract text, and abstract_dict maps entity -> abstract.
spacy_date_dict = load_object('spacy_date_dict')
abstract_dict = load_object('abstract_dict')

# Result container: filled per entity with the normalised dates found.
date_dict = {}

In [6]:
# For every abstract, normalise each detected date span and keep the ones
# that get_date_unified accepts, together with the span's start offset.
for entity, abstract in abstract_dict.items():

    # Every entity gets an entry, even when no date survives.
    entity_dates = date_dict[entity] = []

    for date_idx in spacy_date_dict[entity]:
        span_start, span_end = date_idx[0], date_idx[1]
        date_text = abstract[span_start:span_end]

        try:
            date_unified = get_date_unified(date_text)
        except (TypeError, ValueError, OverflowError, OSError) as e:
            # Parsing failures are reported and the span is skipped.
            print(str(type(e)) + ': ' + str(e) + ' / ' + str(date_text))
            continue

        if date_unified is not None:
            entity_dates.append((date_unified, span_start))

<class 'TypeError'>: Required argument 'day' (pos 3) not found / 42, 1980
<class 'ValueError'>: year -282 is out of range / 2300 years
<class 'ValueError'>: year -982 is out of range / 3000 years ago
<class 'ValueError'>: year -20 is out of range / the 2038th year
<class 'ValueError'>: year -5962 is out of range / 7980 years
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 82-0192
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 82-0192
<class 'ValueError'>: year -22 is out of range / the 2040th year
<class 'ValueError'>: year -21 is out of range / the 2039th year
<class 'ValueError'>: year -1 is out of range / the 2019th year
<class 'ValueError'>: year -3 is out of range / the 2021st year
<class 'ValueError'>: year 0 is out of range / the 2018th year
<class 'ValueError'>: year -2 is out of range / the 2020th year
<class 'ValueError'>: year -6 is out of range / the 2024th year
<class 'ValueError'>: year -4 is out of range / the 2022nd year
<class 'ValueE

<class 'ValueError'>: year -482 is out of range / 2500 years
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 92, 2000
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 50 of 2013
<class 'ValueError'>: year -2682 is out of range / about 4700 years ago
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 36-2903
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 36-2903
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 57 in 1982
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 91, 2004
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 74), 1799
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 84), 1824
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 50 (1723
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 60 (1727
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 74 (1750
<class 'TypeError'>: Required argument 'day' (pos

<class 'ValueError'>: year -1982 is out of range / 3000–4000 years ago
<class 'ValueError'>: year -2212 is out of range / 4230 years
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 94 of 1996
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 47-3220
<class 'ValueError'>: year -4682 is out of range / 5600–6700 years ago
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 45-1174
<class 'ValueError'>: year -482 is out of range / 2500 years
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 47-3078
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 47-3216
<class 'ValueError'>: year -4982 is out of range / 7000 years
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 43 in 2001
<class 'ValueError'>: year -696 is out of range / 5.2714 years
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 50 (1994
<class 'TypeError'>: '<=' not supported between instances of 'str' and 'int' / 1080 – aft
<c

<class 'TypeError'>: Required argument 'day' (pos 3) not found / 45 of 1953
<class 'ValueError'>: year -4482 is out of range / about 6500 years
<class 'ValueError'>: year -5982 is out of range / 8000 years ago
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 90 in 2014
<class 'OverflowError'>: date value out of range / 4.348125 weeks
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 86-1044
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 40 in 1921
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 47-7046
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 47-7167
<class 'ValueError'>: year -3982 is out of range / about 6000 years ago
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 80-0172
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 66 1966
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 53 in 1964
<class 'ValueError'>: year -4982 is out of range / 7000 yea

<class 'ValueError'>: year -5982 is out of range / 8000 years ago
<class 'ValueError'>: year -2982 is out of range / 5000 years
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 48 of 1968
<class 'TypeError'>: '<=' not supported between instances of 'str' and 'int' / 1471 – aft
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 98, 1516 and
<class 'ValueError'>: year -2982 is out of range / 5000 years ago
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 94 in 2006
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 96 in 1996
<class 'ValueError'>: year -2982 is out of range / about 5000 years ago
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 50/2007
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 47-3115
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 50 2008
<class 'ValueError'>: year -4982 is out of range / 5000–7000 years ago
<class 'TypeError'>: Required argument 'day' (po

<class 'TypeError'>: Required argument 'day' (pos 3) not found / 33, 1996
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 82 (1993
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 89 in 1945
<class 'OverflowError'>: date value out of range / 1231.1780240 days
<class 'OverflowError'>: date value out of range / 1321.9792938 days
<class 'OverflowError'>: date value out of range / 1362.7556828 days
<class 'OverflowError'>: date value out of range / 1375.3864884 days
<class 'OverflowError'>: date value out of range / 1283.9726227 days
<class 'ValueError'>: year -4982 is out of range / 7000 years ago
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 39, 1973
<class 'OverflowError'>: date value out of range / 1685.9164766 days
<class 'TypeError'>: '<=' not supported between instances of 'str' and 'int' / 1273/74 – aft
<class 'ValueError'>: year -3882 is out of range / 5900 years
<class 'ValueError'>: year -982 is out of range / 3000 years
<class

<class 'OverflowError'>: date value out of range / 0.56686776 days
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 81-7002-
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 99, 1989
<class 'TypeError'>: '<=' not supported between instances of 'str' and 'int' / 1292 – aft
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 53rd in 2004
<class 'ValueError'>: year -3182 is out of range / about 5200 years
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 44-0004
<class 'ValueError'>: year -1982 is out of range / about 4000 years ago
<class 'ValueError'>: year -2982 is out of range / 5000 years ago
<class 'ValueError'>: year -6982 is out of range / 9000 years
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 48, 1964
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 57 in 1932
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 46 of 2000
<class 'ValueError'>: year -482 is out of range / 

<class 'ValueError'>: year -1182 is out of range / 3200 years ago
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 76 of 1993
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 78, 4013
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 78, 1045
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 65-0965
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 94-8137
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 45-1109
<class 'ValueError'>: year -982 is out of range / about 3000 years
<class 'ValueError'>: year -2982 is out of range / 5000 Years
<class 'ValueError'>: invalid literal for int() with base 10: '⁰' / 120⁰
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 86-1197
<class 'ValueError'>: year -482 is out of range / about 2500 years ago
<class 'ValueError'>: year -3582 is out of range / 5600 years ago
<class 'ValueError'>: year -97982 is out of range / 100000 years
<class 'ValueE

<class 'TypeError'>: '<=' not supported between instances of 'str' and 'int' / April 1859 – aft
<class 'ValueError'>: year -582 is out of range / 2600 years ago
<class 'TypeError'>: Required argument 'day' (pos 3) not found / the 52nd and 1061st
<class 'ValueError'>: year -2982 is out of range / 5000 years
<class 'ValueError'>: year -31082 is out of range / 33100 years
<class 'ValueError'>: year -27882 is out of range / 29900 years
<class 'TypeError'>: Required argument 'day' (pos 3) not found / the 68th (2012
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 85-2114
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 95-1707
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 43 of 1999
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 48 in 2008
<class 'ValueError'>: year -982 is out of range / about 3000 years ago
<class 'TypeError'>: Required argument 'day' (pos 3) not found / 88-1490
<class 'ValueError'>: year -4982 is out of

In [7]:
# Store the per-entity date lists under the name 'date_dict' via the project helper.
save_object(date_dict, 'date_dict')

In [8]:
# Display the result: entity -> list of ((date_unified, form), start_offset).
# NOTE(review): this renders the entire dict; consider showing only a small
# sample to keep the saved notebook output short.
date_dict

{'<http://dbpedia.org/resource/Animalia_(book)>': [(('1986', 'y'), 90),
  (('1996', 'y'), 139),
  (('2012', 'y'), 179),
  (('1996', 'y'), 299)],
 '<http://dbpedia.org/resource/Actrius>': [(('1997', 'y'), 34)],
 '<http://dbpedia.org/resource/Alain_Connes>': [(('1947-04-01', 'ymd'), 39)],
 '<http://dbpedia.org/resource/Agricultural_science>': [],
 '<http://dbpedia.org/resource/International_Atomic_Time>': [(('2015-06-30',
    'ymd'),
   395),
  (('1972', 'y'), 605)],
 '<http://dbpedia.org/resource/Astronomer>': [],
 '<http://dbpedia.org/resource/An_American_in_Paris>': [(('1928', 'y'), 110),
  (('1928-12-13', 'ymd'), 595)],
 '<http://dbpedia.org/resource/Allan_Dwan>': [],
 '<http://dbpedia.org/resource/List_of_Atlas_Shrugged_characters>': [],
 '<http://dbpedia.org/resource/Achilles>': [],
 '<http://dbpedia.org/resource/A>': [],
 '<http://dbpedia.org/resource/Arraignment>': [],
 '<http://dbpedia.org/resource/Answer>': [],
 '<http://dbpedia.org/resource/Appellate_court>': [],
 '<http://dbp