In [1]:
import re
from datetime import datetime, timedelta
from dateutil.parser import parse
import jieba.posseg as psg  # 利用词性标注功能，获取词性为数字('m')或时间('t')的词

In [19]:
UTIL_CN_NUM = {
    '零': 0, '一': 1, '二': 2, '两': 2, '三': 3, '四': 4,
    '五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
    '0': 0, '1': 1, '2': 2, '3': 3, '4': 4,
    '5': 5, '6': 6, '7': 7, '8': 8, '9': 9
}
UTIL_CN_UNIT = {'十': 10, '百': 100, '千': 1000, '万': 10000}


fuzzy_year = "([0-9零一二两三四五六七八九十]+年)?"
fuzzy_month = "([0-9零一二两三四五六七八九十]+月)?"
fuzzy_day = "([0-9零一二两三四五六七八九十]+[号日])?"
fuzzy_noon = "([上中下午晚早]+)?"
fuzzy_hour = "([0-9零一二两三四五六七八九十百]+[点:\.时])?"
fuzzy_minute = "([0-9零一二三四五六七八九十百]+分?)?"
fuzzy_second = "([0-9零一二三四五六七八九十百]+秒)?"
fuzzy_pattern = fuzzy_year + fuzzy_month + fuzzy_day + fuzzy_noon + fuzzy_hour + fuzzy_minute + fuzzy_second
print(fuzzy_pattern)

([0-9零一二两三四五六七八九十]+年)?([0-9零一二两三四五六七八九十]+月)?([0-9零一二两三四五六七八九十]+[号日])?([上中下午晚早]+)?([0-9零一二两三四五六七八九十百]+[点:\.时])?([0-9零一二三四五六七八九十百]+分?)?([0-9零一二三四五六七八九十百]+秒)?


In [141]:
def year2dig(year):
    """
    将年份文本转换成数字
    """
    res = ""
    # 将年份文本经过数字映射处理
    for w in year:
        try:
            d = UTIL_CN_NUM[w]
            res += str(d)
        except KeyError:
            res += w
    m = re.match("\d+", res)
    if m:
        r = m.group(0)
        if len(r) == 2:
            return int(datetime.today().year / 100) * 100 + int(r)
        else:
            return int(r)
    else:
        return None

def cn2dig(src):
    """
    将其他日期文本转换成数字
    """
    if src == "":
        return None
    m = re.match("\d+", src)
    if m:
        return int(m.group(0))
    
    rsl = 0  # 累加结果容器
    num = 0  # 当前值
    unit = 1  # 当前单位
    src = src[::-1]  # 倒排才能提前发现单位变化
    for w in src:
        try:
            unit = UTIL_CN_UNIT[w]
        except KeyError:
            try:
                num = UTIL_CN_NUM[w]
            except KeyError:
                return None
            rsl += unit * num
    if rsl < unit:
        rsl += unit * 1
    return rsl    

def parse_datetime(msg):
    """
    根据时间日期词标准化输出
    """
    result_format = "%Y-%m-%d %H:%M:%S"
#     try:
#         dt = parse(msg, fuzzy=True)
#         return dt.strftime(result_format)
#     except Exception as e:
    m = re.match(fuzzy_pattern, msg)
    if m.group(0) is not None:
        res = {
            "year": m.group(1),
            "month": m.group(2),
            "day": m.group(3),
            #"is_pm": m.group(4),
            "hour": m.group(5) if m.group(5) is not None else "00",
            "minute": m.group(6) if m.group(6) is not None else "00",
            "second": m.group(7)if m.group(7) is not None else "00"
        }

        params = {}  # 在今天的日期上进行时间元素替换，替换元素的存储容器
        for name, value in res.items():
            ## 对文本进行数字转换
            if value is not None and len(value) > 0:
                tmp = None  # 文本 to 数字 的结果容器
                value = value[:-1]  # 去掉文本末尾的 ‘年’， ‘月’， ‘日’
                if name == "year":
                    tmp = year2dig(value)
                else:
                    tmp = cn2dig(value)
                if tmp is not None:
                    params[name] = tmp
        target_date = datetime.today().replace(**params)

        # 最后根据上下午再进行调整
        is_pm = m.group(4)
        if is_pm is not None:
            if is_pm in ["下午", "晚上", "中午", "傍晚"]:
                hour = target_date.time().hour
                if hour < 12:
                    target_date = target_date.replace(hour=hour + 12)
        return target_date.strftime(result_format)
    else:
        return None


def check_time_valid(word):
    """
    检查时间是否合法
    """
    m = re.match("\d+", word)
    if m:
        word1 = re.sub('[号|日]\d+$', '日', word)
        if len(word1) <= 1:
            return None
        else:
            return word1
    else:
        return None
    
def time_extract(text):
    """
    基于规则的时间匹配，主要函数
    """
    time_res = []
    word = ""
    keyDate = {"今天": 0, "明天": 1, "后天": 2}
    pos_list = psg.cut(text)
    
    # 将全部能表示日期的词都提取出来
    for w, v in pos_list:
        try:
            days = keyDate[w]
            if word != "":
                time_res.append(word)
            word = (datetime.today() + timedelta(days=days)).strftime('%Y{y}%m{m}%d{d}').format(y='年',m='月',d='日')
        except KeyError:
            if word != "":
                if v in ["m", "t"]:
                    word += w
                else:
                    time_res.append(word)
                    word = ""
            elif v in ["m", "t"]:
                word = w
    if word != "":
        time_res.append(word)
    #print(time_res)
    result = [parse_datetime(w) for w in time_res if check_time_valid(w) is not None]
    return result
        
                

In [143]:
text1 = '我要住到明天下午三点'
print(text1, time_extract(text1), sep=':')  ## fine

text2 = '预定28号的房间'
print(text2, time_extract(text2), sep=':')  ## fine

text3 = '我要从18号下午4点住到11月20号'
print(text3, time_extract(text3), sep=':')  ## fine

text4 = '我要预订今天到30的房间'
print(text4, time_extract(text4), sep=':')  ## not fine

text5 = '今天30号呵呵'
print(text5, time_extract(text5), sep=':')  ## not fine

我要住到明天下午三点:['2018-11-07 15:00:00']
预定28号的房间:['2018-11-28 00:00:00']
我要从18号下午4点住到11月20号:['2018-11-18 16:00:00', '2018-11-20 00:00:00']
我要预订今天到30的房间:['2018-11-06 00:00:00', '2018-11-06 00:03:00']
今天30号呵呵:['2018-11-06 00:03:00']
