# Задание:

В эпикризах, которые у нас представлены, формат дат, относительно простой и легко поддаётся "чистке" и подготовке.

В домашнем задании, я хочу, чтобы вы отработали навыки использования регулярных выражений для подготовки дат.

Извлеките даты из предоставленного файла dates.txt.

Преобразование/нормализация извлеченных дат в желаемый формат гггг-ММ-дд

Пример: дата извлечения 27 сентября 2021 г.; Форматированная дата 27.09.2021

Запишите результаты в файл .txt


In [118]:
import re
from datetime import datetime

# Регулярные выражения

In [119]:
regex_patterns = {
    r'\b(\d{1,2})/(\d{1,2})/(\d{4})\b': '%m/%d/%Y',   # мм/дд/гггг
    r'\b(\d{1,2})/(\d{1,2})/(\d{2})\b': '%m/%d/%y',   # мм/дд/гг
    r'\b(\d{1,2})/(\d{4})\b': '%m/%Y',                # мм/гггг
    r'\b([A-Za-z]+),\s(\d{4})\b': '%B, %Y'            # ммм, гггг
}


In [120]:
'''regex_patterns = {
    r'\b(\d{1,2})/(\d{1,2})/(\d{4})(\D|$)': '%m/%d/%Y',   # мм/дд/гггг с разделителем или концом строки
    r'\b(\d{1,2})/(\d{1,2})/(\d{2})(\D|$)': '%m/%d/%y',   # мм/дд/гг с разделителем или концом строки
    r'\b(\d{1,2})/(\d{4})(\D|$)': '%m/%Y',                # мм/гггг с разделителем или концом строки
    r'\b([A-Za-z]+),\s(\d{4})(\D|$)': '%B, %Y'            # ммм, гггг с разделителем или концом строки
}'''

"regex_patterns = {\n    r'\x08(\\d{1,2})/(\\d{1,2})/(\\d{4})(\\D|$)': '%m/%d/%Y',   # мм/дд/гггг с разделителем или концом строки\n    r'\x08(\\d{1,2})/(\\d{1,2})/(\\d{2})(\\D|$)': '%m/%d/%y',   # мм/дд/гг с разделителем или концом строки\n    r'\x08(\\d{1,2})/(\\d{4})(\\D|$)': '%m/%Y',                # мм/гггг с разделителем или концом строки\n    r'\x08([A-Za-z]+),\\s(\\d{4})(\\D|$)': '%B, %Y'            # ммм, гггг с разделителем или концом строки\n}"

# Корректировка названий месяцев

In [121]:
def correct_month_names(text):
    corrections = {
        'Jan': 'January', 'Feb': 'February', 'Mar': 'March', 'Apr': 'April', 'Jun': 'June',
        'Jul': 'July', 'Aug': 'August', 'Sep': 'September', 'Oct': 'October', 'Nov': 'November', 'Dec': 'December',
        'Marc': 'March'
    }
    for wrong, correct in corrections.items():
        text = re.sub(r'\b' + wrong + r'\b', correct, text, flags=re.IGNORECASE)
    return text

# Функция для преобразования дат

In [122]:
def convert_date(match, pattern):
    try:
        if pattern == '%m/%d/%Y':
            month, day, year = match.groups()
            return f"{year}-{int(month):02d}-{int(day):02d}"
        elif pattern == '%m/%d/%y':
            month, day, year = match.groups()
            year = '19' + year if int(year) >= 50 else '20' + year
            return f"{year}-{int(month):02d}-{int(day):02d}"
        elif pattern == '%m/%Y':
            month, year = match.groups()
            return f"{year}-{int(month):02d}-01"
        elif pattern == '%B, %Y':
            month, year = match.groups()
            datetime_object = datetime.strptime(f"{month} {year}", "%B %Y")
            return datetime_object.strftime('%Y-%m-%d')
    except ValueError:
        return None
    return None

# Чтение файла

In [123]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [124]:
# Путь к данным
path_to_dates_txt_file = "drive/MyDrive/Применение NLP в Здравоохранении/dates.txt"

In [125]:
with open(path_to_dates_txt_file) as file:
  lines = file.readlines()

print(*lines[:6],sep='\n')  # print the first 5 rows in the file

0	: Na 130 on 7/21/1999Pertinent Medical Review of Systems Constitutional:

1	"""Hx of suicidal ideation and last felt suicidal in Marc, 1981. No Hx of suicide attempts. ""Felt that after being sober for 5 years and in custody for 22 months that I just wasn't getting it. I couldn't do it. He had my parents come in the following week and we talked and that's when we decided I should go on the methadone clinic,""Hx of Non Suicidal Self Injurious Behavior: No"

2	s 03/1980 Positive PPD: treated with INH for 6 months

3	: 7/11/90CPT code: 99205

4	: 6/02/1986CPT Code: 90792: With medical services

5	s  25 yo married female with hx of low grade anxiety, perfectionism and attention presents with increasing sx of depression after being laid off from her job in Mar 2012. This intake interview was conducted as a one time consult intake with the goal of referral to appropriate services.



# Преобразование дат

In [126]:
extracted_dates = {}

for line in lines:
    line_content = line.split('\t')
    text = correct_month_names(line_content[1])
    extracted_dates[line_content[0]] = []
    for pattern, date_format in regex_patterns.items():
        for match in re.finditer(pattern, text):
            converted_date = convert_date(match, date_format)
            if converted_date:
                extracted_dates[line_content[0]].append(converted_date)

In [127]:
for key, value in extracted_dates.items():
    print(f"{key}: {value}")

0: []
1: ['1981-03-01']
2: ['1980-03-01']
3: []
4: []
5: []
6: []
7: []
8: []
9: []
10: []
11: []
12: []
13: []
14: []
15: []
16: []
17: ['1976-09-30']
18: []
19: []
20: []
21: []
22: []
23: ['1984-09-01']
24: []
25: ['1977-02-14']
26: ['1997-07-13']
27: []
28: []
29: ['1971-06-10']
30: ['1993-12-12', '1993-12-01']
31: []
32: []
33: []
34: ['2010-08-01']
35: ['1979-10-06']
36: []
37: []
38: []
39: []
40: []
41: []
42: ['1983-01-01']
43: ['1977-08-30']
44: ['1972-04-01']
45: []
46: []
47: []
48: ['1977-08-21']
49: ['1998-01-27']
50: ['1982-05-02']
51: []
52: []
53: []
54: []
55: []
56: []
57: ['2013-05-18', '2013-18-01']
58: ['1980-01-02']
59: []
60: []
61: ['1977-05-21']
62: ['1979-02-12']
63: []
64: ['1981-07-09']
65: []
66: []
67: []
68: []
69: []
70: []
71: ['1975-09-01']
72: ['1979-08-17']
73: []
74: []
75: []
76: []
77: []
78: []
79: []
80: []
81: ['1992-05-19']
82: []
83: []
84: ['1995-08-02']
85: []
86: []
87: []
88: []
89: []
90: []
91: []
92: []
93: ['1985-01-06']
94: []
95: [

# Подсчет процента найденных дат

In [129]:
# Подсчет общего числа строк и строк с найденными датами
total_lines = len(lines)
lines_with_dates = len([key for key, value in extracted_dates.items() if value])
percentage_with_dates = (lines_with_dates / total_lines) * 100

In [130]:
percentage_with_dates

23.400000000000002

отметка в 30% не достигнута, обрабатываем текст дальше

изменим функцию для преобразования дат, добавив разделение слитых конструкций слово-дата (7/21/1999Pertinent)

# Улучшаем результат

In [131]:
regex_patterns = {
    r'\b(\d{1,2})/(\d{1,2})/(\d{4})\b': '%m/%d/%Y',   # мм/дд/гггг
    r'\b(\d{1,2})/(\d{1,2})/(\d{2})\b': '%m/%d/%y',   # мм/дд/гг
    r'\b(\d{1,2})/(\d{4})\b': '%m/%Y',                # мм/гггг
    r'\b([A-Za-z]+),\s(\d{4})\b': '%B, %Y',            # ммм, гггг
    r'\b(\d{1,2})/(\d{1,2})/(\d{4})(\D|$)': '%m/%d/%Y',   # мм/дд/гггг с разделителем или концом строки
    r'\b(\d{1,2})/(\d{1,2})/(\d{2})(\D|$)': '%m/%d/%y',   # мм/дд/гг с разделителем или концом строки
    r'\b(\d{1,2})/(\d{4})(\D|$)': '%m/%Y',                # мм/гггг с разделителем или концом строки
    r'\b([A-Za-z]+),\s(\d{4})(\D|$)': '%B, %Y'            # ммм, гггг с разделителем или концом строки
}

In [132]:
def convert_date(match, pattern):
    try:
        if pattern == '%m/%d/%Y':
            month, day, year = match.groups()[:3]
            return f"{year}-{int(month):02d}-{int(day):02d}"
        elif pattern == '%m/%d/%y':
            month, day, year = match.groups()[:3]
            year = '19' + year if int(year) >= 50 else '20' + year
            return f"{year}-{int(month):02d}-{int(day):02d}"
        elif pattern == '%m/%Y':
            month, year = match.groups()[:2]
            return f"{year}-{int(month):02d}-01"
        elif pattern == '%B, %Y':
            month, year = match.groups()[:2]
            datetime_object = datetime.strptime(f"{month} {year}", "%B %Y")
            return datetime_object.strftime('%Y-%m-%d')
    except ValueError:
        return None
    return None

In [133]:
extracted_dates = {}
total_dates = 0

In [134]:
def extract_and_convert_date(text, patterns):
    for pattern, date_format in patterns.items():
        match = re.search(pattern, text)
        if match:
            date_str = convert_date(match, date_format)
            remaining_text = text[match.end():]
            return date_str, remaining_text
    return None, text

In [135]:
for line in lines:
    line_content = line.split('\t')
    text = correct_month_names(line_content[1])
    extracted_dates[line_content[0]] = []
    while text:
        date_str, text = extract_and_convert_date(text, regex_patterns)
        if date_str:
            extracted_dates[line_content[0]].append(date_str)
            total_dates += 1
        else:
            break

In [136]:
extracted_dates

{'0': ['1999-07-21'],
 '1': ['1981-03-01'],
 '2': ['1980-03-01'],
 '3': ['1990-07-11'],
 '4': ['1986-06-02'],
 '5': [],
 '6': ['1985-06-18'],
 '7': [],
 '8': [],
 '9': ['1986-08-01'],
 '10': [],
 '11': ['1972-09-17'],
 '12': [],
 '13': ['1994-11-24'],
 '14': [],
 '15': ['1992-08-16'],
 '16': ['1997-12-12'],
 '17': ['1976-09-30'],
 '18': [],
 '19': [],
 '20': [],
 '21': ['1997-12-08'],
 '22': ['1994-09-13'],
 '23': ['1984-09-01'],
 '24': ['1986-04-18'],
 '25': ['1977-02-14'],
 '26': ['1997-07-13'],
 '27': [],
 '28': ['1988-04-12'],
 '29': ['1971-06-10'],
 '30': ['1993-12-12'],
 '31': [],
 '32': ['1981-03-26'],
 '33': [],
 '34': ['2010-08-01'],
 '35': ['1979-10-06'],
 '36': ['1977-08-31'],
 '37': [],
 '38': ['1994-09-09'],
 '39': ['1975-12-01'],
 '40': ['1979-09-21'],
 '41': ['1980-03-18'],
 '42': ['1983-01-01'],
 '43': ['1977-08-30'],
 '44': ['1972-04-01'],
 '45': ['1995-05-12'],
 '46': ['1990-06-08'],
 '47': ['2008-02-09'],
 '48': ['1977-08-21'],
 '49': ['1998-01-27'],
 '50': ['1982-05

In [137]:
total_lines = len(lines)
lines_with_dates = len([key for key, value in extracted_dates.items() if value])
percentage_with_dates = (lines_with_dates / total_lines) * 100


In [138]:
percentage_with_dates

74.8

процент обработанных дат 74%

# Сохранение в файл

In [139]:
with open("results.txt", "w") as file:
    for key, value in extracted_dates.items():
        file.write(f"{key}: {', '.join(value)}\n")