**Build a date parser using basic text processing and rules.**

In [2]:
import pandas as pd
import datetime
import re

Load the dataset

In [3]:
dptc = pd.read_csv('date_parser_testcases.csv')
dptc

Unnamed: 0,Input,Expected Output
0,"The event will take place on March 5, 2023.",05/03/2023
1,Her birthday is on 07/08/1990.,07/08/1990
2,The deadline is 2022-12-31.,31/12/2022
3,We met on 1st of January 2000.,01/01/2000
4,"The concert is scheduled for 15th September, 2...",15/09/2021
...,...,...
95,"We celebrate Independence Day on 2023-07-04, a...",04/07/2023
96,The final date for submission is 30th November...,30/11/2022
97,"The annual conference is on 15th October 2023,...",15/10/2023
98,"His birthdate, noted as 1990-05-20, is in the ...",20/05/1990


Case 1 : 01/01/2000 or 1/1/2000 or 01/01/00 or 1/1/00

In [4]:
match_count = 0
found_indexes = []

date_pattern = r'\b\d{1,2}[./-]\d{1,2}[./-]\d{2,4}\b'

for idx in range(len(dptc)):
    text_entry = dptc['Input'][idx]
    if re.search(date_pattern, text_entry):
        print(text_entry)  
        found_indexes.append(idx)
        match_count += 1

Her birthday is on 07/08/1990.
Let's catch up on 02.04.2022.
The project started on 5/6/19.
They got married on 12/12/12.
Submit your report by 08/31/2021.
The new year begins on 01-01-2023.
The seminar is on 03/14/2022.
My last day is 31.08.2020.
They moved in on 12/25/2019.
The ceremony is on 11.11.2021.
The interview is on 1/2/2022.
The opening is on 10/10/2018.
The party is on 31-12-2022.
Vacation starts on 07/15/2021.
The closing date is 08/08/2020.
The exam is on 01.01.2022.
Her birthday, which she celebrates on 07/08/1990, is coming up soon.
Remember, the meeting is on 02.04.2022 at 10 AM.
Let's wrap up the project by 5/6/19, so we can relax.
They celebrated their wedding on 12/12/12 in grand style.
The submission deadline, noted as 08/31/2021, is fast approaching.
The new year's celebration begins on 01-01-2023 at midnight.
We scheduled the seminar for 03/14/2022, don't forget.
My final working day here is noted as 31.08.2020.
They officially moved in on 12/25/2019.
The ceremon

In [5]:
# Calculate initial match percentage
initial_match_percentage = match_count / len(dptc)
print(f'Initial Match Percentage: {initial_match_percentage:.2f}')

Initial Match Percentage: 0.32


In [6]:
# Handle dates with month names
maybe = {}
for i, text in enumerate(dptc['Input']):
    if i not in found_indexes:
        words = text.split()
        for word in words:
            if re.search(r'(jan ?|feb ?|mar ?|apr ?|may ?|june ?|jul ?|aug ?|sep ?|oct ?|nov ?|dec ?)', word.lower()):
                maybe[text] = word

In [7]:
print(f'Maybe Matches: {maybe}')

Maybe Matches: {'The event will take place on March 5, 2023.': 'March', 'We met on 1st of January 2000.': 'January', 'The concert is scheduled for 15th September, 2021.': 'September,', 'Christmas is on 25th Dec 2024.': 'Dec', 'The meeting is set for April 03, 2020.': 'April', 'Her appointment is on the 2nd of March, 2021.': 'March,', 'The workshop is on February 15th, 2022.': 'February', 'The course starts on 1st July 2023.': 'July', 'Independence Day is on 4th of July, 2022.': 'July,', 'The holiday starts on Dec 20th, 2021.': 'Dec', 'The conference will be held on 5th May 2023.': 'May', 'The festival begins on March 17, 2022.': 'March', 'Her graduation is on May 30th, 2022.': 'May', 'His wedding is on 6th of August, 2020.': 'August,', 'She was born on 3rd March 1998.': 'March', 'The workshop is on February 29, 2024.': 'February', 'The tournament is on June 1st, 2021.': 'June', 'The last date is 30th November 2022.': 'November', 'The conference is on 15th October 2023.': 'October', 'Th

In [8]:
# Convert month abbreviations to numbers and handle various formats
shortlist = {}
months = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]
for text, word in maybe.items():
    if len(word) >= 3 and word.lower() in months:
        shortlist[text] = word
        match_count += 1
    elif len(word) == 3:
        shortlist[text] = word
        match_count += 1

In [9]:
print(f'Shortlist: {shortlist}')

Shortlist: {'The event will take place on March 5, 2023.': 'March', 'We met on 1st of January 2000.': 'January', 'Christmas is on 25th Dec 2024.': 'Dec', 'The meeting is set for April 03, 2020.': 'April', 'The workshop is on February 15th, 2022.': 'February', 'The course starts on 1st July 2023.': 'July', 'The holiday starts on Dec 20th, 2021.': 'Dec', 'The conference will be held on 5th May 2023.': 'May', 'The festival begins on March 17, 2022.': 'March', 'Her graduation is on May 30th, 2022.': 'May', 'She was born on 3rd March 1998.': 'March', 'The workshop is on February 29, 2024.': 'February', 'The tournament is on June 1st, 2021.': 'June', 'The last date is 30th November 2022.': 'November', 'The conference is on 15th October 2023.': 'October', 'The festival is on 12th August 2024.': 'August', 'We are planning to meet on March 5, 2023, for lunch.': 'March', 'We first met on the 1st of January 2000 at the conference.': 'January', 'The concert, happening on 15th September 2021, will 

In [19]:
`# Function to convert month names to numbers
def month_name_to_number(month):
    months = {
        "jan": "01", "january": "01",
        "feb": "02", "february": "02",
        "mar": "03", "march": "03",
        "apr": "04", "april": "04",
        "may": "05",
        "jun": "06", "june": "06",
        "jul": "07", "july": "07",
        "aug": "08", "august": "08",
        "sep": "09", "september": "09",
        "oct": "10", "october": "10",
        "nov": "11", "november": "11",
        "dec": "12", "december": "12"
    }
    return months.get(month.lower(), None)

SyntaxError: invalid syntax (3103701596.py, line 1)

In [20]:
def ensure_full_year(year):
    if len(year) == 2:
        if int(year) <= int(str(pd.Timestamp.now().year)[-2:]):
            return "20" + year  
        else:
            return "19" + year 
    return year

In [21]:
def ensure_full_year(year):
    if len(year) == 2:
        if int(year) <= int(str(pd.Timestamp.now().year)[-2:]):
            return "20" + year  
        else:
            return "19" + year 
    return year

In [22]:
def parse_date(text):
    patterns = [
        r"(\d{1,2})(st|nd|rd|th)?\s+of\s+([A-Za-z]+)\s*,?\s*(\d{4})",
        r"(\d{1,2})(st|nd|rd|th)?\s+of\s+([A-Za-z]+)\s+(\d{4})",
        r"(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s*,?\s*(\d{4})", 
        r"([A-Za-z]+)\s+(\d{1,2})(st|nd|rd|th)?\s*,?\s*(\d{4})",  
        r"(\d{1,2})/(\d{1,2})/(\d{4})",  
        r"(\d{1,2})-(\d{1,2})-(\d{4})",  
        r"(\d{1,2})\.(\d{1,2})\.(\d{4})",  
        r"(\d{4})/(\d{1,2})/(\d{1,2})",  
        r"(\d{4})-(\d{1,2})-(\d{1,2})",  
        r"(\d{1,2})/(\d{1,2})/(\d{2})",  
        r"(\d{4})\.(\d{1,2})\.(\d{1,2})",
        r"([A-Za-z]+)\s+(\d{1,2})(st|nd|rd|th)?,?\s+(\d{4})",  
        r"(\d{4})\s+([A-Za-z]+)\s+(\d{1,2})(st|nd|rd|th)?"
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            try:
                if pattern == r"(\d{1,2})/(\d{1,2})/(\d{4})":
                    month, day, year = match.groups()
                    month, day = month.zfill(2), day.zfill(2)
                    return f"{day}/{month}/{year}"
                
                if pattern == r"(\d{1,2})(st|nd|rd|th)?\s+of\s+([A-Za-z]+)\s*,?\s*(\d{4})":
                    day, month, year = match.groups()[0], match.groups()[2], match.groups()[3]
                    day = day.zfill(2)  
                    month = month_name_to_number(month)  
                    return f"{day}/{month}/{year}"
                
                if pattern == r"(\d{1,2})(st|nd|rd|th)?\s+of\s+([A-Za-z]+)\s+every\s+year(?:,\s+including\s+(\d{4}))?":
                    day, month, year = match.groups()[0], match.groups()[2], match.groups()[3]
                    day = day.zfill(2)
                    month = month_name_to_number(month)
                    if not year:
                        year = datetime.now().year
                    else:
                        year = ensure_full_year(year)
                    return f"{day}/{month}/{year}"
                
                if len(match.groups()) == 4:  
                    day = match.group(1) if pattern.startswith(r"(\d{1,2})") else match.group(2)
                    day = day.zfill(2)
                    month = match.group(3) if pattern.startswith(r"(\d{1,2})") else match.group(1)
                    if month.isdigit():  
                        month = month.zfill(2)
                    else:  
                        month = month_name_to_number(month)
                    year = match.group(4) if pattern.startswith(r"(\d{1,2})") else match.group(4)
                else:  
                    if pattern.startswith(r"(\d{4})"):  
                        year, month, day = match.group(1), match.group(2).zfill(2), match.group(3).zfill(2)
                    else:  
                        day, month, year = match.group(1).zfill(2), match.group(2).zfill(2), match.group(3)

                if year and len(year) == 2:
                    year = ensure_full_year(year)
                if month and len(month) == 1:
                    month = f"0{month}"
                if day and len(day) == 1:
                    day = f"0{day}"
                
                return f"{day}/{month}/{year}"
            except Exception as e:
                print(f"Error parsing date '{text}': {e}")
    
    return None

In [23]:
dptc['parsed_date'] = dptc['Input'].apply(parse_date)
print(dptc)

Error parsing date 'The event will take place on March 5, 2023.': name 'month_name_to_number' is not defined
Error parsing date 'The event will take place on March 5, 2023.': name 'month_name_to_number' is not defined
Error parsing date 'We met on 1st of January 2000.': name 'month_name_to_number' is not defined
Error parsing date 'We met on 1st of January 2000.': name 'month_name_to_number' is not defined
Error parsing date 'The concert is scheduled for 15th September, 2021.': name 'month_name_to_number' is not defined
Error parsing date 'Christmas is on 25th Dec 2024.': name 'month_name_to_number' is not defined
Error parsing date 'The meeting is set for April 03, 2020.': name 'month_name_to_number' is not defined
Error parsing date 'The meeting is set for April 03, 2020.': name 'month_name_to_number' is not defined
Error parsing date 'Her appointment is on the 2nd of March, 2021.': name 'month_name_to_number' is not defined
Error parsing date 'The workshop is on February 15th, 2022.

In [24]:
# Function to get expected output
def expected_output(input_text):
    return dptc[dptc['Input'] == input_text]['Expected Output'].values[0]

In [25]:
print(expected_output(dptc['Input'][1]))

07/08/1990


In [26]:
correct = 0
for idx in range(len(dptc)):
    if expected_output(dptc['Input'][idx]) == dptc['parsed_date'][idx]:
        correct += 1

In [27]:
correct_percentage = correct / len(dptc)
print(f'Correct Percentage: {correct_percentage:.2f}')

Correct Percentage: 0.55
