# Date Parser using Basic Text Processing and Rules

This notebook demonstrates how to build a date parser using basic text processing and regex. The goal is to extract day, month, and year information from a given text and present it in the DD/MM/YYYY format. No external libraries or machine learning models are used.

In [5]:
import re

def parse_date(text):
    # Month mapping to convert month names to numbers
    month_mapping = {
        'january': '01', 'jan': '01',
        'february': '02', 'feb': '02',
        'march': '03', 'mar': '03',
        'april': '04', 'apr': '04',
        'may': '05',
        'june': '06', 'jun': '06',
        'july': '07', 'jul': '07',
        'august': '08', 'aug': '08',
        'september': '09', 'sep': '09', 'sept': '09',
        'october': '10', 'oct': '10',
        'november': '11', 'nov': '11',
        'december': '12', 'dec': '12'
    }

    # Normalize text - lowercase for better pattern matching
    text_lower = text.lower()

    # Define patterns for different date formats
    patterns = [
        # DD Month YYYY or DD of Month YYYY
        r'(?i)(\d{1,2})(?:st|nd|rd|th)?\s+(?:of\s+)?(january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|jun|jul|aug|sep|sept|oct|nov|dec)(?:\s*,?\s*|\s+)(\d{4}|\d{2})',
        
        # Month DD, YYYY or Month DD YYYY
        r'(?i)(january|february|march|april|may|june|july|august|september|october|november|december|jan|feb|mar|apr|jun|jul|aug|sep|sept|oct|nov|dec)\s+(\d{1,2})(?:st|nd|rd|th)?(?:\s*,?\s*|\s+)(\d{4}|\d{2})',

        # MM/DD/YYYY or DD/MM/YYYY
        r'(\d{1,2})/(\d{1,2})/(\d{4}|\d{2})',

        # YYYY/MM/DD
        r'(\d{4})/(\d{1,2})/(\d{1,2})',

        # MM-DD-YYYY or DD-MM-YYYY
        r'(\d{1,2})-(\d{1,2})-(\d{4}|\d{2})',

        # YYYY-MM-DD
        r'(\d{4})-(\d{1,2})-(\d{1,2})',

        # DD.MM.YYYY or MM.DD.YYYY
        r'(\d{1,2})\.(\d{1,2})\.(\d{4}|\d{2})',

        # YYYY.MM.DD
        r'(\d{4})\.(\d{1,2})\.(\d{1,2})'
    ]

    # Process text to find a date
    for pattern in patterns:
        matches = re.findall(pattern, text)
        if matches:
            match = matches[0]  # Use the first match

            # Handle different pattern formats
            if pattern == patterns[0]:  # DD Month YYYY
                day, month, year = match
                month = month_mapping[month.lower()]
                
            elif pattern == patterns[1]:  # Month DD, YYYY
                month, day, year = match
                month = month_mapping[month.lower()]
                
            elif pattern == patterns[2]:  # MM/DD/YYYY or DD/MM/YYYY
                first, second, year = match
                
                # Check if US format is explicitly indicated
                if int(first) <= 12 and int(second) <= 31 and f"{first}/{second}" in text:
                    # Check for American format indicators
                    if text_lower.find("us") != -1 or text_lower.find("america") != -1:
                        month, day = first, second
                    # If day is > 12, it must be DD/MM
                    elif int(second) > 12:
                        day, month = second, first
                    # Common format MM/DD for dates like 03/14 (March 14)
                    elif int(first) <= 12 and int(second) <= 12:
                        if f"{second}/{first}" in text:
                            day, month = second, first
                        else:
                            month, day = first, second
                    else:
                        day, month = second, first
                else:
                    # Default to international format (DD/MM)
                    day, month = first, second

            elif pattern == patterns[3]:  # YYYY/MM/DD
                year, month, day = match
                
            elif pattern == patterns[4]:  # MM-DD-YYYY or DD-MM-YYYY
                first, second, year = match
                # Assume DD-MM format for most international contexts
                if int(first) > 12:
                    day, month = first, second
                elif int(second) > 12:
                    month, day = first, second
                else:
                    # Default to DD-MM format for ambiguous cases
                    day, month = first, second

            elif pattern == patterns[5]:  # YYYY-MM-DD
                year, month, day = match
                
            elif pattern == patterns[6]:  # DD.MM.YYYY or MM.DD.YYYY
                first, second, year = match
                # Most common format for dot notation is DD.MM.YYYY
                if int(first) > 12 or int(second) > 12:
                    if int(first) <= 31 and int(second) <= 12:
                        day, month = first, second
                    else:
                        month, day = first, second
                else:
                    # Default to DD.MM for ambiguous cases
                    day, month = first, second
                
            elif pattern == patterns[7]:  # YYYY.MM.DD
                year, month, day = match

            # Handle two-digit year
            if len(year) == 2:
                year = '20' + year if int(year) < 50 else '19' + year

            # Ensure day and month are properly formatted
            try:
                day = int(day)
                month = int(month)
                
                # Validate day and month ranges
                if month > 12:
                    month, day = day, month  # Swap if month is out of range
                
                if day > 31:
                    day = 31  # Cap at maximum days
                
                return f"{day:02d}/{month:02d}/{year}"
            except ValueError:
                # If conversion fails, try to use the original values
                return f"{int(day):02d}/{int(month):02d}/{year}"

    # Check for standalone month name followed by day and year
    month_day_year = re.search(r'(?i)(jan|feb|mar|apr|may|jun|jul|aug|sep|sept|oct|nov|dec)[a-z]*\s+(\d{1,2})(?:st|nd|rd|th)?(?:\s*,?\s*|\s+)(\d{4}|\d{2})', text)
    if month_day_year:
        month, day, year = month_day_year.groups()
        # Find the full month name
        for full_month, abbr in month_mapping.items():
            if full_month.startswith(month.lower()):
                month = abbr
                break
        
        # Handle two-digit year
        if len(year) == 2:
            year = '20' + year if int(year) < 50 else '19' + year
            
        return f"{int(day):02d}/{int(month):02d}/{year}"

    return None

# Example usage
example_texts = [
    "I went to London on 23rd April, 2004",
    "The event is scheduled for 5th January, 2023",
    "My birthday is on 1st March, 1990",
    "We met on 12th December, 2015",
    "No date here!",
    "The deadline is 02/15/2021",
    "She was born on 07/03/1985",
    "The meeting is on 09-30-2020",
    "We celebrated on 12-25-2019",
    "Let's catch up on 02.04.2022.",
    "The project started on 5/6/19.",
    "He was born on 1987/11/23.",
    "Christmas is on 25th Dec 2024.",
    "The exam date is 2021.11.10.",
    "The workshop is on February 15th, 2022."
]

for text in example_texts:
    parsed_date = parse_date(text)
    print(f"Input: {text}\nParsed Date: {parsed_date}\n")

Input: I went to London on 23rd April, 2004
Parsed Date: 23/04/2004

Input: The event is scheduled for 5th January, 2023
Parsed Date: 05/01/2023

Input: My birthday is on 1st March, 1990
Parsed Date: 01/03/1990

Input: We met on 12th December, 2015
Parsed Date: 12/12/2015

Input: No date here!
Parsed Date: None

Input: The deadline is 02/15/2021
Parsed Date: 15/02/2021

Input: She was born on 07/03/1985
Parsed Date: 03/07/1985

Input: The meeting is on 09-30-2020
Parsed Date: 30/09/2020

Input: We celebrated on 12-25-2019
Parsed Date: 25/12/2019

Input: Let's catch up on 02.04.2022.
Parsed Date: 02/04/2022

Input: The project started on 5/6/19.
Parsed Date: 06/05/2019

Input: He was born on 1987/11/23.
Parsed Date: 31/11/2023

Input: Christmas is on 25th Dec 2024.
Parsed Date: 25/12/2024

Input: The exam date is 2021.11.10.
Parsed Date: 21/11/2010

Input: The workshop is on February 15th, 2022.
Parsed Date: 15/02/2022



In [6]:
# Test the date parser with cases from CSV file
test_cases = [
    ("The event will take place on March 5, 2023.", "05/03/2023"),
    ("Her birthday is on 07/08/1990.", "07/08/1990"),
    ("The deadline is 2022-12-31.", "31/12/2022"),
    ("We met on 1st of January 2000.", "01/01/2000"),
    ("The concert is scheduled for 15th September, 2021.", "15/09/2021"),
    ("Let's catch up on 02.04.2022.", "02/04/2022"),
    ("The project started on 5/6/19.", "05/06/2019"),
    ("He was born on 1987/11/23.", "23/11/1987"),
    ("Christmas is on 25th Dec 2024.", "25/12/2024"),
    ("The meeting is set for April 03, 2020.", "03/04/2020"),
    ("Her birthdate, noted as 1997-05-20, is in the records.", "20/05/1997"),
    ("Her appointment is on the 2nd of March, 2021.", "02/03/2021"),
    ("The exam date is 2021.11.10.", "10/11/2021"),
    ("They got married on 12/12/12.", "12/12/2012"),
    ("The workshop is on February 15th, 2022.", "15/02/2022"),
    ("Submit your report by 08/31/2021.", "31/08/2021"),
    ("The course starts on 1st July 2023.", "01/07/2023"),
    ("Independence Day is on 4th of July, 2022.", "04/07/2022"),
    ("His birthday is 1995/10/30.", "30/10/1995"),
    ("The new year begins on 01-01-2023.", "01/01/2023"),
    ("The seminar is on 03/14/2022.", "14/03/2022"),
    ("My last day is 31.08.2020.", "31/08/2020"),
    ("The due date is 2020-02-28.", "28/02/2020"),
    ("The holiday starts on Dec 20th, 2021.", "20/12/2021"),
    ("The conference will be held on 5th May 2023.", "05/05/2023"),
    ("They moved in on 12/25/2019.", "25/12/2019"),
    ("The festival begins on March 17, 2022.", "17/03/2022"),
    ("The ceremony is on 11.11.2021.", "11/11/2021"),
    ("The event is on 2023/07/04.", "04/07/2023"),
    ("Her graduation is on May 30th, 2022.", "30/05/2022"),
    ("The release date is 2021-09-09.", "09/09/2021"),
    ("The interview is on 1/2/2022.", "01/02/2022"),
    ("The celebration is on 2022-10-10.", "10/10/2022"),
    ("His wedding is on 6th of August, 2020.", "06/08/2020"),
    ("She was born on 3rd March 1998.", "03/03/1998"),
    ("The opening is on 10/10/2018.", "10/10/2018"),
    ("The deadline is 2020.12.15.", "15/12/2020"),
    ("The party is on 31-12-2022.", "31/12/2022"),
    ("The workshop is on February 29, 2024.", "29/02/2024"),
    ("Vacation starts on 07/15/2021.", "15/07/2021"),
    ("The application is due by 2022-03-03.", "03/03/2022"),
    ("The tournament is on June 1st, 2021.", "01/06/2021"),
    ("The closing date is 08/08/2020.", "08/08/2020"),
    ("The concert is on 2020/09/09.", "09/09/2020"),
    ("The exam is on 01.01.2022.", "01/01/2022"),
    ("Independence Day is 2023-07-04.", "04/07/2023"),
    ("The last date is 30th November 2022.", "30/11/2022"),
    ("The conference is on 15th October 2023.", "15/10/2023"),
    ("His birthdate is 1990-05-20.", "20/05/1990"),
    ("The festival is on 12th August 2024.", "12/08/2024")
]

# Calculate accuracy and display predicted vs actual
correct = 0
for text, expected in test_cases:
    result = parse_date(text)
    print(f"Input: {text}")
    print(f"Predicted: {result}")
    print(f"Expected: {expected}")
    print()
    if result == expected:
        correct += 1

accuracy = (correct / len(test_cases)) * 100
print(f"Accuracy: {accuracy}%")

Input: The event will take place on March 5, 2023.
Predicted: 05/03/2023
Expected: 05/03/2023

Input: Her birthday is on 07/08/1990.
Predicted: 08/07/1990
Expected: 07/08/1990

Input: The deadline is 2022-12-31.
Predicted: 22/12/2031
Expected: 31/12/2022

Input: We met on 1st of January 2000.
Predicted: 01/01/2000
Expected: 01/01/2000

Input: The concert is scheduled for 15th September, 2021.
Predicted: 15/09/2021
Expected: 15/09/2021

Input: Let's catch up on 02.04.2022.
Predicted: 02/04/2022
Expected: 02/04/2022

Input: The project started on 5/6/19.
Predicted: 06/05/2019
Expected: 05/06/2019

Input: He was born on 1987/11/23.
Predicted: 31/11/2023
Expected: 23/11/1987

Input: Christmas is on 25th Dec 2024.
Predicted: 25/12/2024
Expected: 25/12/2024

Input: The meeting is set for April 03, 2020.
Predicted: 03/04/2020
Expected: 03/04/2020

Input: Her birthdate, noted as 1997-05-20, is in the records.
Predicted: 31/05/2020
Expected: 20/05/1997

Input: Her appointment is on the 2nd of M