In [1]:
import re
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('C:/Users/Aaryan/Desktop/Random/Datasets/date_parser_testcases.csv')
df

Unnamed: 0,Input,Expected Output
0,"The event will take place on March 5, 2023.",05/03/2023
1,Her birthday is on 07/08/1990.,07/08/1990
2,The deadline is 2022-12-31.,31/12/2022
3,We met on 1st of January 2000.,01/01/2000
4,"The concert is scheduled for 15th September, 2...",15/09/2021
...,...,...
95,"We celebrate Independence Day on 2023-07-04, a...",04/07/2023
96,The final date for submission is 30th November...,30/11/2022
97,"The annual conference is on 15th October 2023,...",15/10/2023
98,"His birthdate, noted as 1990-05-20, is in the ...",20/05/1990


In [3]:
def extract_date(text):
    date_patterns = [
        r"(\d{1,2})(?:st|nd|rd|th)?(?: of)?\s+([A-Za-z]+),?\s+(\d{4})",  # e.g., 21st June, 2024 or 1st of January 2000
        r"(\d{4})-(\d{1,2})-(\d{1,2})",  # e.g., 2022-12-31
        r"(\d{1,2})/(\d{1,2})/(\d{4})"  # e.g., 07/08/1990
    ]

    month_names = {
        'January': '01', 'February': '02', 'March': '03', 'April': '04',
        'May': '05', 'June': '06', 'July': '07', 'August': '08',
        'September': '09', 'October': '10', 'November': '11', 'December': '12',

        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
        'Jun': '06', 'Jul': '07', 'Aug': '08',
        'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
    }

    for pattern in date_patterns:
        match = re.search(pattern, text)
        if match:
            if pattern == date_patterns[0]:  
                day, month_str, year = match.groups()
                month = month_names.get(month_str)
            elif pattern == date_patterns[1]:  
                year, month, day = match.groups()
            elif pattern == date_patterns[2]:  
                day, month, year = match.groups()

            # Format day and month as two-digit numbers
            day = day.zfill(2)
            month = month.zfill(2)

            return f"{day}/{month}/{year}"

    return "No valid date found"

In [4]:
df['Extracted Date'] = df['Input'].apply(extract_date)

In [6]:
print(df[['Expected Output', 'Extracted Date']])

   Expected Output       Extracted Date
0       05/03/2023  No valid date found
1       07/08/1990           07/08/1990
2       31/12/2022           31/12/2022
3       01/01/2000           01/01/2000
4       15/09/2021           15/09/2021
..             ...                  ...
95      04/07/2023           04/07/2023
96      30/11/2022           30/11/2022
97      15/10/2023           15/10/2023
98      20/05/1990           20/05/1990
99      12/08/2024           12/08/2024

[100 rows x 2 columns]


In [7]:
correct_matches = df['Extracted Date'] == df['Expected Output']
accuracy = correct_matches.mean() * 100

In [8]:
print(accuracy)

45.0


In [9]:
print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 45.00%


In [13]:
df.head(10)

Unnamed: 0,Input,Expected Output,Extracted Date
0,"The event will take place on March 5, 2023.",05/03/2023,No valid date found
1,Her birthday is on 07/08/1990.,07/08/1990,07/08/1990
2,The deadline is 2022-12-31.,31/12/2022,31/12/2022
3,We met on 1st of January 2000.,01/01/2000,01/01/2000
4,"The concert is scheduled for 15th September, 2...",15/09/2021,15/09/2021
5,Let's catch up on 02.04.2022.,02/04/2022,No valid date found
6,The project started on 5/6/19.,05/06/2019,No valid date found
7,He was born on 1987/11/23.,23/11/1987,No valid date found
8,Christmas is on 25th Dec 2024.,25/12/2024,25/12/2024
9,"The meeting is set for April 03, 2020.",03/04/2020,No valid date found


In [12]:
mismatches = df[~correct_matches]

print("\nMismatched Entries:")
print(mismatches[['Input', 'Expected Output', 'Extracted Date']])


Mismatched Entries:
                                                Input  Expected Output  \
0         The event will take place on March 5, 2023.       05/03/2023   
5                       Let's catch up on 02.04.2022.       02/04/2022   
6                      The project started on 5/6/19.       05/06/2019   
7                          He was born on 1987/11/23.       23/11/1987   
9              The meeting is set for April 03, 2020.       03/04/2020   
12                       The exam date is 2021.11.10.       10/11/2021   
13                      They got married on 12/12/12.       12/12/2012   
14            The workshop is on February 15th, 2022.       15/02/2022   
15                  Submit your report by 08/31/2021.       31/08/2021   
18                        His birthday is 1995/10/30.       30/10/1995   
19                 The new year begins on 01-01-2023.       01/01/2023   
20                      The seminar is on 03/14/2022.       14/03/2022   
21               