# Date Mining Assignment
#### Author: Emmanuel Sedicol

In [25]:
import numpy as np
import pandas as pd
from IPython.display import display, Markdown, Latex
pd.set_option('max_colwidth', 1000)

import re, os, datetime
import textblob
import calendar

for d in ["src","data"]: os.makedirs(d, exist_ok=True)
    
DEBUG = False

## Import Data

In [26]:
URL = "https://kmurphy.bitbucket.io/modules/Data_Mining_2/topics/08-Text_Mining/20-Mining_Dates/files/"

for filename, dest in [("public.csv", "src")]:
    
    source = f"{URL}/{filename}"
    target = f"{dest}/{filename}"

    if not os.path.isfile(target):
        print (f"Downloading remote file {filename} to folder {dest}", sep="")
        import urllib.request
        urllib.request.urlretrieve(source, target)
    else:
        print(f"Using local copy of {filename} in folder {dest}")

df = pd.read_csv("src/public.csv")
print(df.shape)

Using local copy of public.csv in folder src
(715, 2)


## Regular Expressions

#### Posible date formats after data cleaning
- 04/02/2009 (day, month, year)
- Mar 2 2009 (month, day, year)
- Feb 2009 (_, month, year)
- 2009 (_, _, year)

In [27]:
# quick access to delimeters
DELIMS = "[/\-\s]"

# valid days range is from 1 to 31
DAYS_RANGE = r"(3[01])|([012]?\d)"

# valid months range is from 1 to 12
MONTHS_NUM = r"1[012]|0?[1-9]"
MONTHS_TEXT = r"january[\s\-/]?|february[\s\-/]?|march[\s\-/]?|april[\s\-/]?|may[\s\-/]?|june[\s\-/]?|july[\s\-/]?|august[\s\-/]?|september[\s\-/]?|october[\s\-/]?|(no)v?ember[\s\-/]?|december[\s\-/]?|jan[\s\-/]?|feb[\s\-/]?|mar[\s\-/]?|apr[\s\-/]?|jun[\s\-/]?|jul[\s\-/]?|aug[\s\-/]?|sept[\s\-/]?|oct[\s\-/]?|nov[\s\-/]?|dec[\s\-/]?"
MONTHS_RANGE = f"({MONTHS_NUM})|({MONTHS_TEXT})"

# valid years range is from 1920 to (YYYY or last two digit of year YY)
YEARS_RANGE = r"19([2-9][0-9])|20([01][0-9])|([2-9][0-9])|([01][0-9])"

# posible date formats
DD_MM_YYYY = f"({DAYS_RANGE})({DELIMS})({MONTHS_RANGE})({DELIMS})({YEARS_RANGE})"
MM_DD_YYYY = f"({MONTHS_RANGE})({DELIMS})({DAYS_RANGE})({DELIMS})({YEARS_RANGE})"
MM_YYYY = f"({MONTHS_RANGE})({DELIMS})({YEARS_RANGE})"
DD_YY = r"\s(3[01])|([012]?\d)\s(19([2-9][0-9])|20([01][0-9])|([2-9][0-9])|([01][0-9]))"
YYYY = f"({YEARS_RANGE})"

FULL_FORMAT = re.compile(f"({DD_MM_YYYY})|({MM_DD_YYYY})|({MM_YYYY})|({DD_YY})|({YYYY})")

## Cleaning Step

As a result of this step, the dataframe will have columns

 * __Code__ unchanged
 * __Raw__ unchanged
 * __Data__ cleaned version of __Raw__
 * __Iter__ number of regex that matched this row (for development purposes)
 * __Match__ regex object result for this row (for development purposes, you might not use)
 * __Day__ day (or zero if not set)
 * __Month__ month (or zero if not set)
 * __Year__ year (or zero if not set)

In [28]:
month_dict = {
    "jan":"january",
    "feb":"february",
    "mar":"march",
    "apr":"april",
    "jun":"june",
    "jul":"july",
    "aug":"august",
    "agust":"august",
    "auust":"august",
    "sep":"september",
    "sept":"september",
    "oct":"october" ,
    "nov":"november" ,
    "dec":"december" 
   
}

# Complete incomplete months e.g. jan => january
def complete_month(txt):
    regex = re.compile(r"jan\b|feb\b|mar\b|apr\b|jun\b|jul\b|aug\b|agust\b|auust\b|sep\b|sept\b|oct\b|nov\b|dec\b")
    match = re.search(regex, txt)
    
    if match is not None:
        target = match.group().strip()
        result = re.sub(re.compile(target), month_dict[target],txt)
    else:
        result = txt
        
    return result


FORMAT_PATTERN = f"({DAYS_RANGE})\s({MONTHS_TEXT})\s({YEARS_RANGE})"
# fomate date order e.g. DD, MM, YYYY => MM, DD, YYYY
def format_date_layout(txt):
    match =  re.search(FORMAT_PATTERN, txt)
    # if match found
    if match is not None: 
        data = str((match.group())).split()
        result = re.sub(match.group().strip(), f"{data[1]} {data[0]} {data[2]}", txt)
    else:
        result = txt
    return result
        
# lower case text, complete spelling for all months, check spellings and removed unwanted characters
def clean_message(src):
     # seperate attached strings to upper case letters e.g. 'yAug' => 'y Aug'
    clean_step1 = re.sub(r'([a-z]?[a-z])([A-Z]?[A-Z])', r'\g<1> \g<2>' ,src)

    # low caps all characters
    clean_step2 = clean_step1.lower()

    # first month correction e.g. aug => august
    month_correction1 = complete_month(clean_step2)

    # remove all of (st|th|rd|nd) e.g. 21st => 21
    clean_step3 = re.sub(re.compile(r"([\w\d]+)(st|th|rd|nd)"), r"\1", month_correction1)

    # remove all of '.', '?', '!', ')', '*' 
    clean_step4 = re.sub(re.compile(r"([\(\))*?!:~;]?)([\w\d]+)([\(\))*?!:~;]?)"), r"\2", clean_step3)

    # remove any ['-', '/'] and replace with space
    clean_step5 = re.sub(re.compile(r"[.\-/,]"), " ", clean_step4)

    # insert space bewteen connected digit and word character e.g. '6june' => '6 june'
    clean_step6 = re.sub(r'([a-z]?[a-z])([0-9]?[0-9])', r'\g<1> \g<2>' ,clean_step5)
    clean_step7 = re.sub(r'([0-9]?[0-9])([a-z]?[a-z])', r'\g<1> \g<2>' ,clean_step6)

    # reduce triple spacing to one
    spacing_correction1 = clean_step7.replace("   ", " ")

    # reduce double spacing to one
    spacing_correction2 = spacing_correction1.replace("  ", " ")

    # spellchecker
    spell_checker = "".join(textblob.TextBlob(spacing_correction2.strip()).correct())

    # second month check
    month_correction2 = complete_month(spell_checker)

    # final clean: fix date format to (MM, DD, YYYY)
    final_clean = format_date_layout(month_correction2)
    
    return final_clean.strip()

if DEBUG:
    df["Data"] = df["Raw"].apply(lambda txt: clean_message(txt))      
    df["Date"] = 0     
    df["Day"] = 0
    df["Month"] = 0
    df["Year"] = 0
    
    df.to_csv("src/clean_data.csv")

In [29]:
# Use clean dataset from this point onwards
df = pd.read_csv("src/clean_data.csv")

display(df.sample(5))
print(df.shape)

Unnamed: 0.1,Unnamed: 0,Code,Raw,Data,Date,Day,Month,Year
147,147,2881341216,".Mother died, 1/1983.",more died 1 1983,0,0,0,0
582,582,8203172862,sS/P TAH/BSO : 7/2004 for endometrial hyperplasia,s s p ah so : 7 2004 for geometrical hyperplasia,0,0,0,0
578,578,8162526406,27 Agust 14 Communication with referring physician?: Done,august 27 14 communication i referring physician: done,0,0,0,0
294,294,4722176370,"7HH, April 1985 Hx of Outpatient Treatment:",7 he april 1985 he of outpatient treatment,0,0,0,0
614,614,8677583099,"none; but currently has appt with new HJH PCP Rachel Salas, MD on 7, Agust 93 Other Agency Involvement: No",none but currently has apt i new heh pp rachel salts md on august 7 93 or agency involvement no,0,0,0,0


(715, 8)


## Helper Function

- Find Match Function: return a list of all matches
 
- Transform Month Function: iterate through input text and transform month onto its numerical value

- Transform Year Function: add "19" or "20" to start of year

- Populate function to insert default values to missing dates e.g. 23/2019 => 01/23/2009

In [30]:
def transform_month(month):
    # search for months in text pattern
    match = re.search(re.compile(MONTHS_TEXT), month)
    
    # if match found
    if match is not None:  
        # change month to its numerical value
        month_num = datetime.datetime.strptime(complete_month(match.group().strip()), "%B").month
        result = re.sub(re.compile(match.group().strip()), str(month_num), month)
    else:
        result = month
    # return a formated numerical dates seperated by '-'
    return result.replace(" ", "/")


def transform_year(txt):
    result = ""
    data = txt.split("/")
    data_length = len(data) - 1
    
    if len(str(data[data_length])) >= 2 and len(str(data[data_length])) < 4:
        nineties_range = re.sub(r"([2-9][0-9])", "19" + data[data_length], data[data_length])
        twenties_range = re.sub(r"([01][0-9])", "20" + data[data_length], data[data_length])

        data[data_length] = nineties_range if len(nineties_range) == 4 else twenties_range
        result = "/".join(data)
    else:
        result = txt
        
    return result

def populate_missing_values(txt):
    result = ""
    data = txt.split("/")
    
    if len(data) == 2:
        result = "01/" + txt
    elif len(data) == 1:
        result = "01/01/" + txt
    else:
        result = txt
        
    return result

# Change any text dates into numerical values
def find_match(pattern):
    match_array = []
    
    for row in range(0, len(df)):
        longest_match = []
        match = re.finditer(re.compile(pattern), df["Data"].loc[row]) 
        for m in match:
            longest_match.append(m[0])

        # retirive longest match values
        result = max(longest_match, key=len).strip()
        
        month_transform = transform_month(result)
        year_transform = transform_year(month_transform)
        populate_missing_dates = populate_missing_values(year_transform)
        
        match_array.append(populate_missing_dates) if len(longest_match) > 0 else match_array.append("None")

    return match_array

df["Date"] = find_match(FULL_FORMAT)

In [31]:
# function to slice match values in order to access month, day and year values
def slice_match(row, section):
    data = row.replace('//', '/')
    data_split = data.split('/')

    return data_split[section]

# mapping values
df['Day'] = df.Date.apply(lambda txt: slice_match(txt, 1))  
df['Month'] = df.Date.apply(lambda txt: slice_match(txt, 0))  
df['Year'] = df.Date.apply(lambda txt: slice_match(txt, 2))  

In [33]:
df = df.sort_values(by=['Date','Raw'])
df[["Date", "Raw", "Data"]].sample(10)

Unnamed: 0,Date,Raw,Data
495,01/4/1974,Dr. Noland-pending 4/1974 Hx of Brain Injury: No,dr noma being 4 1974 he of brain injury no
362,4/8/1980,"8 April, 80= 1, negativeAudit C Score Current:",april 8 80= 1 negative audit c score current
269,12/15/1992,12/15/92 CPT Code: 90801 - Psychiatric Diagnosis Interview,12 15 92 cut code 90801 psychiatric diagnosis interview
546,01/12/2008,12/2008 Primary Care Doctor:,12 2008 primary care doctor
562,10/18/1999,18 Oct 1999 SOS-10 Total Score:,october 18 1999 so 10 total score
167,3/7/1986,3/7/86 SOS-10 Total Score:,3 7 86 so 10 total score
151,01/1/1994,".Since January 1994, she feels that hse has been much more irritable and frustrated towards her husband about 1 week prior to her menses. She does not have a h/o premenstrual mood symptoms. She notes that little things would set her off with him. She was not this way at work or in other social settings.",since january 1994 she feels that he has been much more irritable a frustrated towns her husband about 1 week prior to her senses she does not have a h o premenrual mood symptoms she notes that little things would set her off i him she was not this way at work or in or social settings
273,8/29/1992,"29, Agust 92 Primary Care Doctor:",august 29 92 primary care doctor
703,01/10/1980,n Abnormal CXR : 10/1980 ? scarring @ R heart border needs f/u CXR,n abnormal car : 10 1980 ? scarring @ r heart boer needs f u car
453,9/02/1976,9/02/76 CPT Code: 90791: No medical services,9 02 76 cut code 90791 no medical services


In [12]:
df.to_csv("results.csv", index=False)
from zipfile import ZipFile
archive = "date_assignment.zip"
print(f"Creating archive: {archive}")
with ZipFile(archive,"w") as zip:
    for f in ["01-Model.ipynb", "results.csv"]:
        if os.path.isfile(f):
            print(f"\t{f} - OK")
            zip.write(f) 
        else:
            print(f"\t{f} - Missing. Check this!")

Creating archive: date_assignment.zip
	01-Model.ipynb - OK
	results.csv - OK
