# Date Mining Assignment
#### Author: Emmanuel Sedicol

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display, Markdown, Latex
pd.set_option('max_colwidth', 1000)

import re, os, datetime
import textblob

DEBUG = False

## Import Data

In [2]:
# Remote folder that holds the assignment data files.
URL = "https://kmurphy.bitbucket.io/modules/Data_Mining_2/topics/08-Text_Mining/20-Mining_Dates/files/"

# Download each data file once; later runs reuse the local copy.
for filename, dest in [("public.csv", "src")]:

    # URL already ends with "/", so strip it before joining to avoid a "//" in the request path.
    source = f"{URL.rstrip('/')}/{filename}"
    target = f"{dest}/{filename}"

    if not os.path.isfile(target):
        print (f"Downloading remote file {filename} to folder {dest}", sep="")
        import urllib.request
        # urlretrieve does not create missing folders, so make sure the destination exists.
        os.makedirs(dest, exist_ok=True)
        urllib.request.urlretrieve(source, target)
    else:
        print(f"Using local copy of {filename} in folder {dest}")

df = pd.read_csv("src/public.csv")
print(df.shape)

Using local copy of public.csv in folder src
(715, 2)


## Regular Expressions

#### Possible date formats after data cleaning
- 04/02/2009 (day, month, year)
- Mar 2 2009 (month, day, year)
- Feb 2009 (_, month, year)
- 2009 (_, _, year)

In [3]:
# quick access to delimiters
# (raw string: "\-" and "\s" are regex escapes, not valid Python string escapes)
DELIMS = r"[/\-\s]"

# valid days range is from 1 to 31
# NOTE: contains a top-level "|", so always wrap it in parentheses when embedding it.
DAYS_RANGE = r"(3[01])|([012]?\d)"

# valid months range is from 1 to 12
MONTHS_NUM = r"1[012]|0?[1-9]"
MONTHS_TEXT = r"january[\s\-/]?|february[\s\-/]?|march[\s\-/]?|april[\s\-/]?|may[\s\-/]?|june[\s\-/]?|july[\s\-/]?|august[\s\-/]?|september[\s\-/]?|october[\s\-/]?|(no)v?ember[\s\-/]?|december[\s\-/]?|jan[\s\-/]?|feb[\s\-/]?|mar[\s\-/]?|apr[\s\-/]?|jun[\s\-/]?|jul[\s\-/]?|aug[\s\-/]?|sept[\s\-/]?|oct[\s\-/]?|nov[\s\-/]?|dec[\s\-/]?"
MONTHS_RANGE = f"({MONTHS_NUM})|({MONTHS_TEXT})"

# valid years: 1920-1999 and 2000-2019 written in full (YYYY), or any two-digit year (YY)
YEARS_RANGE = r"19([2-9][0-9])|20([01][0-9])|([2-9][0-9])|([01][0-9])"

# possible date formats
DD_MM_YYYY = f"({DAYS_RANGE})({DELIMS})({MONTHS_RANGE})({DELIMS})({YEARS_RANGE})"
MM_DD_YYYY = f"({MONTHS_RANGE})({DELIMS})({DAYS_RANGE})({DELIMS})({YEARS_RANGE})"
MM_YYYY = f"({MONTHS_RANGE})({DELIMS})({YEARS_RANGE})"
# day + year. Built from the shared pieces with the day alternation parenthesised:
# the previous hand-written pattern parsed as "\s(3[01])" OR "<day 0-29>\s<year>",
# so days 30/31 never matched together with their year.
DD_YY = rf"({DAYS_RANGE})\s({YEARS_RANGE})"
YYYY = f"({YEARS_RANGE})"

# Alternatives are tried in this order at every position: most specific format first.
FULL_FORMAT = re.compile(f"({DD_MM_YYYY})|({MM_DD_YYYY})|({MM_YYYY})|({DD_YY})|({YYYY})")

## Cleaning Step

As a result of this step, the dataframe will have columns

 * __Code__ unchanged
 * __Raw__ unchanged
 * __Data__ cleaned version of __Raw__
 * __Iter__ number of regex that matched this row (for development purposes)
 * __Match__ regex object result for this row (for development purposes, you might not use)
 * __Day__ day (or zero if not set)
 * __Month__ month (or zero if not set)
 * __Year__ year (or zero if not set)

In [4]:
# Maps month abbreviations (plus two misspellings of "august") to the full month name.
month_dict = {
    "jan":"january",
    "feb":"february",
    "mar":"march",
    "apr":"april",
    "jun":"june",
    "jul":"july",
    "aug":"august",
    "agust":"august",
    "auust":"august",
    "sep":"september",
    "sept":"september",
    "oct":"october" ,
    "nov":"november" ,
    "dec":"december" 
   
}

# One alternative per month_dict key, each required to end at a word boundary.
# Compiled once here instead of on every call.
_MONTH_ABBREV = re.compile(r"jan\b|feb\b|mar\b|apr\b|jun\b|jul\b|aug\b|agust\b|auust\b|sep\b|sept\b|oct\b|nov\b|dec\b")

# Complete incomplete months e.g. jan => january
def complete_month(txt):
    """Expand every abbreviated month in ``txt`` to its full name.

    Only the text actually matched by the abbreviation pattern is replaced, so
    a full name elsewhere in the string is left alone ("jan ... january" no
    longer becomes "january ... januaryuary"), and every abbreviation in the
    text is expanded rather than just the first kind found.

    Parameters
    ----------
    txt : str
        Lower-case text to scan.

    Returns
    -------
    str
        ``txt`` with each matched abbreviation replaced through ``month_dict``;
        ``txt`` unchanged when nothing matches.
    """
    return _MONTH_ABBREV.sub(lambda found: month_dict[found.group()], txt)


# "<day> <month name> <year>" separated by single whitespace characters.
# Raw f-string so that "\s" reaches the regex engine instead of being an invalid string escape.
FORMAT_PATTERN = rf"({DAYS_RANGE})\s({MONTHS_TEXT})\s({YEARS_RANGE})"
# format date order e.g. DD, MM, YYYY => MM, DD, YYYY
def format_date_layout(txt):
    """Rewrite a "day month year" date inside ``txt`` as "month day year".

    The first span matching ``FORMAT_PATTERN`` is located; every literal
    occurrence of that span in ``txt`` is then replaced by the reordered form.

    Parameters
    ----------
    txt : str
        Cleaned text that may contain a date written day-first.

    Returns
    -------
    str
        ``txt`` with the date reordered, or ``txt`` unchanged when no
        day-month-year date is found.
    """
    match = re.search(FORMAT_PATTERN, txt)
    if match is None:
        return txt

    found = match.group().strip()
    data = found.split()
    reordered = f"{data[1]} {data[0]} {data[2]}"
    # Plain string replacement: the matched text is data, not a regex pattern.
    return txt.replace(found, reordered)
        
# lower case text, complete spelling for all months, check spellings and removed unwanted characters
def clean_message(src):
    """Normalise one raw note into lower-case, space-separated text ready for date matching.

    The steps run in a fixed order and each one feeds the next: split
    lower/upper-case joins, lower-case, detach a stray letter glued to a month
    abbreviation, expand month abbreviations, drop ordinal suffixes, blank out
    punctuation, separate digits from letters, drop single letters, collapse
    spaces, drop a stray leading number in front of a full date, spell-check
    with textblob, expand months again and finally reorder "day month year"
    to "month day year".

    Parameters
    ----------
    src : str
        Raw text of one record.

    Returns
    -------
    str
        The cleaned text, stripped of leading/trailing whitespace.
    """
    # separate attached strings to upper case letters e.g. 'yAug' => 'y Aug'
    clean_step1 = re.sub(r'([a-z]?[a-z])([A-Z]?[A-Z])', r'\g<1> \g<2>' ,src)

    # lower-case all characters
    clean_step2 = clean_step1.lower()
    
    # remove any remaining attached small character to month name e.g. 'iaug' => 'aug'
    clean_step3 = re.sub(re.compile(r"([a-z]{1})(jan|feb|mar|apr|jun|jul|aug|sept|oct|nov|dec)\s"), r"\g<2> ", clean_step2)

    # first month correction e.g. aug => august
    month_correction1 = complete_month(clean_step3)

    # remove ordinal suffixes (st|th|rd|nd) e.g. 21st => 21
    # Restricted to suffixes that directly follow digits: the earlier pattern
    # accepted any word characters and so also cut letters out of ordinary
    # words (e.g. "august" => "augu", "health" => "heal").
    clean_step4 = re.sub(r"(\d+)(st|th|rd|nd)", r"\1", month_correction1)
    
    # remove any special characters and replace with space
    regex = re.compile(r"[/!?;@=#$%&*\+\-\"\'\:.,()\\]")
    clean_step5 = re.sub(regex, " ", clean_step4)

    # insert space between connected digit and word character e.g. '6june' => '6 june'
    clean_step6 = re.sub(r'([a-z]?[a-z])([0-9]?[0-9])', r'\g<1> \g<2>' ,clean_step5)
    clean_step7 = re.sub(r'([0-9]?[0-9])([a-z]?[a-z])', r'\g<1> \g<2>' ,clean_step6)

    # remove any word character that is on its own e.g. 'r test b word' => 'test word'
    regex = re.compile(r"\b\s?[a-z]\s?\b")
    clean_step8 = re.sub(regex, " ", clean_step7)

    # reduce triple spacing to one
    spacing_correction1 = clean_step8.replace("   ", " ")

    # reduce double spacing to one
    spacing_correction2 = spacing_correction1.replace("  ", " ")

    # remove excess digit e.g. '4 4 12 2009' => '4 12 2009'
    # (raw f-string so "\d" and "\s" are regex escapes, not invalid string escapes)
    regex = re.compile(rf"(\d?\d)(\s)({DD_MM_YYYY}|{MM_DD_YYYY})")
    clean_step9 = re.sub(regex, r"\g<3>", spacing_correction2)

    # spellchecker
    spell_checker = "".join(textblob.TextBlob(clean_step9.strip()).correct())

    # second month check (the spell checker may have produced new abbreviations)
    month_correction2 = complete_month(spell_checker)

    # final clean: fix date format to (MM, DD, YYYY)
    final_clean = format_date_layout(month_correction2)
    
    return final_clean.strip()

# Rebuild the cached clean dataset only on request (DEBUG = True).
if DEBUG:
    df["Data"] = df["Raw"].apply(clean_message)
    # Placeholder columns; they are filled in by later cells.
    df["Date"] = 0     
    df["Day"] = 0
    df["Month"] = 0
    df["Year"] = 0
    
    # index=False: otherwise the row index is saved too and comes back from
    # read_csv as a stray "Unnamed: 0" column.
    df.to_csv("src/clean_data.csv", index=False)

In [5]:
# From this point onwards the notebook works on the cached clean dataset.
clean_frame = pd.read_csv("src/clean_data.csv")
df = clean_frame

# Show a random handful of rows, then the overall dimensions.
display(df.sample(n=5))
print(df.shape)

Unnamed: 0.1,Unnamed: 0,Code,Raw,Data,Date,Day,Month,Year
346,346,5334083481,"September, 23, 09 CPT Code: 90791: No medical services",september 23 09 cut code 90791 no medical services,0,0,0,0
303,303,4827816954,"May, 25 71 Other Adult Mental Health Outcomes Scales Used:",may 25 71 or adult mental heal outcome scales used,0,0,0,0
639,639,9014743639,10/1978 Communication with referring physician?: Done,10 1978 communication i referring physician done,0,0,0,0
339,339,5260883858,"sOne week Memorial Psychiatric Hospital Oct 2014, feels triggered by distress with mother as had stopped and felt depressed 2 months with sucidal ideation. Does not have records but responded well and told PTSD main diagnosis. Depression lifetd with treatment",one week memorial psychiatric hospital october 2014 feels trigger by dress i more as had stopped felt depressed 2 mon i suicidal creation does not have record but respond well told its main diagnosis depression lifted i treatment,0,0,0,0
117,117,2424740239,". Other collateral, noted in a July 1975 note documenting patient's CM at Johnson Hospital, reports patient has history of hoarding behavior and chronic delusions with ex-husband as focus, and recurring conviction that men are coming into her home and stealing things, writing things on her belongings, etc. Patient apparently has long history of seeing psychiatrists and therapists but none for very long. Hx of Outpatient Treatment: Yes",or collateral noted in july 1975 note documenting patient cm at johnson hospital reports patient has history of hoping behavior chronic delusions i ex husband as focus recurring conviction that men are coming into her home stealing things writing things on her belongings etc patient apparently has long history of seeing psychiatric therapist but none for very long he of outpatient treatment yes,0,0,0,0


(715, 8)


## Helper Function

- Find Match Function: return a list of all matches
 
- Transform Month Function: iterate through input text and transform month onto its numerical value

- Transform Year Function: add "19" or "20" to start of year

- Populate Missing Values Function: insert default values for missing date parts e.g. 23/2019 => 01/23/2019

In [6]:
def transform_month(month):
    """Replace a month name in ``month`` by its number and join the parts with "/".

    The first month name found via ``MONTHS_TEXT`` is expanded with
    ``complete_month`` and parsed with ``strptime("%B")`` to get its number.
    Text without a month name is passed through. In both cases every space in
    the result is turned into "/".
    """
    found = re.search(MONTHS_TEXT, month)

    if found is None:
        # nothing to convert
        converted = month
    else:
        name = found.group().strip()
        # month name -> month number (1-12)
        number = datetime.datetime.strptime(complete_month(name), "%B").month
        converted = re.sub(name, str(number), month)

    # numerical date parts separated by '/'
    return converted.replace(" ", "/")


def transform_year(txt):
    """Expand a two-digit year at the end of a "/"-separated date to four digits.

    Years starting with 2-9 are read as 19YY and years starting with 0 or 1 as
    20YY (e.g. "95" => "1995", "09" => "2009"). Anything whose last part is not
    exactly two digits is returned untouched; previously a three-character
    part was run through the same regex rewrite and came out mangled
    (e.g. "200" => "220200").

    Parameters
    ----------
    txt : str
        Date text whose parts are separated by "/"; the year is the last part.

    Returns
    -------
    str
        The date with a four-digit year, or ``txt`` unchanged.
    """
    parts = txt.split("/")
    year = parts[-1]

    # Only a bare two-digit year needs a century.
    if re.fullmatch(r"[0-9]{2}", year) is None:
        return txt

    century = "20" if year[0] in "01" else "19"
    parts[-1] = century + year
    return "/".join(parts)

def populate_missing_values(txt):
    """Pad a partial "/"-separated date with "01" for the parts that are missing.

    * one part    -> treated as a year:        "01/01/<year>"
    * two parts   -> first part above 12 is a day:   "01/<day>/<year>"
                     otherwise it is a month:        "<month>/01/<year>"
    * anything else is returned unchanged.
    """
    parts = txt.split("/")

    if len(parts) == 1:
        return "01/01/" + txt

    if len(parts) != 2:
        return txt

    first, last = parts
    # A leading number above 12 cannot be a month, so it must be the day.
    if int(first) > 12:
        return "01/" + txt
    return first + "/01/" + last

# Change any text dates into numerical values
def find_match(pattern):
    """Extract one normalised date string per row of ``df["Data"]``.

    For each row, every match of ``pattern`` is collected and the longest one
    is kept. It is then passed through ``transform_month``, ``transform_year``
    and ``populate_missing_values``.

    Parameters
    ----------
    pattern : str or compiled regex
        Date pattern to search for.

    Returns
    -------
    list of str
        One entry per row, in the frame's current row order. Rows without any
        match get the placeholder string "None".
    """
    regex = re.compile(pattern)  # compile once, not once per row
    match_array = []

    # Iterate positionally so the returned list lines up with the rows even if
    # the frame has been sorted or its index is no longer 0..n-1.
    for text in df["Data"]:
        # Non-string cells (e.g. a missing value) cannot contain a date.
        candidates = [m[0] for m in regex.finditer(text)] if isinstance(text, str) else []

        # Check for "no match" BEFORE calling max(): max() of an empty list raises ValueError.
        if not candidates:
            match_array.append("None")
            continue

        # retrieve longest match value
        result = max(candidates, key=len).strip()

        month_transform = transform_month(result)
        year_transform = transform_year(month_transform)
        match_array.append(populate_missing_values(year_transform))

    return match_array

df["Date"] = find_match(FULL_FORMAT)

In [7]:
# function to slice match values in order to access month, day and year values
def slice_match(row, section):
    """Return one "/"-separated part of a date string.

    Parameters
    ----------
    row : str
        Date text such as "4/11/1990"; a doubled "//" is treated as a single
        separator. The placeholder "None" marks a row without a date.
    section : int
        Index of the wanted part after splitting on "/".

    Returns
    -------
    str
        The requested part, or "0" when the row is the "None" placeholder or
        has no part at that index (previously an IndexError).
    """
    if row == "None":
        return "0"

    data_split = row.replace('//', '/').split('/')

    try:
        return data_split[section]
    except IndexError:
        # fewer parts than expected: report the field as "not set"
        return "0"

# mapping values: the Date string is laid out as month/day/year,
# so each column reads its own position of the split date
for column, position in (("Day", 1), ("Month", 0), ("Year", 2)):
    df[column] = df.Date.apply(slice_match, args=(position,))

In [8]:
# Order the rows by extracted date (then raw text) and preview a random handful.
sort_keys = ['Date', 'Raw']
df = df.sort_values(by=sort_keys)
df.loc[:, ["Date", "Raw", "Data"]].sample(n=5)

Unnamed: 0,Date,Raw,Data
5,01/01/2009,)and 8mo in 2009,8 mo in 2009
602,4/11/1990,"April 11, 1990 CPT Code: 90791: No medical services",april 11 1990 cut code 90791 no medical services
87,01/01/1998,"sSince 1998. Prior medication trials (including efficacy, reasons discontinued):",since 1998 prior meditation trials including efficacy reasons discontinued
432,4/30/1995,")- Venlafaxine 37.5mg daily: April, 30 95: self-discontinued due to side effects (dizziness)",venlafaxine 37 5 my daily april 30 95 self discontinued due to side effects dizziness
6,7/01/1977,")HTN, hypercholesterolemia, DM, sleep apnea,, nephrolithiasis. chronic renal impairment, DVT since July 1977 on enoxaparin.",hen hypercholeerolemia do sleep anna nephroliiasis chronic renal impairment dot since july 1977 on enoxaparin


In [9]:
# Write the results and bundle them with the notebook for submission.
df.to_csv("results.csv", index=False)
from zipfile import ZipFile
archive = "date_assignment.zip"
print(f"Creating archive: {archive}")
# The handle is named `bundle` rather than `zip` so the builtin zip() is not
# shadowed for the rest of the kernel session.
with ZipFile(archive,"w") as bundle:
    for f in ["01-Model.ipynb", "results.csv"]:
        if os.path.isfile(f):
            print(f"\t{f} - OK")
            bundle.write(f)
        else:
            print(f"\t{f} - Missing. Check this!")