# Date Mining Assignment
#### Author: Emmanuel Sedicol

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display, Markdown, Latex
pd.set_option('max_colwidth', 1000)

import re, os, datetime
import textblob
import calendar

for d in ["src","data"]: os.makedirs(d, exist_ok=True)
    
DEBUG = True

## Import Data

In [2]:
URL = "https://kmurphy.bitbucket.io/modules/Data_Mining_2/topics/08-Text_Mining/20-Mining_Dates/files/"

for filename, dest in [("public.csv", "src")]:
    
    source = f"{URL}/{filename}"
    target = f"{dest}/{filename}"

    if not os.path.isfile(target):
        print (f"Downloading remote file {filename} to folder {dest}", sep="")
        import urllib.request
        urllib.request.urlretrieve(source, target)
    else:
        print(f"Using local copy of {filename} in folder {dest}")

df = pd.read_csv("src/public.csv")
print(df.shape)

Using local copy of public.csv in folder src
(715, 2)


## Regular Expressions

#### Posible date formats after data cleaning
- 04/02/2009 (day, month, year)
- Mar 2 2009 (month, day, year)
- Feb 2009 (_, month, year)
- 2009 (_, _, year)

In [3]:
# quick access to delimeters
DELIMS = "[/\-\s]"

# valid days range is from 1 to 31
DAYS_RANGE = r"(3[01])(st|th|rd|nd)?|([012]?\d)(st|th|rd|nd)?"

# valid months range is from 1 to 12
MONTHS_NUM = r"1[012]|0?[1-9]"
MONTHS_TEXT = r"january[\s\-/]?|february[\s\-/]?|march[\s\-/]?|april[\s\-/]?|may[\s\-/]?|june[\s\-/]?|july[\s\-/]?|august[\s\-/]?|september[\s\-/]?|october[\s\-/]?|(no)v?ember[\s\-/]?|december[\s\-/]?|jan[\s\-/]?|feb[\s\-/]?|mar[\s\-/]?|apr[\s\-/]?|jun[\s\-/]?|jul[\s\-/]?|aug[\s\-/]?|sept[\s\-/]?|oct[\s\-/]?|nov[\s\-/]?|dec[\s\-/]?"
MONTHS_RANGE = f"({MONTHS_NUM})|({MONTHS_TEXT})"

# valid years range is from 1920 to (YYYY or last two digit of year YY)
YEARS_RANGE = r"19([2-9][0-9])|20([01][0-9])|([2-9][0-9])|([01][0-9])"

# posible date formats
DD_MM_YYYY = f"({DAYS_RANGE})({DELIMS})({MONTHS_RANGE})({DELIMS})({YEARS_RANGE})"
MM_DD_YYYY = f"({MONTHS_RANGE})({DELIMS})({DAYS_RANGE})({DELIMS})({YEARS_RANGE})"
MM_YYYY = f"({MONTHS_RANGE})({DELIMS})({YEARS_RANGE})"
DD_MM = f"({DAYS_RANGE})({DELIMS})({MONTHS_RANGE})"
MM_DD = f"({MONTHS_RANGE})({DELIMS})({DAYS_RANGE})"
YYYY = f"({YEARS_RANGE})"

FULL_FORMAT = re.compile(f"({DD_MM_YYYY})|({MM_DD_YYYY})|({MM_YYYY})|({DD_MM})|({MM_DD})|({YYYY})")

## Cleaning Step

As a result of this step, the dataframe will have columns

 * __Code__ unchanged
 * __Raw__ unchanged
 * __Data__ cleaned version of __Raw__
 * __Iter__ number of regex that matched this row (for development purposes)
 * __Match__ regex object result for this row (for development purposes, you might not use)
 * __Day__ day (or zero if not set)
 * __Month__ month (or zero if not set)
 * __Year__ year (or zero if not set)

In [4]:
month_dict = {
    "jan":"january",
    "feb":"february",
    "mar":"march",
    "apr":"april",
    "jun":"june",
    "jul":"july",
    "aug":"august",
    "sept":"september",
    "oct":"october" ,
    "nov":"november" ,
    "dec":"december" 
}

# Complete incomplete months e.g. jan => january
def complete_month(txt):
    regex = re.compile(r"jan\b|feb\b|mar\b|apr\b|jun\b|jul\b|aug\b|sept\b|oct\b|nov\b|dec\b")
    match = re.search(regex, txt)
    
    if match is not None:
        target = match.group().strip()
        result = re.sub(re.compile(target), month_dict[target],txt)
    else:
        result = txt
        
    return result
        
# lower case text, complete spelling for all months, check spellings and removed unwanted characters
def clean_message(src):
    low_caps = src.lower()
    full_month = complete_month(low_caps)
    
    # remove all of (st|th|rd|nd) e.g. 21st => 21
    rm_1 = re.sub(re.compile(r"([\w\d]+)(st|th|rd|nd)"), r"\1", full_month)
    
    # remove all of '.', '?', '!', ')', '*' 
    rm_2 = re.sub(re.compile(r"([\(\))*?!:,~;]?)([\w\d]+)([\(\))*?!:,~;]?)"), r"\2", rm_1)
    
    # remove any ['-', '/'] and replace with space
    rm_3 = re.sub(re.compile(r"[.-/]"), " ", rm_2)
    
    # spellchecker
    super_clean = "".join(textblob.TextBlob(rm_3).correct())
    
    return super_clean

if DEBUG:
    df["Data"] = df["Raw"].apply(lambda txt: clean_message(txt))
    df["Iter"] = 0        
    df["Match"] = 0     
    df["Day"] = 0
    df["Month"] = 0
    df["Year"] = 0
    
    df.to_csv("src/clean_data.csv")

In [5]:
df = pd.read_csv("src/clean_data.csv")
display(df.head(3))
print(df.shape)

Unnamed: 0.1,Unnamed: 0,Code,Raw,Data,Iter,Match,Day,Month,Year
0,0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor, Mcalester.",12 november 16- bad reaction to spice - kinetic my- admitted to create manor career,0,0,0,0,0
1,1,1039370009,".April, 5 97: made a phone call to Mom and Mom commented that he was talking very fast, hard to interrupt, but was in super happy spirits, so didn't make a big deal of it.",april 5 97 made a phone call to mon a mon commented that he was talking very a ha to interrupt but was in super happy spirits so didn't make a big deal of it,0,0,0,0,0
2,2,1039574613,"A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue (s/p partial glossectomy and neck dissection in 8/1974) referred to psycho-oncology for assistance with adjustment issues following recovery. The patient does not meet criteria for a major mood or anxiety disorder. She is not at imminent risk of harm to self or others. She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement, and to think through how to continue to create a life for herself moving forward.",a pleasant 28 to woman i no formal psychiatric history a i a h o sicca of the right tongue s p partial glossectomy a neck dissection in 8 1974 referred to psychic-oncology for assistance i adjutant issues following recovery the patient does not meet criterion for a major mood or anxiety dinner she is not at imminent risk of harm to self or ours she would benefit from psychoerapy to help her integrate her experience of cancer a the break-up of her engagement a to think through how to continue to create a life for herself moving for,0,0,0,0,0


(715, 9)


## Helper Function

- Find Match Function: return a list of all matches
 
- Transform Month Function: iterate through input text and transform month onto its numerical value

In [6]:
# Change any text dates into numerical values
def find_match(pattern):
    match_array = []
    
    for row in range(0, len(df)):
        longest_match = []
        match = re.finditer(re.compile(FULL_FORMAT), df["Data"].loc[row]) 
        for m in match:
            longest_match.append(m[0])

        result = max(longest_match, key=len).strip()
        match_array.append(transform_month(result)) if len(longest_match) > 0 else match_array.append("None")

    return match_array
             
def transform_month(month):
    # search for months in text pattern
    match = re.search(re.compile(MONTHS_TEXT), month)
    
    # if match found
    if match is not None:  
        # change month to its numerical value
        month_num = datetime.datetime.strptime(complete_month(match.group().strip()), "%B").month
        result = re.sub(re.compile(match.group().strip()), str(month_num), month)
    else:
        result = month
    # return a formated numerical dates seperated by '-'
    return result.replace(" ", "-")


def transform_year(txt):
    result = ""
    data = txt.split("-")
    data_length = len(data) - 1
    
    if len(str(data[data_length])) >= 2 and len(str(data[data_length])) < 4:
        data = txt.split("-")
        data_length = len(data) - 1

        nineties_range = re.sub(r"([2-9][0-9])", "19" + data[data_length], data[data_length])
        twenties_range = re.sub(r"([01][0-9])", "20" + data[data_length], data[data_length])

        data[data_length] = nineties_range if len(nineties_range) == 4 else twenties_range
        result = "-".join(data)
    else:
        result = txt
        
    return result
df["Match"] = find_match(FULL_FORMAT)
df["Match2"] = df.Match.apply(lambda year: transform_year(year)) 
# df["Month"] = df.Month_text.apply(lambda month: transform_month((complete_month(month)))) 
    
df.head(18)    

Unnamed: 0.1,Unnamed: 0,Code,Raw,Data,Iter,Match,Day,Month,Year,Match2
0,0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor, Mcalester.",12 november 16- bad reaction to spice - kinetic my- admitted to create manor career,0,12-11-16,0,0,0,12-11-2016
1,1,1039370009,".April, 5 97: made a phone call to Mom and Mom commented that he was talking very fast, hard to interrupt, but was in super happy spirits, so didn't make a big deal of it.",april 5 97 made a phone call to mon a mon commented that he was talking very a ha to interrupt but was in super happy spirits so didn't make a big deal of it,0,4-5-97,0,0,0,4-5-1997
2,2,1039574613,"A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue (s/p partial glossectomy and neck dissection in 8/1974) referred to psycho-oncology for assistance with adjustment issues following recovery. The patient does not meet criteria for a major mood or anxiety disorder. She is not at imminent risk of harm to self or others. She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement, and to think through how to continue to create a life for herself moving forward.",a pleasant 28 to woman i no formal psychiatric history a i a h o sicca of the right tongue s p partial glossectomy a neck dissection in 8 1974 referred to psychic-oncology for assistance i adjutant issues following recovery the patient does not meet criterion for a major mood or anxiety dinner she is not at imminent risk of harm to self or ours she would benefit from psychoerapy to help her integrate her experience of cancer a the break-up of her engagement a to think through how to continue to create a life for herself moving for,0,8-1974,0,0,0,8-1974
3,3,1039963589,"October 7, 01 [report_end]",october 7 01 [report],0,10-7-01,0,0,0,10-7-2001
4,4,1048901075,"July, 4, 01 Primary Care Doctor:",july 4 01 primary care doctor,0,7-4-01,0,0,0,7-4-2001
5,5,1054311047,)and 8mo in 2009,a mo in 2009,0,2009,0,0,0,2009
6,6,1054668034,")HTN, hypercholesterolemia, DM, sleep apnea,, nephrolithiasis. chronic renal impairment, DVT since July 1977 on enoxaparin.","hen hypercholeerolemia do sleep anna, nephroliiasis chronic renal impairment dot since july 1977 on enoxaparin",0,7-1977,0,0,0,7-1977
7,7,1082469285,"Septeber, 10, 70 CPT Code: 90792: With medical services",september 10 70 cut code 90792 i medical services,0,9-10-70,0,0,0,9-10-1970
8,8,1125769793,"Since 10/2014: Fatigued, more forgetful, impaired dexterity on her left hand. MRI reveals an approximately 4.2cm x 3.3cm x 2.5cm right parietal enhancing mass with surrounding edema",since 10 2014 fatigued more forgetful impaired dexterity on her left ha mr reveals an approximately 4 cm x 3 cm x 2 cm right parietal enhancing mass i surrounding oedema,0,10-2014,0,0,0,10-2014
9,9,1148116416,24 yo right handed woman with history of large right frontal mass s/p resection 11/3/1985 who had recent urgent R cranial wound revision and placement of L EVD for declining vision and increased drainage from craniotomy incision site and possible infection. She has a hx of secondary mania related to psychosis and manipulation of her right frontal lobe.,24 to right had woman i history of large right frontal mass s p resection 11 3 1985 who had recent urgent r cranial you revision a placement of l end for declining vision a increased drainage from craniotomy incision site a possible infection she has a he of secondary mania related to sycosis a manipulation of her right frontal love,0,11-3-1985,0,0,0,11-3-1985


In [10]:
df[["Match", "Match2"]]

Unnamed: 0,Match,Match2
0,12-11-16,12-11-2016
1,4-5-97,4-5-1997
2,8-1974,8-1974
3,10-7-01,10-7-2001
4,7-4-01,7-4-2001
5,2009,2009
6,7-1977,7-1977
7,9-10-70,9-10-1970
8,10-2014,10-2014
9,11-3-1985,11-3-1985


In [8]:
!say "DONEEEEEEEEEEEEEE"