# Date Mining Assignment
#### Author: Emmanuel Sedicol

In [1]:
import numpy as np
import pandas as pd
from IPython.display import display, Markdown, Latex
pd.set_option('max_colwidth', 1000)

import re, os, datetime
import textblob
import calendar

for d in ["src","data"]: os.makedirs(d, exist_ok=True)
    
DEBUG = False

## Import Data

In [2]:
URL = "https://kmurphy.bitbucket.io/modules/Data_Mining_2/topics/08-Text_Mining/20-Mining_Dates/files/"

for filename, dest in [("public.csv", "src")]:
    
    source = f"{URL}/{filename}"
    target = f"{dest}/{filename}"

    if not os.path.isfile(target):
        print (f"Downloading remote file {filename} to folder {dest}", sep="")
        import urllib.request
        urllib.request.urlretrieve(source, target)
    else:
        print(f"Using local copy of {filename} in folder {dest}")

df = pd.read_csv("src/public.csv")
print(df.shape)

Using local copy of public.csv in folder src
(715, 2)


## Regular Expressions

#### Posible date formats after data cleaning
- 04/02/2009 (day, month, year)
- Mar 2 2009 (month, day, year)
- Feb 2009 (_, month, year)
- 2009 (_, _, year)

In [3]:
# quick access to delimeters
DELIMS = "[/\-\s]"

# valid days range is from 1 to 31
DAYS_RANGE = r"(3[01])|([012]?\d)"

# valid months range is from 1 to 12
MONTHS_NUM = r"1[012]|0?[1-9]"
MONTHS_TEXT = r"january[\s\-/]?|february[\s\-/]?|march[\s\-/]?|april[\s\-/]?|may[\s\-/]?|june[\s\-/]?|july[\s\-/]?|august[\s\-/]?|september[\s\-/]?|october[\s\-/]?|(no)v?ember[\s\-/]?|december[\s\-/]?|jan[\s\-/]?|feb[\s\-/]?|mar[\s\-/]?|apr[\s\-/]?|jun[\s\-/]?|jul[\s\-/]?|aug[\s\-/]?|sept[\s\-/]?|oct[\s\-/]?|nov[\s\-/]?|dec[\s\-/]?"
MONTHS_RANGE = f"({MONTHS_NUM})|({MONTHS_TEXT})"

# valid years range is from 1920 to (YYYY or last two digit of year YY)
YEARS_RANGE = r"19([2-9][0-9])|20([01][0-9])|([2-9][0-9])|([01][0-9])"

# posible date formats
DD_MM_YYYY = f"({DAYS_RANGE})({DELIMS})({MONTHS_RANGE})({DELIMS})({YEARS_RANGE})"
MM_DD_YYYY = f"({MONTHS_RANGE})({DELIMS})({DAYS_RANGE})({DELIMS})({YEARS_RANGE})"
MM_YYYY = f"({MONTHS_RANGE})({DELIMS})({YEARS_RANGE})"
DD_YY = r"\s(3[01])|([012]?\d)\s(19([2-9][0-9])|20([01][0-9])|([2-9][0-9])|([01][0-9]))"
YYYY = f"({YEARS_RANGE})"

FULL_FORMAT = re.compile(f"({DD_MM_YYYY})|({MM_DD_YYYY})|({MM_YYYY})|({DD_YY})|({YYYY})")

## Cleaning Step

As a result of this step, the dataframe will have columns

 * __Code__ unchanged
 * __Raw__ unchanged
 * __Data__ cleaned version of __Raw__
 * __Iter__ number of regex that matched this row (for development purposes)
 * __Match__ regex object result for this row (for development purposes, you might not use)
 * __Day__ day (or zero if not set)
 * __Month__ month (or zero if not set)
 * __Year__ year (or zero if not set)

In [4]:
month_dict = {
    "jan":"january",
    "feb":"february",
    "mar":"march",
    "apr":"april",
    "jun":"june",
    "jul":"july",
    "aug":"august",
    "agust":"august",
    "auust":"august",
    "sep":"september",
    "sept":"september",
    "oct":"october" ,
    "nov":"november" ,
    "dec":"december" 
   
}

# Complete incomplete months e.g. jan => january
def complete_month(txt):
    regex = re.compile(r"jan\b|feb\b|mar\b|apr\b|jun\b|jul\b|aug\b|agust\b|auust\b|sep\b|sept\b|oct\b|nov\b|dec\b")
    match = re.search(regex, txt)
    
    if match is not None:
        target = match.group().strip()
        result = re.sub(re.compile(target), month_dict[target],txt)
    else:
        result = txt
        
    return result


FORMAT_PATTERN = f"({DAYS_RANGE})\s({MONTHS_TEXT})\s({YEARS_RANGE})"
# fomate date order e.g. DD, MM, YYYY => MM, DD, YYYY
def format_date_layout(txt):
    match =  re.search(FORMAT_PATTERN, txt)
    # if match found
    if match is not None: 
        data = str((match.group())).split()
        result = re.sub(match.group().strip(), f"{data[1]} {data[0]} {data[2]}", txt)
    else:
        result = txt
    return result
        
# lower case text, complete spelling for all months, check spellings and removed unwanted characters
def clean_message(src):
     # seperate attached strings to upper case letters e.g. 'yAug' => 'y Aug'
    clean_step1 = re.sub(r'([a-z]?[a-z])([A-Z]?[A-Z])', r'\g<1> \g<2>' ,src)

    # low caps all characters
    clean_step2 = clean_step1.lower()

    # first month correction e.g. aug => august
    month_correction1 = complete_month(clean_step2)

    # remove all of (st|th|rd|nd) e.g. 21st => 21
    clean_step3 = re.sub(re.compile(r"([\w\d]+)(st|th|rd|nd)"), r"\1", month_correction1)

    # remove all of '.', '?', '!', ')', '*' 
    clean_step4 = re.sub(re.compile(r"([\(\))*?!:~;]?)([\w\d]+)([\(\))*?!:~;]?)"), r"\2", clean_step3)

    # remove any ['-', '/'] and replace with space
    clean_step5 = re.sub(re.compile(r"[.\-/,]"), " ", clean_step4)

    # insert space bewteen connected digit and word character e.g. '6june' => '6 june'
    clean_step6 = re.sub(r'([a-z]?[a-z])([0-9]?[0-9])', r'\g<1> \g<2>' ,clean_step5)
    clean_step7 = re.sub(r'([0-9]?[0-9])([a-z]?[a-z])', r'\g<1> \g<2>' ,clean_step6)

    # reduce triple spacing to one
    spacing_correction1 = clean_step7.replace("   ", " ")

    # reduce double spacing to one
    spacing_correction2 = spacing_correction1.replace("  ", " ")

    # spellchecker
    spell_checker = "".join(textblob.TextBlob(spacing_correction2.strip()).correct())

    # second month check
    month_correction2 = complete_month(spell_checker)

    # final clean: fix date format to (MM, DD, YYYY)
    final_clean = format_date_layout(month_correction2)
    
    return final_clean.strip()

if DEBUG:
    df["Data"] = df["Raw"].apply(lambda txt: clean_message(txt))      
    df["Date"] = 0     
    df["Day"] = 0
    df["Month"] = 0
    df["Year"] = 0
    
    df.to_csv("src/clean_data.csv")

In [5]:
# Use clean dataset from this point onwards
df = pd.read_csv("src/clean_data.csv")

display(df.head(3))
print(df.shape)

Unnamed: 0.1,Unnamed: 0,Code,Raw,Data,Date,Day,Month,Year
0,0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor, Mcalester.",november 12 16 bad reaction to spice k 2 kinetic my admitted to create manor career,0,0,0,0
1,1,1039370009,".April, 5 97: made a phone call to Mom and Mom commented that he was talking very fast, hard to interrupt, but was in super happy spirits, so didn't make a big deal of it.",april 5 97 made a phone call to mon a mon commented that he was talking very a ha to interrupt but was in super happy spirits so didn't make a big deal of it,0,0,0,0
2,2,1039574613,"A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue (s/p partial glossectomy and neck dissection in 8/1974) referred to psycho-oncology for assistance with adjustment issues following recovery. The patient does not meet criteria for a major mood or anxiety disorder. She is not at imminent risk of harm to self or others. She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement, and to think through how to continue to create a life for herself moving forward.",a pleasant 28 to woman i no formal psychiatric history a i a h o sicca of the right tongue s p partial glossectomy a neck dissection in 8 1974 referred to psychic oncology for assistance i adjutant issues following recovery the patient does not meet criterion for a major mood or anxiety dinner she is not at imminent risk of harm to self or ours she would benefit from psychoerapy to help her integrate her experience of cancer a the break up of her engagement a to think through how to continue to create a life for herself moving for,0,0,0,0


(715, 8)


## Helper Function

- Find Match Function: return a list of all matches
 
- Transform Month Function: iterate through input text and transform month onto its numerical value

- Transform Year Function: add "19" or "20" to start of year

- Populate function to insert default values to missing dates e.g. 23/2019 => 01/23/2009

In [7]:
def transform_month(month):
    # search for months in text pattern
    match = re.search(re.compile(MONTHS_TEXT), month)
    
    # if match found
    if match is not None:  
        # change month to its numerical value
        month_num = datetime.datetime.strptime(complete_month(match.group().strip()), "%B").month
        result = re.sub(re.compile(match.group().strip()), str(month_num), month)
    else:
        result = month
    # return a formated numerical dates seperated by '-'
    return result.replace(" ", "/")


def transform_year(txt):
    result = ""
    data = txt.split("/")
    data_length = len(data) - 1
    
    if len(str(data[data_length])) >= 2 and len(str(data[data_length])) < 4:
        nineties_range = re.sub(r"([2-9][0-9])", "19" + data[data_length], data[data_length])
        twenties_range = re.sub(r"([01][0-9])", "20" + data[data_length], data[data_length])

        data[data_length] = nineties_range if len(nineties_range) == 4 else twenties_range
        result = "/".join(data)
    else:
        result = txt
        
    return result

def populate_missing_values(txt):
    result = ""
    data = txt.split("/")
    
    if len(data) == 2:
        result = "01/" + txt
    elif len(data) == 1:
        result = "01/1/" + txt
    else:
        result = txt
        
    return result

# Change any text dates into numerical values
def find_match(pattern):
    match_array = []
    
    for row in range(0, len(df)):
        longest_match = []
        match = re.finditer(re.compile(FULL_FORMAT), df["Data"].loc[row]) 
        for m in match:
            longest_match.append(m[0])

        # retirive longest match values
        result = max(longest_match, key=len).strip()
        
        month_transform = transform_month(result)
        year_transform = transform_year(month_transform)
        populate_missing_dates = populate_missing_values(year_transform)
        
        match_array.append(populate_missing_dates) if len(longest_match) > 0 else match_array.append("None")

    return match_array

df["Date"] = find_match(FULL_FORMAT)

In [8]:
# function to slice match values in order to access month, day and year values
def slice_match(row, section):
    data = row.replace('//', '/')
    data_split = data.split('/')

    return data_split[section]

# mapping values
df['Day'] = df.Date.apply(lambda txt: slice_match(txt, 1))  
df['Month'] = df.Date.apply(lambda txt: slice_match(txt, 0))  
df['Year'] = df.Date.apply(lambda txt: slice_match(txt, 2))  

In [9]:
df = df.sort_values(by=['Date','Raw'])
df.head(20)

Unnamed: 0.1,Unnamed: 0,Code,Raw,Data,Date,Day,Month,Year
551,551,7783314663,". Patient primary concern is related to a TBI experienced at bootcamp in 01/2007. He reports a fellow army recruit sucker punched him from behind and knocked him out immediately. He is not sure how long he was out for but the next thing he remembers he was in the ambulance going to the hospital. He reports problems since that time although it was hard to get a sense of the specific symptoms he is experiencing. He reports he tends to wake up in the middle of the night and feels like he is having a seizure and can't control his body. He will feel disoriented at times (not completely dissociative, but feels ""weird, out of my body."" He also had headaches and feels lightheaded. He reports he has also been struggling with depression since the incident where he will have good days and down days. On the down days, he wants to sleep all day and not get out of bed.","patient primary concern is related to a ti experienced at bootcamp in 01 2007 he reports a fellow army recruit sucked punched him from beri a knocked him out immediately he is not sure how long he was out for but the next thing he remembers he was in the ambulance going to the hospital he reports problems since that time slough it was ha to get a sense of the specific symptoms he is experiencing he reports he yes to wake up in the middle of the night a feels like he is having a seizure a can't control his body he will feel disoriented at times not completely dissociative but feels ""we out of my body "" he also had headache a feels ligheaded he reports he has also been struggling i depression since the incident where he will have good days a down days on the down days he wants to sleep all day a not get out of bed",01/01/2007,1,1,2007
391,391,5866628778,s 03/1980 Positive PPD: treated with INH for 6 months,s 03 1980 positive pp treated i in for 6 mon,01/03/1980,3,1,1980
13,13,1174525826,"kNotice that in 03/1990, sustained a bizarre injury. He was in Colorado City at the time. He was driving his car, and he says he had recently ran out of Saphris, which is an antipsychotic he was taking. He says he does not recall all the events but believes he stepped out of his vehicle and then walked off of a bridge, sustaining a seven-story fall. He was found unconscious. He was taken to and treated at Norfolk Health Center in Colorado City, where he underwent open reduction internal fixation of the right humerus as well as the left femur. was in ICU for a week, multiple fx. He subsequently recovered from his injuries in the state of South Carolina",k notice that in 03 1990 stained a bizarre injury he was in colorado city at the time he was driving his car a he says he had recently ran out of saphris which is an antipsychotic he was taking he says he does not recall all the events but believes he stepped out of his vehicle a then walked off of a bridge staining a seven story fall he was for unconscious he was taken to a treated at norfolk heal center in colorado city where he urgent open reduction internal fixation of the right humerus as well as the left femur was in ice for a week multiple ff he subsequently recovered from his injuries in the state of you carolina,01/03/1990,3,1,1990
663,663,9392846158,01/05/1999 [report_end],01 05 1999 [report],01/05/1999,5,1,1999
706,706,9847571824,06/1973 Primary Care Doctor:,06 1973 primary care doctor,01/06/1973,6,1,1973
309,309,4903818618,06/1981 Hx of Brain Injury: Yes,06 1981 he of brain injury yes,01/06/1981,6,1,1981
153,153,2914541592,08/1988 Primary Care Doctor:,08 1988 primary care doctor,01/08/1988,8,1,1988
486,486,7019242944,sOne prior voluntary hospitalization in 09/1975 for depression on Psychiatry. Hx of Outpatient Treatment: Yes,s one prior voluntary hospitalization in 09 1975 for depression on psychiatric he of outpatient treatment yes,01/09/1975,9,1,1975
594,594,8305170502,"sPatient reported losing three friends that passed away during his deployment, including a close friend Jacques, and two other friends that he lived with for a period of time. A fourth friend passed away prior to his deployment in 1972. Patient reported thinking about his friends daily and described his feelings of grief as remaining constant since they died. Alcohol Use: How often did you have a drink containing alcohol in the past year: Two to four times a month (2 points)",s patient reported losing three fires that passed away during his employment including a close free jacques a two or fires that he lived i for a period of time a four free passed away prior to his employment in 1972 patient reported thinking about his fires daily a described his feelings of grief as remaining conan since they died alcohol use how often did you have a drink containing alcohol in the pa year two to four times a mon 2 points,01/1/1972,1,1,1972
424,424,6311876851,Contemplating jumping off building - 1973 - difficulty writing paper.,contemplation jumping off building 1973 difficulty writing paper,01/1/1973,1,1,1973


In [12]:
df.to_csv("results.csv", index=False)
from zipfile import ZipFile
archive = "date_assignment.zip"
print(f"Creating archive: {archive}")
with ZipFile(archive,"w") as zip:
    for f in ["01-Model.ipynb", "results.csv"]:
        if os.path.isfile(f):
            print(f"\t{f} - OK")
            zip.write(f) 
        else:
            print(f"\t{f} - Missing. Check this!")

Creating archive: date_assignment.zip
	01-Model.ipynb - OK
	results.csv - OK
