In [7]:
import numpy as np
import pandas as pd
from IPython.display import display, Markdown, Latex
pd.set_option('max_colwidth', 1000)

import re, os, datetime
import textblob
import calendar

# Create the working folders up front; folders that already exist are left alone.
for folder in ("src", "data"):
    os.makedirs(folder, exist_ok=True)

# Set to True to re-run the message-cleaning cell and rewrite its CSV.
DEBUG = False

In [8]:
import urllib.request

# Remote folder that hosts the data files used by this notebook.
URL = "https://kmurphy.bitbucket.io/modules/Data_Mining_2/topics/08-Text_Mining/20-Mining_Dates/files/"

# Fetch each (file name, local folder) pair unless a local copy already exists.
for filename, dest in [("public.csv", "src")]:

    # URL ends with "/", so drop it before joining to avoid a "//" in the path.
    source = f"{URL.rstrip('/')}/{filename}"
    target = f"{dest}/{filename}"

    if not os.path.isfile(target):
        print(f"Downloading remote file {filename} to folder {dest}")
        # Download under a temporary name and rename once complete, so an
        # interrupted download is never mistaken for a valid local copy.
        partial = f"{target}.part"
        urllib.request.urlretrieve(source, partial)
        os.replace(partial, target)
    else:
        print(f"Using local copy of {filename} in folder {dest}")

# Load the raw messages and report (rows, columns).
df = pd.read_csv("src/public.csv")
print(df.shape)

Using local copy of public.csv in folder src
(715, 2)


In [9]:
def clean_message(src):
    """Return a normalised copy of one raw message.

    The text is lower-cased, passed through TextBlob's spelling correction,
    and then every run of word characters loses at most one punctuation
    character (one of ``- . ) * ? ! : ,``) directly before it and at most
    one directly after it.

    Parameters
    ----------
    src : str
        The raw message text.

    Returns
    -------
    str
        The cleaned message.
    """
    lowered = src.lower()
    corrected = "".join(textblob.TextBlob(lowered).correct())

    # Group 2 is the word itself; groups 1 and 3 are the optional punctuation
    # hugging it, which the substitution throws away.
    hugged_word = re.compile(r"([-.)*?!:,]?)([\w\d]+)([-.)*?!:,]?)")
    return hugged_word.sub(r"\2", corrected)

# The following cell reads src/clean_data.csv, so build that file whenever it
# is missing (fresh checkout) and not only when DEBUG forces a rebuild.
if DEBUG or not os.path.isfile("src/clean_data.csv"):
    df["Data"] = df["Raw"].apply(clean_message)

    # Placeholder columns, all initialised to 0.
    for column in ("Iter", "Match", "Day", "Month", "Year"):
        df[column] = 0

    # The index is written too; it comes back as an "Unnamed: 0" column on reload.
    df.to_csv("src/clean_data.csv")

    # A bare `df.head()` inside an `if` body renders nothing, so display explicitly.
    display(df.head())

In [10]:
# Reload the cleaned messages from disk, preview the first rows and report
# the frame's (rows, columns).
df = pd.read_csv("src/clean_data.csv")
preview = df.head()
display(preview)
print(df.shape)

Unnamed: 0.1,Unnamed: 0,Code,Raw,Data,Iter,Match,Day,Month,Year
0,0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor, Mcalester.",12 november 16 bad reaction to spice - synthesis my admitted to create manor mcalester,0,0,0,0,0
1,1,1039370009,".April, 5 97: made a phone call to Mom and Mom commented that he was talking very fast, hard to interrupt, but was in super happy spirits, so didn't make a big deal of it.",april 5 97 made a phone call to mon and mon commented that he was talking very fast hard to interrupt but was in super happy spirits so didn't make a big deal of it,0,0,0,0,0
2,2,1039574613,"A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue (s/p partial glossectomy and neck dissection in 8/1974) referred to psycho-oncology for assistance with adjustment issues following recovery. The patient does not meet criteria for a major mood or anxiety disorder. She is not at imminent risk of harm to self or others. She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement, and to think through how to continue to create a life for herself moving forward.",a pleasant 28 to woman with no formal psychiatric history and with a h/o sicca of the right tongue (s/p partial glossectomy and neck dissection in 8/1974 referred to psychiconcology for assistance with adjustment issues following recovery the patient does not meet criterion for a major mood or anxiety disorder she is not at imminent risk of harm to self or others she would benefit from psychotherapy to help her integrate her experience of cancer and the breakup of her engagement and to think through how to continue to create a life for herself moving forward,0,0,0,0,0
3,3,1039963589,"October 7, 01 [report_end]",october 7 01 [reported],0,0,0,0,0
4,4,1048901075,"July, 4, 01 Primary Care Doctor:",july 4 01 primary care doctor,0,0,0,0,0


(715, 9)


In [11]:
# Regex building blocks for date parts.
# Except for DELIMS and MONTHS_RANGE, each pattern is a bare top-level
# alternation: wrap it in a group, e.g. f"(?:{DAYS_RANGE})", before
# concatenating it with anything else.

# One separator between date parts: slash, hyphen or whitespace.
# Raw string so "\-" and "\s" reach the regex engine verbatim instead of being
# invalid string escapes (which newer Python versions warn about).
DELIMS = r"[/\-\s]"

# Day of month, optionally followed by an ordinal suffix (st/th/rd/nd).
# NOTE(review): the second branch also accepts "0" and "00", which are outside
# the intended 1-31 range.
DAYS_RANGE = r"(3[01])(st|th|rd|nd)?|([012]?\d)(st|th|rd|nd)?"

# Month as a number, 1 to 12, with an optional leading zero for 1-9.
MONTHS_NUM = r"1[012]|0?[1-9]"
# Month as lower-case English text (abbreviated or full), each optionally
# followed by one separator character.
# NOTE(review): there are no word-boundary anchors, so re.search can hit inside
# longer words (e.g. "oct" in "doctor"); a bare "sep" is not matched, only
# "sept" / "september".
MONTHS_TEXT = r"jan(uary)?[\s\-/]?|feb(ruary)?[\s\-/]?|mar(ch)?[\s\-/]?|apr(il)?[\s\-/]?|may[\s\-/]?|jun(e)?[\s\-/]?|jul(y)?[\s\-/]?|aug(ust)?[\s\-/]?|sept(ember)?[\s\-/]?|oct(ober)?[\s\-/]?|nov(ember)?[\s\-/]?|dec(ember)?[\s\-/]?"
# Either month form; group 1 captures a numeric month, group 2 a textual one.
MONTHS_RANGE = f"({MONTHS_NUM})|({MONTHS_TEXT})"

# Year: four digits 1920-1999 or 2000-2019, or a two-digit year (20-99, 00-19).
YEARS_RANGE = r"19([2-9][0-9])|20([01][0-9])|([2-9][0-9])|([01][0-9])"

In [15]:
def find_match(pattern, texts=None):
    """Find the first occurrence of ``pattern`` in each text.

    Parameters
    ----------
    pattern : str
        Regular expression handed to ``re.compile``.
    texts : iterable of str, optional
        The texts to search. Defaults to the ``"Data"`` column of the
        notebook-level ``df``.

    Returns
    -------
    list of str
        One entry per text, in order: the matched substring with surrounding
        whitespace stripped, or the string ``"None"`` when the pattern does
        not occur (or the entry is not a string, e.g. a missing value).
    """
    if texts is None:
        texts = df["Data"]

    regex = re.compile(pattern)  # compile once, not once per row

    match_array = []
    # Iterate over the values themselves, so the result does not depend on the
    # frame having a default 0..n-1 index.
    for text in texts:
        match = regex.search(text) if isinstance(text, str) else None
        match_array.append(match.group().strip() if match is not None else "None")

    return match_array

# English month names mapped to their calendar number. Every month is reachable
# by its full name and its three-letter abbreviation; September additionally by
# the four-letter "sept".
_MONTH_NUMBERS = {}
for _number, _name in enumerate(
    ("january", "february", "march", "april", "may", "june", "july",
     "august", "september", "october", "november", "december"),
    start=1,
):
    _MONTH_NUMBERS[_name] = _number
    _MONTH_NUMBERS[_name[:3]] = _number
_MONTH_NUMBERS["sept"] = 9


def transform_month(month):
    """Convert a textual English month into its number.

    Parameters
    ----------
    month : str or None
        A full month name, its three-letter abbreviation or ``"sept"``, in any
        letter case. Surrounding whitespace and ``-`` / ``/`` separators are
        ignored. ``None``, the empty string and the sentinel ``"None"`` mean
        "no month found".

    Returns
    -------
    int
        1-12 for a recognised month, 0 when no month was found.

    Raises
    ------
    ValueError
        If the text is not a recognised month name.
    """
    if month is None:
        return 0

    # A match may carry the separator that followed the month (e.g. "jan/").
    name = month.strip(" \t\r\n-/").lower()

    # Compare by value: an identity test against the literal "None" only works
    # when both strings happen to be the same interned object.
    if name in ("", "none"):
        return 0

    try:
        return _MONTH_NUMBERS[name]
    except KeyError:
        raise ValueError(f"unrecognised month text: {month!r}") from None

# First textual month found in each cleaned message ("None" when absent),
# followed by its numeric equivalent.
month_texts = find_match(MONTHS_TEXT)
df["Month_text"] = month_texts
df["Month"] = df["Month_text"].apply(transform_month)

df.head()

Unnamed: 0.1,Unnamed: 0,Code,Raw,Data,Iter,Match,Day,Month,Year,Month_text
0,0,1012720972,".12, Noember 16- bad reaction to SpiceK2 - synthetic MJ- admitted to Crete Manor, Mcalester.",12 november 16 bad reaction to spice - synthesis my admitted to create manor mcalester,0,0,0,11,0,november
1,1,1039370009,".April, 5 97: made a phone call to Mom and Mom commented that he was talking very fast, hard to interrupt, but was in super happy spirits, so didn't make a big deal of it.",april 5 97 made a phone call to mon and mon commented that he was talking very fast hard to interrupt but was in super happy spirits so didn't make a big deal of it,0,0,0,4,0,april
2,2,1039574613,"A pleasant 28 yo woman with no formal psychiatric history and with a h/o SCCA of the right tongue (s/p partial glossectomy and neck dissection in 8/1974) referred to psycho-oncology for assistance with adjustment issues following recovery. The patient does not meet criteria for a major mood or anxiety disorder. She is not at imminent risk of harm to self or others. She would benefit from psychotherapy to help her integrate her experience of cancer and the break-up of her engagement, and to think through how to continue to create a life for herself moving forward.",a pleasant 28 to woman with no formal psychiatric history and with a h/o sicca of the right tongue (s/p partial glossectomy and neck dissection in 8/1974 referred to psychiconcology for assistance with adjustment issues following recovery the patient does not meet criterion for a major mood or anxiety disorder she is not at imminent risk of harm to self or others she would benefit from psychotherapy to help her integrate her experience of cancer and the breakup of her engagement and to think through how to continue to create a life for herself moving forward,0,0,0,0,0,
3,3,1039963589,"October 7, 01 [report_end]",october 7 01 [reported],0,0,0,10,0,october
4,4,1048901075,"July, 4, 01 Primary Care Doctor:",july 4 01 primary care doctor,0,0,0,7,0,july
