In [1]:
import random
import train_model
import spacy
import numpy as np
from pathlib import Path
import nltk

In [2]:
from keras.layers import Bidirectional, Concatenate, LSTM, Dot, Input, Multiply
from keras.layers import RepeatVector, Dense, Activation
from keras.optimizers import RMSprop
from keras.utils import to_categorical
from keras.models import Model
import keras.backend as K

Using TensorFlow backend.


Create the date and time translator models

In [3]:
def _read_vocab(path):
    """Read a tab-separated ``token<TAB>index`` vocabulary file into a dict."""
    vocab = dict()
    with open(path, "r", encoding='utf-8') as inputfile:
        for row in inputfile:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would eat the final digit of the index when the
            # file does not end with a newline.
            row = row.rstrip('\n')
            if not row:
                # A blank line carries no entry; skip it instead of crashing.
                continue
            kc = row.split('\t')
            vocab[kc[0]] = int(kc[1])
    return vocab


def load_human_machine_dict(h_dict_path, m_dict_path):
    """Load the input ("human") and output ("machine") vocabularies.

    Both files are UTF-8 text with one ``token<TAB>index`` entry per line.

    Args:
        h_dict_path: Path of the human-side vocabulary file.
        m_dict_path: Path of the machine-side vocabulary file.

    Returns:
        A tuple ``(hdict, inv_mdict)`` where ``hdict`` maps token -> index and
        ``inv_mdict`` maps index -> token (the machine vocabulary inverted, so
        predicted indices can be turned back into characters).

    Raises:
        IndexError: If a non-blank line has no tab-separated index column.
        ValueError: If the index column is not an integer.
    """
    hdict = _read_vocab(h_dict_path)
    mdict = _read_vocab(m_dict_path)
    inv_mdict = {c: k for k, c in mdict.items()}
    return hdict, inv_mdict

In [4]:
def preprocess(x, hdict, max_len=30):
    """Encode a string as a fixed-length list of one-hot character vectors.

    The text is lower-cased and commas are removed. Each remaining character
    is mapped to its index in ``hdict``; characters missing from ``hdict``
    map to ``hdict['<unk>']``. The index sequence is then truncated or
    right-padded with index 0 so that it has exactly ``max_len`` entries.

    Args:
        x: Input string.
        hdict: Mapping from character to integer index. Must contain
            ``'<unk>'`` whenever ``x`` holds a character not in the mapping.
        max_len: Number of time steps to produce. The default of 30 is the
            input length the translator models in this notebook are built
            with.

    Returns:
        A list of ``max_len`` one-hot vectors, each of length ``len(hdict)``.
    """
    x = x.lower().replace(',', '')
    # Truncate over-long input: without this the encoded sequence would be
    # longer than the fixed input length the model accepts.
    xnum = [hdict[c] if c in hdict else hdict['<unk>'] for c in x[:max_len]]
    # Pad with index 0 (presumably the padding token -- confirm against the
    # vocabulary file).
    xnum.extend([0] * (max_len - len(xnum)))
    return [to_categorical(i, num_classes=len(hdict)) for i in xnum]

In [5]:
def softmax(x, axis=1):
    """Softmax over ``axis`` for tensors with two or more dimensions.

    Args:
        x: Backend tensor.
        axis: Axis to normalise over. Only used when ``x`` has more than two
            dimensions; 2-D input is delegated to ``K.softmax``.

    Returns:
        A tensor of the same shape as ``x``.

    Raises:
        ValueError: If ``x`` has fewer than two dimensions.
    """
    rank = K.ndim(x)
    if rank < 2:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
    if rank == 2:
        return K.softmax(x)
    # Subtracting the per-axis maximum does not change the result and keeps
    # the exponentials from overflowing.
    exps = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return exps / K.sum(exps, axis=axis, keepdims=True)

In [6]:
def attention(a, s_prev):
    """Compute one attention context vector over the encoder outputs.

    The previous decoder state is repeated along the time axis, concatenated
    with the encoder outputs, scored by a small two-layer network, normalised
    with ``softmax`` over the time axis, and used to take a weighted sum of
    the encoder outputs.

    Note that every call instantiates its own ``Dense`` layers, so the
    attention weights are not shared between calls. Keep it that way unless
    the saved weight files are regenerated (presumably they were trained with
    this exact layout -- confirm).

    Args:
        a: Encoder output tensor with a time axis at position 1.
        s_prev: Previous decoder hidden state (2-D tensor).

    Returns:
        The context tensor produced by ``Dot(axes=1)`` of the attention
        weights and ``a``.
    """
    # Take the number of time steps from the encoder output instead of
    # hard-coding it, so the function follows whatever input length the model
    # was built with. Fall back to 30 when the static length is unknown.
    n_steps = K.int_shape(a)[1] or 30
    s_prev = RepeatVector(n_steps)(s_prev)
    concat = Concatenate(axis=-1)([a, s_prev])
    d1 = Dense(10, activation="tanh")(concat)
    d2 = Dense(1, activation="relu")(d1)
    # The module-level softmax defaults to axis=1, i.e. the time axis.
    alphas = Activation(softmax)(d2)
    return Dot(axes=1)([alphas, a])

In [7]:
def modeling(xl, yl, n_a, n_s, hvocab_size, mvocab_size):
    """Build the attention-based sequence-to-sequence translation model.

    A bidirectional LSTM encodes the one-hot input sequence. For each of the
    ``yl`` output positions a fresh attention block, LSTM and softmax
    ``Dense`` layer are created (layers are not shared across positions).

    Args:
        xl: Input sequence length.
        yl: Output sequence length (number of decoder steps / model outputs).
        n_a: Units of each direction of the encoder LSTM.
        n_s: Units of the decoder LSTM; also the size of the two state inputs.
        hvocab_size: Size of the input (human) vocabulary.
        mvocab_size: Size of the output (machine) vocabulary.

    Returns:
        A Keras ``Model`` taking ``(X, s0, c0)`` and returning a list of
        ``yl`` per-position probability tensors.
    """
    X = Input(shape=(xl, hvocab_size))
    s0 = Input(shape=(n_s,))
    c0 = Input(shape=(n_s,))

    encoded = Bidirectional(LSTM(units=n_a, return_sequences=True))(X)

    hidden, cell = s0, c0
    step_outputs = []
    for _step in range(yl):
        context = attention(encoded, hidden)
        # With return_state=True the LSTM yields (output, state_h, state_c);
        # the output is carried forward as the next hidden state.
        decoder = LSTM(n_s, return_state=True)
        hidden, _unused, cell = decoder(inputs=context, initial_state=[hidden, cell])
        step_outputs.append(Dense(mvocab_size, activation=softmax)(hidden))
    return Model(inputs=(X, s0, c0), outputs=step_outputs)

In [8]:
def load_date_translator(hvocab_size, mvocab_size, weights_path="date_model.h5"):
    """Build the date translator and load its pre-trained weights.

    The network is created with input length 30, output length 10 (the
    length of a ``YYYY-MM-DD`` string), 32 encoder units per direction and 64
    decoder units.

    Args:
        hvocab_size: Size of the input (human) vocabulary.
        mvocab_size: Size of the output (machine) vocabulary.
        weights_path: Weights file to load. Defaults to ``"date_model.h5"``
            in the working directory.

    Returns:
        The Keras model with weights loaded.
    """
    themodel = modeling(30, 10, 32, 64, hvocab_size, mvocab_size)
    themodel.load_weights(weights_path)
    return themodel

In [9]:
def load_time_translator(hvocab_size, mvocab_size, weights_path="time_model.h5"):
    """Build the time translator and load its pre-trained weights.

    The network is created with input length 30, output length 11 (the
    length of an ``HH:MM-HH:MM`` string), 32 encoder units per direction and
    64 decoder units.

    Args:
        hvocab_size: Size of the input (human) vocabulary.
        mvocab_size: Size of the output (machine) vocabulary.
        weights_path: Weights file to load. Defaults to ``"time_model.h5"``
            in the working directory.

    Returns:
        The Keras model with weights loaded.
    """
    themodel = modeling(30, 11, 32, 64, hvocab_size, mvocab_size)
    themodel.load_weights(weights_path)
    return themodel

Create the models and load their pre-trained weights

In [10]:
# Load the date vocabularies (token -> index for the input side, index -> token
# for the output side) and build the date translator sized to them.
d_hdict, d_inv_mdict = load_human_machine_dict("h_dict.txt", "m_dict.txt")
date_model = load_date_translator(len(d_hdict), len(d_inv_mdict))

In [11]:
# Load the time vocabularies (token -> index for the input side, index -> token
# for the output side) and build the time translator sized to them.
t_hdict, t_inv_mdict = load_human_machine_dict("time_h_dict.txt", "time_m_dict.txt")
time_model = load_time_translator(len(t_hdict), len(t_inv_mdict))

In [12]:
def getDate(s, hdict, inv_mdict, model):
    """Translate a free-form date string with the given translator model.

    Args:
        s: Human-readable date text, e.g. ``'dec 7'``.
        hdict: Input vocabulary (character -> index) used by ``preprocess``.
        inv_mdict: Output vocabulary (index -> character).
        model: Translator taking ``[x, s0, c0]`` and returning one probability
            array per output position.

    Returns:
        The decoded string, one character per model output.
    """
    x = np.array([preprocess(s, hdict)])
    # Zero initial hidden and cell state for a batch of one. 64 is the
    # decoder size the translators in this notebook are built with.
    s0 = np.zeros((1, 64))
    c0 = np.zeros((1, 64))
    step_probs = model.predict([x, s0, c0])
    # argmax over axis 1 gives a length-1 array for the single batch item;
    # index it explicitly, since int() on a non-0-d array is deprecated.
    return "".join(inv_mdict[int(np.argmax(w, axis=1)[0])] for w in step_probs)

In [13]:
# Quick check of the date translator on an input that does not mention a year.
getDate('dec 7', d_hdict, d_inv_mdict, date_model)

'0000-12-07'

In [14]:
def getTime(s, hdict, inv_mdict, model):
    """Translate a free-form time string with the given translator model.

    Args:
        s: Human-readable time text, e.g. ``'8-9am'``.
        hdict: Input vocabulary (character -> index) used by ``preprocess``.
        inv_mdict: Output vocabulary (index -> character).
        model: Translator taking ``[x, s0, c0]`` and returning one probability
            array per output position.

    Returns:
        The decoded string, one character per model output.
    """
    x = np.array([preprocess(s, hdict)])
    # Zero initial hidden and cell state for a batch of one. 64 is the
    # decoder size the translators in this notebook are built with.
    s0 = np.zeros((1, 64))
    c0 = np.zeros((1, 64))
    step_probs = model.predict([x, s0, c0])
    # argmax over axis 1 gives a length-1 array for the single batch item;
    # index it explicitly, since int() on a non-0-d array is deprecated.
    return "".join(inv_mdict[int(np.argmax(w, axis=1)[0])] for w in step_probs)

In [15]:
# Quick check of the time translator on a time range.
getTime('8-9am', t_hdict, t_inv_mdict, time_model)

'08:00-09:00'

In [16]:
# Convenience wrappers bound to the globally loaded models, used by the demo below.
def getdate(s):
    """Translate ``s`` with the global date model, dropping a '0000' year.

    When the translated string starts with the year ``'0000'``, only the part
    after the ``'0000-'`` prefix is returned; otherwise the full translation
    is returned.
    """
    translated = getDate(s, d_hdict, d_inv_mdict, date_model)
    year_missing = translated[:4] == '0000'
    return translated[5:] if year_missing else translated

def gettime(s):
    """Translate ``s`` with the global time model, dropping a '99:99' end time.

    When everything from index 6 onward equals ``'99:99'`` (presumably the
    model's placeholder for "no end time" -- confirm against the training
    data), only the first five characters are returned; otherwise the full
    translation is returned.
    """
    translated = getTime(s, t_hdict, t_inv_mdict, time_model)
    if translated[6:] != '99:99':
        return translated
    return translated[:5]

Load the named-entity recognition (NER) model

In [74]:
# Paths of the two saved NER models (presumably a small and a large variant,
# judging by the names -- confirm).
sm_model = "ner_model_sm"
lg_model = "ner_model"
# The model that loadTrainedNERModel() will load; switch to lg_model to use the other one.
model_path = sm_model

In [75]:
def loadTrainedNERModel():
    """Load the spaCy NER pipeline stored at the global ``model_path``.

    If that path does not exist yet, ``train_model.train_model(100)`` is run
    first (it is expected to write the model to ``model_path``; that code is
    not visible here) and the pipeline is loaded afterwards.

    Returns:
        The loaded spaCy pipeline.
    """
    model_dir = Path(model_path)
    if not model_dir.exists():
        train_model.train_model(100)
    return spacy.load(model_dir)

In [76]:
def ner_model_on_test_set(nlp):
    """Run ``nlp`` on every line of ``test.txt`` and print the entities found.

    For each line the raw text is printed, followed by a list of
    ``(entity text, entity label)`` pairs.

    Args:
        nlp: Callable that takes a string and returns an object with an
            ``ents`` iterable whose items expose ``text`` and ``label_``.
    """
    with open("test.txt", "r", encoding='utf-8') as inputfile:
        # Strip only the newline: slicing off the last character would
        # truncate the final line when the file has no trailing newline.
        test_data = [line.rstrip('\n') for line in inputfile]
    for text in test_data:
        doc = nlp(text)
        print(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        # Token-level view, handy when debugging entity boundaries:
        # print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

In [77]:
# Global NER pipeline; getInfo() below reads this name directly.
nlp = loadTrainedNERModel()

In [78]:
# Parse one sample announcement so its recognised entities can be inspected in the next cell.
text = "Hello, we will have a final exam next week on Dec 12 in CENTER 101 I am bringing a kid to the final exam so you might hear some nonsense"
doc = nlp(text)

In [79]:
# Each item of doc.ents is an entity span (not a single token); show its text and label.
for tk in doc.ents:
    print(tk.text, tk.label_)

final exam next week EVENT
Dec 12 DATE
CENTER 101 LOCATION
final exam EVENT


In [80]:
def hasNum(s):
    """Return True when ``s`` contains at least one digit character."""
    for ch in s:
        if ch.isdigit():
            return True
    return False

In [81]:
def _closest_span(candidates, anchor):
    """Return the span among ``(span, label)`` pairs whose start is nearest ``anchor``, or None if empty."""
    if not candidates:
        return None
    # min() keeps the first candidate on ties, i.e. the earliest in the text.
    return min(candidates, key=lambda pair: abs(pair[0].start - anchor))[0]


def getInfo(text):
    """Extract the primary event and its date, time and location from ``text``.

    The text is run through the global ``nlp`` pipeline. The first EVENT
    entity is the primary event; for date, time and location the entity whose
    start position is closest to the primary event is chosen. Without any
    EVENT entity, position 0 is used as the reference point.

    Dates containing a digit are treated as "specific" and are preferred over
    dates without digits (e.g. weekday names or "next week"); only specific
    dates are normalised with ``getdate``. Times are normalised with
    ``gettime``.

    Args:
        text: Announcement text to analyse.

    Returns:
        A tuple ``(info, entities)``. ``info`` is a dict with the keys
        ``'Event'``, ``'Date'``, ``'Time'`` and ``'Location'`` whose values
        are strings, or None when nothing was found. ``entities`` is the list
        of ``(span, label)`` pairs for every recognised entity.
    """
    doc = nlp(text)
    entities = [(ent, ent.label_) for ent in doc.ents]

    events = []
    s_dates = []  # dates containing a digit, e.g. "Dec 12"
    u_dates = []  # dates without a digit, e.g. "Saturday"
    times = []
    locations = []
    for enti in entities:
        span, label = enti
        if label == 'EVENT':
            events.append(enti)
        elif label == 'DATE':
            if hasNum(span.text):
                s_dates.append(enti)
            else:
                u_dates.append(enti)
        elif label == 'TIME':
            times.append(enti)
        else:
            # Every remaining label is treated as a location.
            locations.append(enti)

    if events:
        # The first event is the primary one; everything else is picked by
        # proximity to it.
        pevt = events[0][0]
        poff = pevt.start
        event = pevt.text
    else:
        poff = 0
        event = None

    # Prefer the closest specific date; otherwise fall back to the closest
    # unspecific one and return its text untranslated. The translator runs
    # only once, on the chosen entity.
    date = None
    if s_dates:
        date = getdate(_closest_span(s_dates, poff).text)
    elif u_dates:
        date = _closest_span(u_dates, poff).text

    time = None
    nearest_time = _closest_span(times, poff)
    if nearest_time is not None:
        time = gettime(nearest_time.text)

    loc = None
    nearest_loc = _closest_span(locations, poff)
    if nearest_loc is not None:
        loc = nearest_loc.text

    return {'Event':event, 'Date':date, 'Time':time, 'Location':loc}, entities

In [82]:
# Sample announcements used to exercise getInfo(); the demo cell below selects one of them.
text0 = "We will have a review session for exam 2 this Wednesday, October 30th from 7-8pm in SOLIS 104."
text1 = "INFO SESSION October 21, 2019 5:30 - 7:30 pm Qualcomm Conference Center Description:What do we do? COME FIND OUT! Network with the engineers after a Q+A panel. "
text2 = "The William Lowell Putnam Competition (http://math.scu.edu/putnam/index.html) is a college level mathematics competition that will take place over the course of two three-hour segments on Saturday December 1st. The questions are proof based and generally fairly difficult. While finding solutions often requires ingenuity and critical thinking, they rarely require knowledge of mathematics beyond calculus or perhaps differential equations (though knowing more advanced topics can be useful at times)."
text3 = "Hi all, we will meet for a practice competition and team formation meeting this Saturday at 11am in B230. We will spend a little time forming teams, then do a mini-competition with those teams, and then discuss the problems briefly. I'll bring some breakfast stuff – donuts/coffee/bagels. Everyone is welcome to participate! Don't feel like you can't or shouldn't come based on missing a meeting or your performance in the team selection competition a few days ago."
text4 = "Due to midterms, a number of people have requested PA2 extension. We have extended the due date by 2 days to October 30th. Please note that we cannot go beyond this as we need to keep the course on track."
textfav = "You're receiving this email because you registered for the Fall 2018 Beginner's Programming Competition. The Winter 2019 Beginner's Programming Competition, sponsored by Microsoft and Associated Students UCSD, will be held Saturday, March 9th, from 3pm-7:30pm. We'll have some really cool prizes from Microsoft for the winners. Winners will also get to submit their resumes to Microsoft for consideration for Summer 2020 internships!"
textfail = "On Sunday, September 23rd, Associated Student Concerts and Events will be hosting their annual Fall Y’all Festival for our undergraduate students at RIMAC Field. Due to the construction of event infrastructure and additional security measures, Hopkins Drive will be closed from 7 PM through 11:59 PM on the evening of the 23rd.  During that time, Hopkins Drive will be closed to all vehicle, bike, and pedestrian traffic between Northpoint Drive after the North Information Booth through to the RIMAC arena loading dock."
text5 = "Hello Tritons, If you are interested in becoming an actuary (or want to know more about the profession), come to the first Triton Actuarial Society (TAS) meeting of the school year! It will be taking place from 8:00 p.m. - 9:00 p.m. in AP&M 6402 on Monday, October 8, 2018. At the meeting you will get the opportunity to meet the members and learn what the pre-professional organization has in store for you this year. After the meeting, we'll head over to Convoy for some boba. It'll be a great way for you to mingle with one another, so you won't want to miss it! For more information, please join the facebook group: facebook.com/groups/tas.ucsd/ Hope to see you there!"


In [84]:
# Run the extractor on one sample announcement and print every extracted field.
text = textfav

infos, entity = getInfo(text)
print(text)
print("")
for k, c in infos.items():
    # Fields that were not found are None. Test identity with `is`, not `==`,
    # and print the literal word so the string concatenation does not fail.
    print(k + ': ' + ('None' if c is None else c))

You're receiving this email because you registered for the Fall 2018 Beginner's Programming Competition. The Winter 2019 Beginner's Programming Competition, sponsored by Microsoft and Associated Students UCSD, will be held Saturday, March 9th, from 3pm-7:30pm. We'll have some really cool prizes from Microsoft for the winners. Winners will also get to submit their resumes to Microsoft for consideration for Summer 2020 internships!

Event: Fall 2018 Beginner's Programming Competition
Date: Saturday
Time: 15:26-19:30
Location: None
