In [682]:
import json
import requests
from dateutil.parser import parse as parse_date
import datetime
import re
import random
from os import listdir
from os.path import isfile, join
import sys
import chardet
import html2text
import email

In [173]:
# Matches any run of four digits whose first digit is 1 or 2 and whose second
# digit is 0 or 9 (e.g. "1995", "2018"). Presumably meant to spot years; it is
# not referenced by any of the visible cells — TODO confirm it is still needed.
re_year = re.compile('[12][09][0-9]{2}')

## Parsing email

In [594]:
def html_to_text(html):
    """Convert an HTML string to text with html2text.

    html: markup as a str.
    Returns the result of HTML2Text.handle(html) with the converter's
    ignore_links and ignore_images options switched on.
    """
    converter = html2text.HTML2Text()
    converter.ignore_links = True
    converter.ignore_images = True
    return converter.handle(html)

In [593]:
def parse_singlepart_text_message(msg):
    """Extract the text of a single-part message as a stripped string.

    msg: an email.message.Message-like object that is not multipart.
    Returns the decoded body for text/plain, the html_to_text rendering of the
    decoded body for text/html, and "" for every other content type.
    Raises Exception if msg is multipart.
    """
    if msg.is_multipart():
        raise Exception("Cannot run function parse_singlepart_text_message on a multipart message.")

    content_type = msg.get_content_type()
    if content_type not in ('text/plain', 'text/html'):
        # Images, attachments, calendar parts, ... carry no body text we use.
        return ""

    raw = msg.get_payload(decode=True)
    if raw is None:
        # A message object without any payload set.
        return ""

    charset = msg.get_content_charset() or 'utf8'
    try:
        decoded = str(raw, str(charset), "ignore")
    except LookupError:
        # The header declares a charset Python has no codec for; fall back to
        # UTF-8 (undecodable bytes are dropped) instead of failing the email.
        decoded = str(raw, 'utf8', "ignore")

    if content_type == 'text/html':
        decoded = html_to_text(decoded.strip())
    return decoded.strip()

In [684]:
def get_text(email_object):
    """Recursively collect the readable text of an email message.

    email_object: an email.message.Message, possibly multipart.
    Returns a str:
      * not multipart: parse_singlepart_text_message(msg);
      * multipart/mixed or multipart/related: concatenation of get_text over
        the parts, in order;
      * multipart/alternative: the first text/html part if present, otherwise
        the first multipart/related part, otherwise the first text/plain part,
        otherwise "".
    Raises Exception for any other multipart subtype.
    """
    msg = email_object
    content_type = msg.get_content_type()

    # Checked first on purpose: a message may *declare* a multipart type yet
    # hold a plain string payload (e.g. the boundary was missing), and such a
    # payload must never be iterated as if it were a list of parts.
    if not msg.is_multipart():
        return parse_singlepart_text_message(msg)

    parts = msg.get_payload()

    if content_type in ('multipart/mixed', 'multipart/related'):
        return "".join(get_text(part) for part in parts)

    if content_type == 'multipart/alternative':
        first_of_type = {}
        for part in parts:
            first_of_type.setdefault(part.get_content_type(), part)
        if 'text/html' in first_of_type:
            return parse_singlepart_text_message(first_of_type['text/html'])
        if 'multipart/related' in first_of_type:
            return get_text(first_of_type['multipart/related'])
        if 'text/plain' in first_of_type:
            return parse_singlepart_text_message(first_of_type['text/plain'])
        return ""

    raise Exception("haven't accounted for this content type " + content_type)

In [676]:
def structurize(email):
    """Reduce a parsed email message to the dict shape the rest of the notebook uses.

    email: object supporting email['date'] / email['subject'] and accepted by get_text.
    Returns {"date", "subject", "payload"} where payload is get_text(email)
    with every run of newlines, carriage returns and tabs replaced by one "\\n".
    """
    record = {}
    record["date"] = email['date']
    record["subject"] = email['subject']
    body = get_text(email)
    record["payload"] = re.sub("[\n\r\t]{1,}", "\n", body)
    return record

## Read file

In [174]:
def read_json_file(fname):
    """Read the UTF-8 text file `fname` and return its content parsed as JSON."""
    with open(fname, "r", encoding='utf-8') as handle:
        raw = handle.read()
    parsed = json.loads(raw)
    return parsed
def dump_as_json(data, filename):
    """Write `data` to `filename` as JSON indented by 4 spaces, keys left in their existing order."""
    with open(filename, 'w') as out:
        serialized = json.dumps(data, indent=4, sort_keys=False)
        out.write(serialized)

In [360]:
# Load the two datasets from JSON files in the working directory. Later cells
# read each item through its 'email' and 'meetings' keys.
train = read_json_file('train_without_durations.json')
test = read_json_file('test_without_durations.json')

## Duckling

In [172]:
# Endpoint that duckle_it posts to.
# NOTE(review): hard-coded private address (presumably a local Docker host) —
# make this configurable before sharing the notebook.
duckling_url = 'http://192.168.99.100:8000/parse'
# Request headers sent with every Duckling call (form-encoded body).
headers = {'content-type': 'application/x-www-form-urlencoded'}

In [529]:
def filter_node(node):
    """Tell whether a Duckling node should be kept.

    node: dict with 'dim' and 'body' keys.
    Returns True only for nodes of dimension 'time' or 'duration' whose
    matched text is not exactly 'summer' or 'now'.
    """
    if node['dim'] not in ['time', 'duration']:
        return False
    return node['body'] not in ['summer', 'now']

def subj_payload_to_text(email):
    """Join an email's subject and payload into one single-line string.

    email: mapping with str values under 'subject' and 'payload'.
    Returns subject + newline + payload with every run of whitespace
    (spaces, tabs, newlines, ...) collapsed to a single space.
    """
    # Raw string: "\s" inside a normal string literal is an invalid escape
    # sequence. r"\s+" matches exactly what "[\n\s]{1,}" did.
    return re.sub(r"\s+", " ", email['subject'] + '\n' + email['payload'])

In [596]:
def duckle_it(email):
    """Ask Duckling for the time/duration expressions in an email.

    email: mapping with 'date', 'subject' and 'payload'.
    Posts the flattened subject+payload text to duckling_url, using the email's
    date (as epoch milliseconds) as the reference time, and returns the list of
    response nodes accepted by filter_node.
    """
    reference = parse_date(email['date'])
    # Timezone-aware timestamps later than 12:xx are moved back by 12 hours
    # before being used as reference time.
    # NOTE(review): the reason for this shift is not visible here — confirm.
    if reference.tzinfo is not None and reference.hour > 12:
        reference = reference.replace(hour=reference.hour - 12)

    form = {
        'reftime': int(reference.timestamp() * 1000),
        'text': subj_payload_to_text(email),
    }
    response = requests.post(duckling_url, data=form, headers=headers)
    return [node for node in response.json() if filter_node(node)]

In [597]:
# Try the Duckling wrapper on the first training email (needs the service at duckling_url).
duckle_it(train[0]['email'])

[{'body': 'Wednesday August 15, 2001',
  'start': 99,
  'value': {'values': [{'value': '2001-08-15T00:00:00.000-07:00',
     'grain': 'day',
     'type': 'value'}],
   'value': '2001-08-15T00:00:00.000-07:00',
   'grain': 'day',
   'type': 'value'},
  'end': 124,
  'dim': 'time',
  'latent': False},
 {'body': '3:00pm - 4:00pm',
  'start': 131,
  'value': {'values': [{'to': {'value': '2001-08-06T16:01:00.000-07:00',
      'grain': 'minute'},
     'from': {'value': '2001-08-06T15:00:00.000-07:00', 'grain': 'minute'},
     'type': 'interval'},
    {'to': {'value': '2001-08-07T16:01:00.000-07:00', 'grain': 'minute'},
     'from': {'value': '2001-08-07T15:00:00.000-07:00', 'grain': 'minute'},
     'type': 'interval'},
    {'to': {'value': '2001-08-08T16:01:00.000-07:00', 'grain': 'minute'},
     'from': {'value': '2001-08-08T15:00:00.000-07:00', 'grain': 'minute'},
     'type': 'interval'}],
   'to': {'value': '2001-08-06T16:01:00.000-07:00', 'grain': 'minute'},
   'from': {'value': '2001-0

In [728]:
class Date_interval:
    """A time span found in an email plus where in the text it was found.

    The constructor stores its arguments unchanged:
      start_date, end_date -- datetime-like objects (can_be_replaced calls .date() on them)
      start_pos, end_pos   -- positions of the matched text
      grain                -- granularity label such as 'day' or 'minute'
    """

    def __init__(self, start_date, end_date, start_pos, end_pos, grain):
        self.start_date, self.end_date = start_date, end_date
        self.start_pos, self.end_pos = start_pos, end_pos
        self.grain = grain

    def can_be_replaced(self, interval):
        """Return True when `interval` is a finer-grained version of this one.

        That is: both start on the same calendar day, both end on the same
        calendar day, this interval's grain is day/year/month and the other's
        is minute/hour. Otherwise False.
        """
        if self.start_date.date() != interval.start_date.date():
            return False
        if self.end_date.date() != interval.end_date.date():
            return False
        return self.grain in ['day', 'year', 'month'] and interval.grain in ['minute', 'hour']

    def print_self(self):
        """Print start, end, text positions and grain, one item per line."""
        positions = "[" + str(self.start_pos) + ', ' + str(self.end_pos) + ']'
        print("Start: " + str(self.start_date))
        print("End: " + str(self.end_date))
        print(positions)
        print("Grain:" + self.grain)
        
class Location:
    """A location mention: its text and the start/end positions it was found at."""

    def __init__(self, loc, start_pos, end_pos):
        self.text = loc
        self.start_pos, self.end_pos = start_pos, end_pos

    def print_self(self):
        """Print the location text on one line, prefixed with 'Location: '."""
        label = str(self.text)
        print("Location: " + label)

## Updating the spaCy model for location parsing

In [26]:
# NOTE(review): this import sits mid-notebook; on a fresh kernel this cell must
# run before any cell that uses `nlp`.
import spacy
# Load the pretrained English pipeline. The cells below update its NER
# component, and parse_locs reads 'LOC' entities from it.
nlp = spacy.load('en_core_web_md')

In [44]:
def meetings_contains_loc(d):
    """Return True if at least one entry of d['meetings'] has a non-empty 'loc'."""
    return any(meeting['loc'] != '' for meeting in d['meetings'])

In [161]:
# Training items in which at least one meeting has a labelled location.
with_loc = [item for item in train if meetings_contains_loc(item)]

In [162]:
def annotate_loc(d):
    """Build one NER training example from a labelled email.

    d: dict with d['email']['subject'], d['email']['payload'] and d['meetings'],
       a list of dicts that each hold a 'loc' string.
    Returns (text, {'entities': [(start, end, 'LOC'), ...]}). text is
    subject + newline + payload with whitespace runs collapsed to single
    spaces; each span marks the first occurrence of a meeting location in text.

    Locations that do not occur in text are printed and skipped. Empty
    locations and spans already recorded are skipped silently.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", d['email']['subject'] + '\n' + d['email']['payload'])
    spans = []
    for meeting in d['meetings']:
        loc = meeting['loc']
        if loc == '':
            # ''.find would "match" at offset 0 and yield a zero-length entity.
            continue
        start = text.find(loc)
        if start == -1:
            print(loc)
            continue
        span = (start, start + len(loc), 'LOC')
        if span not in spans:
            # Several meetings at the same place must not produce duplicate spans.
            spans.append(span)
    return (text, {'entities': spans})

In [163]:
# One (text, annotations) training pair per email in with_loc. annotate_loc
# prints any labelled location it cannot find in the text (see output below).
train_data = list(map(annotate_loc, with_loc))

Plaza Club, 910 Louisiana, Solarium Room


In [103]:
# Update only the 'ner' component: every other pipe is disabled inside this block.
with nlp.disable_pipes(*[pipe for pipe in nlp.pipe_names if pipe != 'ner']):
    # NOTE(review): for updating an already-trained model, spaCy 2.1+ documents
    # nlp.resume_training(); begin_training() may re-initialise weights —
    # confirm against the installed spaCy version.
    optimizer = nlp.begin_training()
    # 10 passes over the data, reshuffled in place before each pass. No random
    # seed is set in the visible cells, so results vary between runs.
    for i in range(10):
        random.shuffle(train_data)
        for text, annotations in train_data:
            nlp.update([text], [annotations], sgd=optimizer)

In [104]:
# Persist the updated pipeline.
# NOTE(review): '/model' is an absolute path at the filesystem root, and no
# visible cell loads the model back from it.
nlp.to_disk('/model')

## Parsing dates

In [205]:
def update_date(node, check_date, sent_date, previous_dates):
    """Re-anchor a time that landed on the sending day onto the last date seen.

    node: Duckling node; only node['body'] is read.
    check_date: datetime parsed from the node.
    sent_date: datetime the email was sent.
    previous_dates: datetimes accepted so far, in order.

    Returns previous_dates[-1] with its hour/minute/second replaced by those of
    check_date when check_date falls on the sending day, the node text does not
    contain 'today' (case-insensitive) and previous_dates is non-empty.
    In every other case check_date is returned unchanged.
    """
    if check_date.date() != sent_date.date():
        return check_date
    if 'today' in node['body'].lower():
        return check_date
    if len(previous_dates) == 0:
        return check_date
    anchor = previous_dates[-1]
    return anchor.replace(hour=check_date.hour,
                          minute=check_date.minute,
                          second=check_date.second)

In [214]:
def default_offset(date):
    """Return `date` moved one hour forward (used as the end of intervals that have no explicit end)."""
    one_hour = datetime.timedelta(hours=1)
    return date + one_hour

In [671]:
def process_date_node(node, previous_dates, sent_date):
    """Convert a Duckling 'time' node into a Date_interval.

    node: dict with 'start', 'end' and 'value'; node['value']['type'] is
          'value' (single moment) or 'interval' ('from' and/or 'to').
    previous_dates, sent_date: passed through to update_date.

    Returns a Date_interval whose positions are node['start'] / node['end']:
      * 'value': start is the adjusted moment, end is None, grain is the node's grain;
      * 'interval' without 'from': start, end and grain are all None;
      * 'interval' with 'from': start/end are the adjusted bounds (end is None
        when there is no 'to'), grain is the grain of 'from'.
    Raises Exception for any other type.
    """
    value = node['value']
    kind = value['type']

    if kind == 'value':
        moment = update_date(node, parse_date(value['value']), sent_date, previous_dates)
        return Date_interval(moment, None, node['start'], node['end'], value['grain'])

    if kind != 'interval':
        raise Exception("Unexpected type" + kind)

    lower = value.get('from')
    if lower is None:
        return Date_interval(None, None, node['start'], node['end'], None)

    start = parse_date(lower['value'])
    upper = value.get('to')
    end = None
    if upper is not None:
        end = update_date(node, parse_date(upper['value']), sent_date, previous_dates)
        # An end minute that is not a multiple of 5 is pulled back by one
        # minute (the sample output shows Duckling ends such as 16:01 for "4:00pm").
        if end.minute % 5 != 0:
            end = end - datetime.timedelta(minutes=1)
    start = update_date(node, start, sent_date, previous_dates)
    return Date_interval(start, end, node['start'], node['end'], lower['grain'])

In [669]:
def process_duration_node(node, previous_dates, sent_date):
    """Turn a Duckling 'duration' node into a Date_interval.

    The interval starts at the last entry of previous_dates (or at sent_date
    when that list is empty) and lasts node['value']['normalized']['value']
    seconds. Positions come from node['start'] / node['end'] and the grain
    is node['value']['unit'].
    """
    if len(previous_dates) == 0:
        anchor = sent_date
    else:
        anchor = previous_dates[-1]
    length = datetime.timedelta(seconds=node['value']['normalized']['value'])
    return Date_interval(anchor, anchor + length, node['start'], node['end'], node['value']['unit'])

In [723]:
def filter_redundant_intervals(intervals):
    """Drop intervals that repeat or merely coarsen their kept predecessor.

    Each interval is compared only with the most recently kept one:
      * identical start_date and end_date -> dropped;
      * kept.can_be_replaced(new)         -> new takes the kept one's place;
      * new.can_be_replaced(kept)         -> new is dropped;
      * otherwise                         -> new is appended.
    Returns the kept intervals as a new list, in order.
    """
    kept = []
    for candidate in intervals:
        if not kept:
            kept.append(candidate)
            continue
        last = kept[-1]
        if candidate.start_date == last.start_date and candidate.end_date == last.end_date:
            continue
        if last.can_be_replaced(candidate):
            kept[-1] = candidate
        elif not candidate.can_be_replaced(last):
            kept.append(candidate)
    return kept

In [647]:
def get_intervals(email):
    """Extract candidate meeting intervals from one email.

    email: mapping with 'date', 'subject' and 'payload' (as consumed by duckle_it).
    Returns a list of Date_interval objects whose start_date and end_date are
    both set and carry the tzinfo of the email's sent date.

    'time' nodes are processed in order; 'duration' nodes are collected and
    resolved afterwards, once every accepted start date is known. The combined
    list is passed through filter_redundant_intervals.
    """
    nodes = duckle_it(email)
    dates = []       # accepted start dates so far; later nodes are anchored on these
    intervals = []
    durations = []   # duration nodes, handled after the loop
    sent_date = parse_date(email['date'])
    for node in nodes:
        if node['dim'] == 'duration':
            durations.append(node)
        elif node['dim'] == 'time':
            interval = process_date_node(node, dates, sent_date)
            # process_date_node returns an interval without start for open-ended ("until ...") nodes.
            if interval.start_date == None:
                continue
            # Overwrite (not convert) the timezone with the sender's so comparisons below are like-for-like.
            interval.start_date = interval.start_date.replace(tzinfo = sent_date.tzinfo)
            # Hours before 7 are shifted by 12, presumably treating them as afternoon/evening times.
            # As the sample output shows, a day-grain date (midnight) therefore becomes 12:00.
            if (interval.start_date.hour < 7):
                interval.start_date = interval.start_date.replace(hour = interval.start_date.hour + 12)
            # Anything that starts before the email was sent is ignored.
            if (interval.start_date < sent_date):
                continue
            dates.append(interval.start_date)
            if interval.end_date == None:
                # No explicit end: fall back to the default offset from the start.
                interval.end_date = default_offset(interval.start_date)
            else:
                if (interval.end_date.hour < 7):
                    interval.end_date = interval.end_date.replace(hour = interval.end_date.hour + 12)
            intervals.append(interval)
    # Durations start at the last accepted date (or at sent_date if there is none).
    intervals += [process_duration_node(n, dates, sent_date) for n in durations]
    intervals = filter_redundant_intervals(intervals)
    # Final pass: stamp the sender's tzinfo on both ends of every surviving interval.
    for interv in intervals:
        interv.start_date = interv.start_date.replace(tzinfo = sent_date.tzinfo)
        interv.end_date = interv.end_date.replace(tzinfo = sent_date.tzinfo)
    return intervals

In [601]:
# Show the intervals extracted from the first training email.
for x in get_intervals(train[0]['email']):
    x.print_self()

Start: 2001-08-15 12:00:00-07:00
End: 2001-08-15 13:00:00-07:00
[99, 124]
Start: 2001-08-15 15:00:00-07:00
End: 2001-08-15 16:00:00-07:00
[131, 146]


## Parsing locations 

In [567]:
def parse_locs(email):
    """Run the NER pipeline over the flattened email text and return its 'LOC' entities.

    Returns a list of Location objects (entity text, start_char, end_char),
    in the order the pipeline reports them.
    """
    text = subj_payload_to_text(email)
    found = []
    for ent in nlp(text).ents:
        if ent.label_ == 'LOC':
            found.append(Location(ent.text, ent.start_char, ent.end_char))
    return found

In [563]:
def get_closest_loc(interval, locs):
    """Pick the location whose midpoint in the text is nearest to the interval's midpoint.

    interval: object with start_pos / end_pos.
    locs: list of objects with start_pos / end_pos.
    Returns None for an empty list; on equal distance the earlier list entry wins.
    """
    if len(locs) == 0:
        return None

    target = (interval.start_pos + interval.end_pos) / 2

    def gap(loc):
        return abs(target - (loc.start_pos + loc.end_pos) / 2)

    # min() keeps the first of several equally distant candidates.
    return min(locs, key=gap)

## Evaluation

In [611]:
def _recall_percent(right_count, all_count):
    """Return right_count / all_count as a percentage string, or 'n/a' when all_count is 0."""
    if all_count == 0:
        return 'n/a'
    return str(right_count / all_count * 100)


def evaluation(data):
    """Print date, subject and location recall of the parser over a labelled dataset.

    data: iterable of dicts with 'email' (mapping with 'date', 'subject',
          'payload') and 'meetings' (list of dicts with 'start', 'end',
          'subj', 'loc').

    For every labelled meeting:
      * the date counts as found when (start, end), stamped with the sent
        date's tzinfo, equals the bounds of some interval from get_intervals;
      * the subject counts as found when it is a substring of the email subject;
      * the location (only when labelled, i.e. non-empty) counts as found when
        it equals the text of some entity from parse_locs.
    Prints three lines, each a percentage, or 'n/a' when there was nothing to
    score (e.g. empty data, or no labelled locations).
    """
    right_date_count = 0
    right_subj_count = 0
    right_loc_count = 0
    meeting_count = 0
    all_loc_count = 0
    for d in data:
        email = d['email']
        intervals = get_intervals(email)
        parsed_subj = email['subject']
        parsed_locs = parse_locs(email)
        # Per-email values, computed once instead of once per meeting.
        sent_tz = parse_date(email['date']).tzinfo
        found_spans = [(x.start_date, x.end_date) for x in intervals]
        found_locs = [l.text for l in parsed_locs]
        for m in d['meetings']:
            d_start = parse_date(m['start']).replace(tzinfo=sent_tz)
            d_end = parse_date(m['end']).replace(tzinfo=sent_tz)
            if (d_start, d_end) in found_spans:
                right_date_count += 1
            if m['subj'] in parsed_subj:
                right_subj_count += 1
            if m['loc'] != '':
                all_loc_count += 1
                if m['loc'] in found_locs:
                    right_loc_count += 1
            meeting_count += 1

    print('Dates recall: ' + _recall_percent(right_date_count, meeting_count))
    print('Subj recall: ' + _recall_percent(right_subj_count, meeting_count))
    print('Locs recall: ' + _recall_percent(right_loc_count, all_loc_count))

In [724]:
# Recall on the training set (the same emails the NER component was updated on).
evaluation(train)

Dates recall: 91.30434782608695
Subj recall: 78.26086956521739
Locs recall: 89.04109589041096


In [725]:
# Recall on the test set.
evaluation(test)

Dates recall: 100.0
Subj recall: 78.125
Locs recall: 64.0


In [615]:
def parse(email):
    """Extract meetings from one email.

    Returns one dict per interval from get_intervals, with keys
      "subj"        -- the email subject (same for every meeting),
      "time_interv" -- the interval object,
      "loc"         -- the location nearest to the interval in the text
                       (get_closest_loc result; None if no location was found).
    """
    spans = get_intervals(email)
    subject = email['subject']
    places = parse_locs(email)
    return [
        {"subj": subject, "time_interv": span, "loc": get_closest_loc(span, places)}
        for span in spans
    ]

In [635]:
def full_check(emails):
    """Print a combined score of the full parser over a labelled dataset.

    emails: iterable of dicts with 'email' and 'meetings' (dicts with 'start',
            'end', 'subj', 'loc').

    Each labelled meeting is matched against the first parsed meeting with the
    same start and end (parsed times are compared with their tzinfo removed).
    A match scores 0.65, plus 0.2 if the labelled subject is a substring of the
    parsed one, plus 0.15 if the labelled location is empty or equals the
    parsed location text. Unmatched meetings score 0.
    Prints the mean score as a percentage, or 'n/a' when there are no labelled
    meetings at all.
    """
    current_score = 0
    max_score = 0
    for email in emails:
        parsed_meetings = parse(email['email'])
        for m in email['meetings']:
            # Parsed once per labelled meeting rather than once per candidate.
            expected_start = parse_date(m['start'])
            expected_end = parse_date(m['end'])
            score = 0
            for parsed_m in parsed_meetings:
                interv = parsed_m['time_interv']
                if expected_start == interv.start_date.replace(tzinfo=None) \
                        and expected_end == interv.end_date.replace(tzinfo=None):
                    score += 0.65
                    score += 0.2 if m['subj'] in parsed_m['subj'] else 0
                    score += 0.15 if m['loc'] == '' or (parsed_m['loc'] is not None and m['loc'] == parsed_m['loc'].text) else 0
                    break
            current_score += score
            max_score += 1

    if max_score == 0:
        # Nothing to score: avoid dividing by zero.
        print('Recall: n/a')
        return
    print('Recall: ' + str(current_score/max_score*100))

In [674]:
# Combined score on the training set.
full_check(train)

Recall: 86.25


In [675]:
# Combined score on the test set.
full_check(test)

Recall: 91.40625000000003


In [711]:
def demo_print(email):
    """Print an email followed by every meeting parsed from it.

    email: mapping with 'subject' and 'payload' (plus whatever parse needs).
    For each parsed meeting the subject, the interval (via its print_self) and
    the location are printed. A meeting for which no location was found
    (loc is None) is reported as "Location: (none found)".
    """
    print(email['subject'])
    print(email['payload'])
    print()
    print("PARSED")
    print("###############")
    for m in parse(email):
        print("Subject: " + m['subj'])
        m['time_interv'].print_self()
        if m['loc'] is not None:
            m['loc'].print_self()
        else:
            # get_closest_loc returns None when the email contains no location.
            print("Location: (none found)")
        print()

In [697]:
def process_from_file(fname):
    """Read a raw email from the UTF-8 file `fname`, echo it, and return its structurize() dict.

    The whole file content (headers included) is printed before parsing.
    """
    with open(fname, encoding='utf-8') as source:
        raw = source.read()
    print(raw)
    message = email.message_from_string(raw)
    return structurize(message)

In [731]:
# Raw string: "\D" and "\o" are not valid escape sequences in a normal string
# literal. The resulting path is unchanged.
# NOTE(review): absolute path on one machine — point this at a file that ships
# with the notebook before sharing it.
mail = process_from_file(r"D:\Downloads\original_msg.txt")

Delivered-To: sergeypuhl1995@gmail.com
Received: by 2002:a4f:8541:0:0:0:0:0 with SMTP id h62-v6csp1330116ivd;
        Wed, 23 May 2018 13:04:40 -0700 (PDT)
X-Google-Smtp-Source: AB8JxZr94kbXxJ84NLyJ6fo6q6pcXF1BPdJ4cZBgui0j+mKXw8zgrl2ssV9PkWcd4m8ULttL8Ina
X-Received: by 2002:a17:902:a60d:: with SMTP id u13-v6mr4326109plq.40.1527105880388;
        Wed, 23 May 2018 13:04:40 -0700 (PDT)
ARC-Seal: i=1; a=rsa-sha256; t=1527105880; cv=none;
        d=google.com; s=arc-20160816;
        b=MRjp/hwHOjbheVu8/2hoUKfucqPJHbipq1gsDKsPcO8N/Uf4oqiCwcL9lrX8Lna8fP
         krL2n6zY+MCmnoFPAlnXhgmUbz9BOLfse4oHJoL436hsfCTyugH2/TDPLWan7mhg9eAo
         PUITdc5+/LB5gFBVxOlmlCQfQX03sRuxtFoEBygyCFYBRRM8fjUlA7OR69NDGa9t8E7S
         0pGj3fFL2cK13ggAH8JYuLtFnBmotPkTLP5kIr4yXg4fEquiVsQevL7ozVUV9Q1rV49s
         orDSCGFYUjJlZbgiihuvpzd7rzUX9eagPRjrktBR8Mf+99XrQwJW4ruGUuBPZmQtV9S0
         ueWw==
ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816;
        h=mime-version:spamd




In [732]:
# Parse the email loaded from disk above and show the result.
demo_print(mail)

REMINDER: Intetics & Kievfprog Meetup "Modern advances in programming
 languages"
Dear friends,
Thank you for your registration.
The meetup will be held:
**Date:** May 24 (Thursday)
18.30-22.00
**Venue:** Creative Hub **IZONE** ,
Naberezhno-Lugova st. 8, Kyiv
Agenda:
** **
|
** **  
  
---|---  
  
_18.30-19.00_
|
Registration. Welcoming coffee  
  
_19.00-_ _19._ _45_
|
Flow - functional language for building cross-platform UI
**Speaker:** ASGER ALSTRUP PALM
The founder of FLOW programming language  
  
_19._ _45 -20._ _00_
|
Q/As  
  
_20._ _00 \- 2_ _0._ _45_
|
Algebraic Subtyping
**Speaker:** Alexander Nemish
The founder of Lasca programming language  
  
_20._ _45 -21._ _00_
|
Q/As  
  
_21._ _00 -21._ _15_
|
Promotion activity from Intetics  
  
_21._ _15 -22.00_
|
Beer&Burger Networking  
  
Get a chance to win A DRONE from Intetics!
**Intetics Inc.** ****

PARSED
###############
Subject: REMINDER: Intetics & Kievfprog Meetup "Modern advances in programming
 languages"
Start: 20

In [733]:
# Same demo on the first training email.
demo_print(train[0]['email'])

Storage Valuation Meeting
Please plan on attending meeting to discuss Storage Valuation, on:
Date:    Wednesday August 15, 2001
Time:     3:00pm - 4:00pm
Place:    EB32c2
Thank You,
Vanessa Berrios
Administrative Assistant 
for Rita Wynne & Mark McClure
(713)345-4197

PARSED
###############
Subject: Storage Valuation Meeting
Start: 2001-08-15 15:00:00-07:00
End: 2001-08-15 16:00:00-07:00
[131, 146]
Grain:minute
Location: EB32c2



In [734]:
# Same demo on one email from the test set.
demo_print(test[14]['email'])

Grammarly AI-NLP Club #4 - Successful registration
Good morning! We are glad to confirm your registration for Understanding and Assessing Language with Neural Network Models
<https://www.facebook.com/events/173479380101668/>!
When: Tuesday, March 20, 7 PM
Language: English
Duration: 1 hour, followed by networking opportunities and snacks
Join us for our fourth Grammarly AI-NLP Club on Tuesday, March 20 at
Gulliver Business Center, Tower B, floor 14.
Please, remember to bring your ID (a passport or a driver's license).
You'll need it to enter the business center. And don't forget to pick up a
bracelet pass at the reception desk on the first floor!
*If your plans change, we would appreciate it if you let us know. We will
give your seat to someone on the waiting list.*
Contact us:
Call: 063 441 12 58
Email: viktoria.kolomiets@grammarly.com
Follow us:
Facebook <https://www.facebook.com/GrammarlyKyiv/>
Instagram <https://www.instagram.com/grammarly_kyiv/>
See you soon!
*Viktoria, *
*Grammar