In [81]:
import datetime
import pickle
import datetime
from dateutil import parser
import requests
from requests.auth import HTTPBasicAuth
from os import environ as env
import imaplib
from email.parser import BytesParser
from email import policy
import re

import html2text

from zdbotutils import diff, mail
from zdbotutils.custom_logging import logger, logging
from config import *
from zdbotutils.mail import chunked

In [6]:
def epoch_time(d):
    epoch_start = parser.parse('1 Jan 1970 00:00:00 +0000 (UTC)')
    return int((d - epoch_start).total_seconds())

In [52]:
######################################################
# get updates in Zendesk (including suspended)
# TODO this vvv
since = None
if since:
    start_time = since
else:
#     start_time = epoch_time(datetime.datetime.now()) - 60 * 60 * 2  # 2 hours ago
#     start_time = 1608044094
    start_time = epoch_time(parser.parse('15 Dec 2020 01:00:00 +0000 (UTC)'))

In [53]:
url_template = 'https://archivesupport.zendesk.com/api/v2/incremental/ticket_events.json?start_time={}&include=comment_events'
next_page = url_template.format(start_time)
session = requests.session()

ticket_events = []
while True:
    logger.info('getting incremental ticket updates from Zendesk since {}'.format(start_time))
    response = session.get(
        next_page,
        auth=HTTPBasicAuth(env['ZENDESK_AGENT_ACCOUNT'] + "/token", env['ZENDESK_API_KEY'])
    )
    assert(response.status_code == 200), "{}: {}".format(response.status_code, response.content)
    data = response.json()
    ticket_events.extend(data['ticket_events'])
    
    if data['end_of_stream']:
        break
    next_page = data['next_page']
    start_time = data['end_time']

2020-12-15 14:51:14,490 [INFO] <ipython-input-53-f2da3089cd73>:7 > getting incremental ticket updates from Zendesk since 1607994000
2020-12-15 14:51:38,711 [INFO] <ipython-input-53-f2da3089cd73>:7 > getting incremental ticket updates from Zendesk since 1607994000
2020-12-15 14:52:16,765 [INFO] <ipython-input-53-f2da3089cd73>:7 > getting incremental ticket updates from Zendesk since 1607994000


In [54]:
zendesk_comments = []  # [(timestamp, comment, ticket_id)...]
for event in ticket_events:
    contents_found = 0  # TODO ditch this variable
    for child in event['child_events']:
        if child['event_type'].lower() == 'comment':
            contents_found += 1
            if contents_found > 1:
                logger.error('found {} content children in single event'.format(contents_found))
            zendesk_comments.append((event['timestamp'], child['body'], event['ticket_id']))

In [55]:
######################################################
# get emails from support addresses
logger.info('getting archive support emails')
server = imaplib.IMAP4_SSL(host=IMAP_SERVER, port=IMAP_PORT)
server.login(env['DIFFBOT_ADDRESS'], env['DIFFBOT_PASSWORD'])
server.select(FROM_ARCHIVE_ACCOUNTS_FOLDER)

2020-12-15 14:53:01,541 [INFO] <ipython-input-55-486bed08493b>:3 > getting archive support emails


('OK', [b'25443'])

In [56]:
# TODO status, response = server.search(None, 'ALL')
status, response = server.search(None, 'SINCE', '15-Dec-2020')
if status != 'OK':
    print('crap')
    exit(1)
# response of the form: [b'1 2 3 4']
if response[0] == b'':
    # TODO handle no emails to match against
    logger.error('no support emails to match against')

In [57]:
msg_ids = response[0].decode().split(' ')
logger.info('found {} support emails'.format(len(msg_ids)))
responses = []
for msg_ids_chunk in chunked(msg_ids, 1000):
    logger.debug('getting a message chunk')
    status, response = server.fetch(','.join(msg_ids_chunk), '(BODY[])')
    if status != 'OK':
        print('crap')
        exit(1)
    responses.extend(response)
server.logout()

2020-12-15 14:53:20,147 [INFO] <ipython-input-57-dfb5e35e89ab>:2 > found 5374 support emails


('BYE', [b'Logging out'])

In [58]:
######################################################
# process emails so they are comparable with zendesk comments

# these configurations match what we get from zendesk
html2text.config.IGNORE_TABLES = True
html2text.config.IGNORE_IMAGES = False
h = html2text.HTML2Text()
h.ignore_links = True

In [59]:
# patterns and formats
# id_pattern = re.compile(b"(\d+) \(BODY\[\] \{\d+\}")
id_pattern = re.compile(b"(\d+).+")
time_str_format = '%a,  %d %b %Y %H:%M:%S %z (%Z)'

In [61]:
# collect decorated messages [(timestamp, comment, id)...]
support_msgs = []
logger.info('parsing archive support email data')
for li in responses:

    # weird case – something isn't implemented properly in the libraries
    if li == b')':
        continue

    id_bytes, msg_bytes = li
    msg = BytesParser(policy=policy.default).parsebytes(msg_bytes)

    # get id
    msg_id = int(re.match(id_pattern, id_bytes.strip()).group(1))
    logger.debug(id_bytes)
    logger.debug(msg_id)

    # get time stamp
    time_str = msg['Received'].split(';')[-1].strip()
    time_stamp = epoch_time(parser.parse(time_str))

    # get message body
    raw = msg.get_body(preferencelist=('plain',))
    if raw is None:
        raw = msg.get_body(preferencelist=('html',))
        if raw is None:
            logger.error('Found message with no plain or html body')
            continue
    try:
        html_content = raw.get_content()
    except LookupError as e:
        logger.error(e)
        continue
    body = h.handle(raw.get_content())

    support_msgs.append((time_stamp, body, msg_id))

2020-12-15 15:05:40,683 [INFO] <ipython-input-61-d166e8f67667>:3 > parsing archive support email data
2020-12-15 15:06:20,298 [ERROR] <ipython-input-61-d166e8f67667>:32 > unknown encoding: iso-8859-8-i
2020-12-15 15:06:20,332 [ERROR] <ipython-input-61-d166e8f67667>:32 > unknown encoding: iso-8859-8-i


In [62]:
print(len(support_msgs))

5372


In [82]:
import importlib
importlib.reload(diff)

<module 'zdbotutils.diff' from '/Users/duncanhall/Documents/work - IA/ia-ps-zd-bot/src/zdbotutils/diff.py'>

In [83]:
#######################################################
# match

# zendesk_comments = pickle.load(open("zendesk_comments.pickle", "rb"))
# support_msgs = pickle.load(open("support_msgs.pickle", "rb"))
# print(len(support_msgs))

logger.info('comment matching... (this could take a while)')
results = diff.match_msgs(zendesk_comments, support_msgs)
# pickle.dump(results, open('results.pickle', 'wb'))
# results = pickle.load(open('results.pickle', 'rb'))
zd_matched, zd_unmatched, archive_matched, archive_unmatched = results

with open('inspect.txt', 'w') as f:
    f.write('\nZD MATCHED\n')
    f.write(str(zd_matched))
    f.write('\nZD UNMATCHED\n')
    f.write(str(zd_unmatched))
    f.write('\nARCHIVE MATCHED\n')
    f.write(str(archive_matched))
    f.write('\nARCHIVE UNMATCHED\n')
    f.write(str(archive_unmatched))
print("zd_matched: {}/{}".format(len(zd_matched), len(zendesk_comments)))
print("zd_unmatched: {}".format(len(zd_unmatched)))
print("archive_matched: {}/{}".format(len(archive_matched), len(support_msgs)))
print("archive_unmatched: {}".format(len(archive_unmatched)))

2020-12-15 15:37:26,747 [INFO] <ipython-input-83-40f1a8eec8ef>:8 > comment matching... (this could take a while)
2020-12-15 15:37:26,756 [INFO] diff.py:48 > Searching for match for item 0/1407
2020-12-15 15:37:26,911 [INFO] diff.py:152 > full ratio: 0.9966 – "InternetArchiveherearethedetai..." and "InternetArchiveherearethedetai..." FULL MATCH
2020-12-15 15:37:26,914 [INFO] diff.py:152 > full ratio: 0.9966 – "InternetArchiveherearethedetai..." and "InternetArchiveherearethedetai..." FULL MATCH
2020-12-15 15:37:27,490 [INFO] diff.py:48 > Searching for match for item 1/1407
2020-12-15 15:37:27,565 [INFO] diff.py:152 > full ratio: 0.9966 – "InternetArchiveherearethedetai..." and "InternetArchiveherearethedetai..." FULL MATCH
2020-12-15 15:37:27,567 [INFO] diff.py:152 > full ratio: 0.9966 – "InternetArchiveherearethedetai..." and "InternetArchiveherearethedetai..." FULL MATCH
2020-12-15 15:37:28,172 [INFO] diff.py:48 > Searching for match for item 2/1407
2020-12-15 15:37:28,389 [INFO] diff

zd_matched: 1087/1407
zd_unmatched: 319
archive_matched: 703/5372
archive_unmatched: 4624


In [95]:
server = imaplib.IMAP4_SSL(host=IMAP_SERVER, port=IMAP_PORT)
server.login(env['DIFFBOT_ADDRESS'], env['DIFFBOT_PASSWORD'])
server.select(FROM_ARCHIVE_ACCOUNTS_FOLDER)

('OK', [b'27151'])

In [100]:
#######################################################
# move matched emails
archive_matched_ids = [str(msg_id) for _, _, msg_id in archive_matched]
logger.info('moving {} matched emails'.format(len(archive_matched_ids)))
for msg_ids_chunk in chunked(archive_matched_ids, 1000):
    ids = ','.join(msg_ids_chunk)
    logger.debug(ids)
    server.copy(ids, MATCHED_ARCHIVE_FOLDER)
    server.store(ids, '+FLAGS', '\\Deleted')
server.expunge()

2020-12-15 16:21:05,888 [INFO] <ipython-input-100-7fa4f764f74d>:4 > moving 703 matched emails


('OK',
 [b'25398',
  b'25371',
  b'25334',
  b'25332',
  b'25328',
  b'25320',
  b'25316',
  b'25312',
  b'25302',
  b'25290',
  b'25254',
  b'25252',
  b'25232',
  b'25230',
  b'25226',
  b'25221',
  b'25210',
  b'25201',
  b'25196',
  b'25191',
  b'25179',
  b'25177',
  b'25172',
  b'25171',
  b'25161',
  b'25159',
  b'25149',
  b'25147',
  b'25142',
  b'25123',
  b'25122',
  b'25119',
  b'25117',
  b'25108',
  b'25099',
  b'25088',
  b'25082',
  b'25079',
  b'25065',
  b'25057',
  b'25036',
  b'25013',
  b'25007',
  b'24965',
  b'24932',
  b'24925',
  b'24903',
  b'24900',
  b'24891',
  b'24886',
  b'24869',
  b'24866',
  b'24847',
  b'24841',
  b'24837',
  b'24821',
  b'24815',
  b'24808',
  b'24801',
  b'24797',
  b'24792',
  b'24787',
  b'24782',
  b'24768',
  b'24766',
  b'24765',
  b'24759',
  b'24751',
  b'24562',
  b'24559',
  b'24556',
  b'24553',
  b'24551',
  b'24549',
  b'24545',
  b'24536',
  b'24530',
  b'24529',
  b'24525',
  b'24520',
  b'24518',
  b'24514',
  b'24513

In [101]:
#######################################################
# log old unmatched ticket comments
cutoff = datetime.datetime.now().timestamp() - MINUTES_GRACE_PERIOD * 60
old_zd_unmatched = {(t, c, ticket_id) for t, c, ticket_id in zd_unmatched if t < cutoff}
logger.info('logging {} old unmatched zendesk comments'.format(len(old_zd_unmatched)))
with open('zd_unmatched_log.txt', 'w') as f:
    for t, c, t_id in old_zd_unmatched:
        f.write("""
Ticket #{}
Time: {}
Comment:
{}
        """.format(t_id, str(datetime.datetime.fromtimestamp(t)), c))

2020-12-15 16:21:10,071 [INFO] <ipython-input-101-a4c93ff4325c>:5 > logging 319 old unmatched zendesk comments


In [102]:
#######################################################
# move old emails to unmatched
cutoff = datetime.datetime.now().timestamp() - MINUTES_GRACE_PERIOD * 60
old_archive_unmatched_ids = [str(msg_id) for t, _, msg_id in archive_unmatched if t < cutoff]
logger.info('moving {} old unmatched archive emails'.format(len(old_archive_unmatched_ids)))
for msg_ids_chunk in chunked(old_archive_unmatched_ids, 1000):
    ids = ','.join(msg_ids_chunk)
    server.copy(ids, UNMATCHED_ARCHIVE_FOLDER)
    server.store(ids, '+FLAGS', '\\Deleted')
server.expunge()

2020-12-15 16:21:12,055 [INFO] <ipython-input-102-4869ecdbfc07>:5 > moving 4624 old unmatched archive emails


('OK',
 [b'25397',
  b'25396',
  b'25395',
  b'25394',
  b'25393',
  b'25392',
  b'25391',
  b'25390',
  b'25389',
  b'25388',
  b'25387',
  b'25386',
  b'25385',
  b'25384',
  b'25383',
  b'25382',
  b'25381',
  b'25380',
  b'25379',
  b'25378',
  b'25377',
  b'25376',
  b'25375',
  b'25374',
  b'25373',
  b'25372',
  b'25370',
  b'25369',
  b'25368',
  b'25367',
  b'25366',
  b'25365',
  b'25364',
  b'25363',
  b'25362',
  b'25361',
  b'25360',
  b'25359',
  b'25358',
  b'25357',
  b'25356',
  b'25355',
  b'25354',
  b'25353',
  b'25352',
  b'25351',
  b'25350',
  b'25349',
  b'25348',
  b'25347',
  b'25346',
  b'25345',
  b'25344',
  b'25343',
  b'25342',
  b'25341',
  b'25340',
  b'25339',
  b'25338',
  b'25337',
  b'25336',
  b'25335',
  b'25333',
  b'25331',
  b'25330',
  b'25329',
  b'25327',
  b'25326',
  b'25325',
  b'25324',
  b'25323',
  b'25322',
  b'25321',
  b'25319',
  b'25318',
  b'25317',
  b'25315',
  b'25314',
  b'25313',
  b'25311',
  b'25310',
  b'25309',
  b'25308

In [None]:
server.logout()

In [111]:
# r = server.search(None, 'BEFORE', '15-Dec-2020')

In [114]:
# mids = r[1][0].decode().split(' ')
# for msg_ids_chunk in chunked(mids, 1000):
#     ids = ','.join(msg_ids_chunk)
#     print(len(ids))
#     server.store(ids, '+FLAGS', '\\Deleted')
# server.expunge()