# Install the python google api client
### API code and install instructions link https://github.com/googleapis/google-api-python-client
### See API docs link below

In [None]:
!pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib

In [None]:
from __future__ import print_function

import os.path
import pickle

from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient import errors
from googleapiclient.discovery import build

# Create the service needed to get access to gmail
## This will open a new browser tab asking for access to your Gmail account and will create a token.pickle file; delete that file if you later change the SCOPES variable
## You should also have gotten a credentials.json from https://developers.google.com/gmail/api/quickstart/python
### Note on token.pickle: The file token.pickle stores the user's access and refresh tokens, and is created automatically when the authorization flow completes for the first time.

In [None]:
# Basic usage of the Gmail API: authorize, build the service, list the labels.
# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

# token.pickle caches the user's access and refresh tokens; it is written
# below once the authorization flow has completed.
creds = None
if os.path.exists('token.pickle'):
    with open('token.pickle', 'rb') as token:
        # NOTE(review): unpickling runs whatever the file contains, so only
        # use a token.pickle that this notebook created itself.
        creds = pickle.load(token)

# Without usable credentials, either refresh them or let the user log in.
if not (creds and creds.valid):
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file(
            'credentials.json', SCOPES)
        creds = flow.run_local_server()
    # Persist the (new or refreshed) credentials for the next run.
    with open('token.pickle', 'wb') as token:
        pickle.dump(creds, token)

service = build('gmail', 'v1', credentials=creds)

# Call the Gmail API and show the name of every label of the account.
results = service.users().labels().list(userId='me').execute()
labels = results.get('labels', [])

if labels:
    print('Labels:')
    for label in labels:
        print(label['name'])
else:
    print('No labels found.')

# Google Gmail API v1
## https://developers.google.com/gmail/api/v1/reference/users
### Sample query for getting mail between two dates: "in:sent after:2018/11/01 before:2019/04/01"

In [None]:
# https://developers.google.com/gmail/api/guides/filtering
# Warning: All dates used in the search query are interpreted as midnight on that date in the PST timezone.
# To specify accurate dates for other timezones pass the value in seconds instead: 
# ?q=in:sent after:1388552400 before:1391230800
# GET https://www.googleapis.com/gmail/v1/users/me/messages?q=in:sent after:2014/01/01 before:2014/02/01

# results = service.users().messages().list(userId='me').execute()
# print(results)

def GetMessage(service, user_id, msg_id, snippet = True):
    """Get a Message with given ID.

    https://developers.google.com/gmail/api/v1/reference/users/messages/get

    Args:
      service: Authorized Gmail API service instance.
      user_id: User's email address. The special value "me"
        can be used to indicate the authenticated user.
      msg_id: The ID of the Message required.
      snippet: When true (the default), print the message's 'snippet'
        field before returning.

    Returns:
      A Message, or None when the API call raised an HttpError (the error
      is printed instead of being propagated).
    """
    try:
        message = service.users().messages().get(userId=user_id, id=msg_id).execute()
        if snippet:
            print('Message snippet: %s' % message['snippet'])

        return message
    # "except ... as ..." is valid on Python 2.6+ and Python 3; the older
    # comma form is a syntax error on Python 3. `errors` is the
    # googleapiclient.errors module imported at the top of the file.
    except errors.HttpError as error:
        print('An error occurred: %s' % error)
        return None


def ListMessagesMatchingQuery(service, user_id, query=''):
    """List all Messages of the user's mailbox matching the query.

    Follows 'nextPageToken' until the last page has been read, so the
    result covers every page and not just the first one.

    Args:
      service: Authorized Gmail API service instance.
      user_id: User's email address. The special value "me"
        can be used to indicate the authenticated user.
      query: String used to filter messages returned.
        Eg.- 'from:user@some_domain.com' for Messages from a particular sender.

    Returns:
      List of Messages that match the criteria of the query. Note that the
      returned list contains Message IDs, you must use get with the
      appropriate ID to get the details of a Message. Returns None when the
      API raised an HttpError (the error is printed instead of propagated).
    """
    try:
        messages = []
        # The first request carries no pageToken; later requests add it.
        request_args = {'userId': user_id, 'q': query}
        while True:
            response = service.users().messages().list(**request_args).execute()
            # A page may come back without a 'messages' key; treat it as
            # empty instead of raising KeyError.
            messages.extend(response.get('messages', []))
            page_token = response.get('nextPageToken')
            if not page_token:
                break
            request_args['pageToken'] = page_token

        return messages
    # "except ... as ..." is valid on Python 2.6+ and Python 3; the older
    # comma form is a syntax error on Python 3. `errors` is the
    # googleapiclient.errors module imported at the top of the file.
    except errors.HttpError as error:
        print('An error occurred: %s' % error)
        return None
# Sample query for sent mail in a date range: in:sent after:2018/11/01 before:2019/04/01
# The query used below instead selects inbox messages dated after 2018/11/01.
messages = ListMessagesMatchingQuery(service, 'me', 'in:inbox after:2018/11/01') # list of message-ID entries, not full messages; None if the API call failed
print(messages)

### Regex for finding emails

In [None]:
import re
# Address-like token: alphanumerics, "@", then alphanumerics and dots.
# Written as a raw string so "\." reaches the regex engine as an escaped dot
# instead of being an invalid Python string escape (a warning on Python 3).
pattern = re.compile(r"[A-Za-z0-9]+@[a-zA-Z0-9\.]+")

In [None]:
import datetime
# '16baecd1a4bb2a28' is a sample message ID from the author's mailbox;
# print your own IDs (see the message list printed earlier) and use one of
# them for testing.
message = GetMessage(service, 'me', '16baecd1a4bb2a28')
email = ""
# Only the first 10 characters of internalDate are used as the epoch seconds
# (presumably the value is in milliseconds - confirm against the Gmail API).
sent_seconds = int(message['internalDate'][:10])
date = datetime.datetime.fromtimestamp(sent_seconds).strftime('%Y-%m-%d %H:%M:%S')
for header in message['payload']['headers']:
    email = header['value']
    # Skip header values that do not start with an address-like token.
    if not pattern.match(email):
        continue
    # Skip the automated senders we are not interested in.
    if email in ("mailroomstaff@gh.cs.umd.edu",
                 "notifications@packages.cs.umd.edu",
                 "mailroomstaff@cs.umd.edu"):
        continue
    print("**********************\n")
    print(email)
    print("----------------------\n")

In [None]:
def remove_surrounding(string, uni=True):
    """Strip the display name and angle brackets around an e-mail address.

    'John Doe <john@example.com>' becomes 'john@example.com'; a bare
    address is returned with only its leading whitespace removed.

    Args:
        string: Header value (or one comma-separated piece of it).
        uni: When true, first reduce the text to plain ASCII: characters are
            NFKD-normalized and anything that still is not ASCII is dropped.

    Returns:
        The cleaned address as a text string.
    """
    if uni:
        # Imported here so the function does not depend on a later notebook
        # cell having imported unicodedata already.
        import unicodedata
        # encode() yields bytes; decode back to text so the str operations
        # below work on Python 3 as well as Python 2.
        string = unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('ascii')
    # Keep what follows the first '<' (the whole string if there is none).
    string = string.split('<', 1)[-1]
    string = string.replace(">", "")
    string = string.lstrip()
    return string

# This part may take some time, go grab a coffee :)

In [None]:
from __future__ import division
import unicodedata
import math
# Number of entries in the message list fetched by the earlier query.
len_messages = len(messages)
# One tenth of the message count, rounded down.
mod_value = math.floor(len_messages/10)
# Accumulates the (address, date string) tuples appended by filter_header.
all_emails = []
def filter_header(message_arr):
    """Fetch every listed message and collect its address-like header values.

    Each entry of ``message_arr`` is fetched with ``GetMessage``. Every
    header value that starts with an address (per the module-level
    ``pattern``) and is not one of the excluded automated senders is cleaned
    with ``remove_surrounding`` and appended to the module-level
    ``all_emails`` list as an ``(address, 'YYYY-MM-DD HH:MM:SS')`` tuple.
    A trace is written to gmail_address_extractor/email_filter_test.txt and
    progress is printed roughly every 10% of the messages.

    Args:
        message_arr: List of dicts carrying an 'id' key, such as the list
            returned by ListMessagesMatchingQuery.
    """
    excluded = (
        "mailroomstaff@gh.cs.umd.edu",
        "notifications@packages.cs.umd.edu",
        "mailroomstaff@cs.umd.edu",
        'bounce@cs.umd.edu',
    )
    total = len(message_arr)
    # Report about every 10%. Never 0: with fewer than 10 messages a step
    # of 0 would make the modulo below raise ZeroDivisionError.
    step = max(1, total // 10)
    # The with-block closes the trace file even if a message fails midway.
    with open("gmail_address_extractor/email_filter_test.txt", "w+") as f:
        f.write("Processing: " + str(total) + " emails")
        print("Processing: " + str(total) + " emails")
        for j, message in enumerate(message_arr, 1):
            email = GetMessage(service, 'me', message['id'], False)
            # GetMessage returns None when the API call failed; skip that
            # message but still count it towards the progress report.
            if email is not None:
                headers = email['payload']['headers']
                f.write("Length of header array: " + str(len(headers)) + "\n")
                for header in headers:
                    value = header['value']
                    if pattern.match(value) and value not in excluded:
                        f.write("**********************\n")
                        f.write(value)
                        # The first 10 characters of internalDate are used
                        # as the epoch seconds of the message.
                        sent_at = datetime.datetime.fromtimestamp(
                            int(email['internalDate'][:10])).strftime('%Y-%m-%d %H:%M:%S')
                        all_emails.append((remove_surrounding(value), sent_at))
                        f.write("\n----------------------\n")
            if j % step == 0:
                progress = str(int(math.ceil(j * 100.0 / total))) + "% there\n"
                f.write(progress)
                print(progress)
    print("done")

# Process every message listed by the earlier query (one GetMessage call per message).
filter_header(messages)

# Now it's time to create a CSV with all the sent emails

### Let's make sure that our expected output is correct

In [None]:
# Show the collected (address, date string) tuples before they are post-processed.
print(all_emails)

In [None]:
# Expand entries that hold several comma-separated addresses into one
# (address, date) tuple per address. An entry without a comma splits into a
# single piece, so both cases go through the same path.
new_all_emails = []
for email in all_emails:
    new_all_emails.extend([
        (remove_surrounding(new_email_index, False), email[1])
        for new_email_index in email[0].split(',')
    ])
# Replace the old list and drop the temporary reference.
all_emails = new_all_emails
new_all_emails = None

In [None]:
from __future__ import division
import csv
import math
import sys

# Write one CSV row (address, date) per unique address in all_emails.
len_emails = len(all_emails)
# Report about every 10% of the rows actually iterated. Never 0: with fewer
# than 10 entries a step of 0 would make the modulo below raise
# ZeroDivisionError.
mod_value = max(1, len_emails // 10)
# Addresses already written, so each one appears only once in the file.
checker = set()
# The csv module needs a text-mode file opened with newline='' on Python 3
# and a binary-mode file on Python 2.
if sys.version_info[0] >= 3:
    open_kwargs = {'mode': 'w', 'newline': ''}
else:
    open_kwargs = {'mode': 'wb'}
with open('gmail_address_extractor/Nov1TillNow.csv', **open_kwargs) as csvfile:
    print("starting to write to CSV. Processing: " + str(len_emails) + " emails \n")
    filewriter = csv.writer(csvfile, delimiter=',',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL)
    filewriter.writerow(['Email', 'Date'])
    for i, email in enumerate(all_emails, 1):
        if email[0] not in checker:
            filewriter.writerow([email[0], email[1]])
            checker.add(email[0])
        if i % mod_value == 0:
            print(str(int(math.ceil(i * 100.0 / len_emails))) + "% there\n")
    print("done")


# Make a comma-separated list that is not a CSV, for easy copy-paste into a mail client

In [None]:
# Write the unique addresses as one comma-separated line that can be pasted
# straight into a mail client's recipient field.
checker = set()
# The with-block closes the file even if a write fails.
with open("gmail_address_extractor/email_easy_copy.txt", "w+") as f:
    for email in all_emails:
        # De-duplicate on the address alone. Testing the whole
        # (address, date) tuple never matches the stored addresses, so
        # duplicates would all be written.
        if email[0] not in checker:
            print(email[0] + ", " + email[1] + "\n")
            f.write(email[0] + ", ") #keep this file copy paste email client friendly
            checker.add(email[0])