# Email Signature Extractor

Copyright (c) 2022 Akhilesh Muthusamy


### Installing Stanford NLP package for Name Entity Recognition

In [1]:
%pip install stanza

Collecting stanza
  Downloading stanza-1.3.0-py3-none-any.whl (432 kB)
[?25l[K     |▊                               | 10 kB 18.3 MB/s eta 0:00:01[K     |█▌                              | 20 kB 23.6 MB/s eta 0:00:01[K     |██▎                             | 30 kB 25.6 MB/s eta 0:00:01[K     |███                             | 40 kB 14.6 MB/s eta 0:00:01[K     |███▉                            | 51 kB 12.8 MB/s eta 0:00:01[K     |████▌                           | 61 kB 14.8 MB/s eta 0:00:01[K     |█████▎                          | 71 kB 14.1 MB/s eta 0:00:01[K     |██████                          | 81 kB 13.3 MB/s eta 0:00:01[K     |██████▉                         | 92 kB 12.1 MB/s eta 0:00:01[K     |███████▋                        | 102 kB 12.9 MB/s eta 0:00:01[K     |████████▍                       | 112 kB 12.9 MB/s eta 0:00:01[K     |█████████                       | 122 kB 12.9 MB/s eta 0:00:01[K     |█████████▉                      | 133 kB 12.9 MB/s eta 0:0

In [2]:
%pip install find-job-titles

Collecting find-job-titles
  Downloading find_job_titles-0.7.0-py2.py3-none-any.whl (383 kB)
[K     |████████████████████████████████| 383 kB 26.5 MB/s 
[?25hCollecting acora
  Downloading acora-2.3-cp37-cp37m-manylinux1_x86_64.whl (167 kB)
[K     |████████████████████████████████| 167 kB 59.7 MB/s 
[?25hCollecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (106 kB)
[K     |████████████████████████████████| 106 kB 63.7 MB/s 
[?25hInstalling collected packages: pyahocorasick, acora, find-job-titles
Successfully installed acora-2.3 find-job-titles-0.7.0 pyahocorasick-1.4.4


In [3]:
import re
import stanza
stanza.download('en')
nlp = stanza.Pipeline('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2022-02-21 02:47:54 INFO: Downloading default packages for language: en (English)...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.3.0/models/default.zip:   0%|          | 0…

2022-02-21 02:48:06 INFO: Finished downloading models and saved to /root/stanza_resources.
2022-02-21 02:48:06 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos          | combined  |
| lemma        | combined  |
| depparse     | combined  |
| sentiment    | sstplus   |
| constituency | wsj       |
| ner          | ontonotes |

2022-02-21 02:48:06 INFO: Use device: cpu
2022-02-21 02:48:06 INFO: Loading: tokenize
2022-02-21 02:48:06 INFO: Loading: pos
2022-02-21 02:48:07 INFO: Loading: lemma
2022-02-21 02:48:07 INFO: Loading: depparse
2022-02-21 02:48:07 INFO: Loading: sentiment
2022-02-21 02:48:08 INFO: Loading: constituency
2022-02-21 02:48:08 INFO: Loading: ner
2022-02-21 02:48:09 INFO: Done loading processors!


In [4]:
def extract_website(text):
  pattern = r"(?:https?:\/\/)?([a-zA-Z][a-zA-Z0-9._\-]+.[a-zA-Z0-9]+(\.[a-zA-Z0-9._\-]{2,3})+)"
  result = re.findall(pattern, text)
  return result if result else None


# Reference: https://stackoverflow.com/questions/6038061/regular-expression-to-find-urls-within-a-string

In [5]:
def extract_email(text):
  pattern = r"([a-zA-Z][a-zA-Z0-9._\-]+@[a-zA-Z0-9]+(\.[a-zA-Z0-9._\-]{2,3})+)"
  result = re.findall(pattern, text)
  return result if result else None

In [6]:
def extract_phone_no(text):
  if any(str.isdigit(c) for c in text):
    pattern = r"[+0-9. ()\-]{7,}"
    text =re.sub(' {3,}', '|', text)
    result = re.findall(pattern, text)
    return result if result else None
  else:
    return None

Reference: https://pypi.org/project/find-job-titles/

In [7]:
from find_job_titles import Finder

def extract_job_title(text):
  finder = Finder()
  try:
      job_title = finder.findall(text)
      return job_title
  except:
      return []

Reference: https://stanfordnlp.github.io/stanza/installation_usage.html

In [8]:
def extract_address(text):
  doc = nlp(text) 
  hasAddress = False
  for ent in doc.entities:
    if ent.type == "GPE":
      hasAddress = True

  return text if hasAddress else None 

In [9]:
def extract_person(text):
  doc = nlp(text) 
  name = None
  for ent in doc.entities:
    if ent.type == "PERSON":
      name = ent.text

  return name

In [10]:
def extract_company(text):
  doc = nlp(text) 
  name = None
  for ent in doc.entities:
    if ent.type == "ORG":
      name = ent.text

  return name

### Extract Email Signature

In [11]:
def extract_email_signature(email_text):

  STOPLEN = 3
  signature_start = STOPLEN
  email_signature = []

  name = []
  address = []
  phone = []
  position = []
  company = []
  website = []
  email = []

  email_lines = email_text.rstrip('\n').split('\n')

  for index in range(len(email_lines)-1, -1, -1):
      email_line = email_lines[index]

      if signature_start > 0 :
        st_name = extract_person(email_line)
        if st_name:
          name.append(st_name)
        st_address = extract_address(email_line)
        if st_address:
          address.append(st_address)
        st_phone = extract_phone_no(email_line)
        if st_phone:
          phone += st_phone
        st_position = extract_job_title(email_line)
        if st_position:
          position += st_position
        st_company = extract_company(email_line)
        if st_company:
          company.append(st_company)
        st_website = extract_website(email_line)
        if st_website:
          website += st_website
        st_email = extract_email(email_line)
        if st_email:
          email += st_email

        if (st_name or st_address or st_phone or st_company or st_website or st_email):
          signature_start = STOPLEN
        else:
          signature_start -= 1

        email_signature.append(email_line)

        if signature_start == 0:
          if len(email_signature) > 6:
            email_signature = email_signature[:-3]
          email_signature = email_signature[::-1]

      else:
        break

  contains_signature = False

  if (len(name) or len(address) or len(phone) or len(company) or len(website) or len(email)):
    contains_signature = True

  if contains_signature == False:
    email_signature = []

  item = 0
  if len(name) > 0:
    item += 1
  if len(address) > 0:
    item += 1
  if len(phone) > 0:
    item += 1
  if len(position) > 0:
    item += 1
  if len(company) > 0:
    item += 1
  if len(website) > 0:
    item += 1
  if len(email) > 0:
    item += 1

  print('Email includes signature:', 'Yes' if contains_signature else 'NO')
  print('-'*90)
  print('Email signature:\n', '\n'.join(email_signature))
  print('-'*90)
  print('Email signature completeness:',  str((item/7)*100) + '%')


## Testing the function with different inputs

In [12]:
email = """
Hi,

This is test email.

Regards,
George Knight
Sales Manager at Owner Knights Bar

818-996-8912//818-834-3896
http://www.knightsbar.com
www.knightsbar.com
38 Sharon Lane South Bend, IN 46625

"""

extract_email_signature(email)

Email includes signature: Yes
------------------------------------------------------------------------------------------
Email signature:
 George Knight
Sales Manager at Owner Knights Bar

818-996-8912//818-834-3896
http://www.knightsbar.com
www.knightsbar.com
38 Sharon Lane South Bend, IN 46625
------------------------------------------------------------------------------------------
Email signature completeness: 85.71428571428571%


In [13]:
email = """
Hi,

This is test email.

Kevin McLievie
Plant & Soil Scientist
University of Connecticut
--------------------------
www.kevinmclievie.com
Storrs, CT

"""

extract_email_signature(email)

Email includes signature: Yes
------------------------------------------------------------------------------------------
Email signature:
 Kevin McLievie
Plant & Soil Scientist
University of Connecticut
--------------------------
www.kevinmclievie.com
Storrs, CT
------------------------------------------------------------------------------------------
Email signature completeness: 71.42857142857143%


In [14]:
email = """
Hi,

This is test email without any signature.

Thank you

"""

extract_email_signature(email)

Email includes signature: NO
------------------------------------------------------------------------------------------
Email signature:
 
------------------------------------------------------------------------------------------
Email signature completeness: 0.0%
