# Script purpose

Created on 01/03/2023 by Claire

The output created by this script is used in the Step1_Regex_Data_Cleaning script. 

This script will do the following things:
- Query the SQLite file to extract either the labelled or the unlabelled emails
- Do some light cleaning, e.g. to split up the attachment types
- Save the extracted data files as pickle files for further analysis 

In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import pandas as pd
import pickle

In [11]:
# try and import my own functions:
import sys
import ipynb
import re
sys.path.append('/project/Xelix_Project/utils')

In [12]:
from ipynb.fs.full.Data_Extraction_Functions import return_labelled_data, return_unlabelled_data

In [13]:
# Location of the SQLite database the emails are read from.
# The active value is relative to the current working directory; the absolute
# alternative is kept for reference:
# f_data = "/project/Xelix_Project/data/email_extractor.sqlite"
f_data = "data/email_extractor.sqlite"

# Extract labelled emails

These are the emails whose category slug is not null.

In [14]:
%%time
# Read the labelled emails from the SQLite file; this call is slow (see the
# wall time reported below).
labelled_data = return_labelled_data(f_data)

CPU times: user 3.14 s, sys: 5.28 s, total: 8.41 s
Wall time: 2min 29s


In [15]:
# Row count of the labelled set, followed by a preview of its first rows.
print(labelled_data.shape[0])
labelled_data.head()

14333


Unnamed: 0,id,subject,internal_date,from_address,to,bcc,cc,reply_to,html_body,plain_text_body,attachments,category_id,category_slug
0,167529,BLUESTEM GROUP INV PP2422423 - Order X843348. ...,2021-11-17 16:08:00+00,sales@bluestemgroup.co.uk,{Purchase.Ledger@evo-group.co.uk},{},{},{},"<html>\n<head>\n<meta http-equiv=""Content-Type...",,"19427_2422423_00112742.pdf,19427_2422423_00112...",100,invoices
1,167544,"Overdue Account £59,632.58",2022-07-05 15:37:48+00,sefika@albion.co.uk,{Matt.Hales@evo-group.co.uk},{},{Ryan.Leigh@evo-group.co.uk},{},"<html>\n<head>\n<meta http-equiv=""Content-Type...",,"Followups(6).pdf,Followups(6).pdf,Followups(6)...",1,statement
2,170741,FW: APRIL ACCOUNT,2021-07-28 10:10:32+00,Lisa.Dyson@evo-group.co.uk,{Purchase.Ledger@evo-group.co.uk},{},{},{},"<html xmlns:v=""urn:schemas-microsoft-com:vml"" ...",,"CR1018258.pdf,CR1018258.pdf,CR1018259.pdf,CR10...",100,invoices
3,170766,,2021-07-28 11:03:11+00,clovell@ups.com,{purchase.ledger@evo-group.co.uk},{},{},{},"<html xmlns:v=""urn:schemas-microsoft-com:vml"" ...",Attention: This email originated outside of ou...,"scr019205.pdf,scr019205.pdf,scr019205.pdf,scr0...",100,invoices
4,172273,ban 1148 Overdue Account,2021-08-02 09:08:05+00,PennyRichardson@morleys.co.uk,{Purchase.Ledger@evo-group.co.uk},{},{},{},"<html xmlns:v=""urn:schemas-microsoft-com:vml"" ...",Attention: This email originated outside of ou...,"ban-1148.pdf,ban-1148.pdf,image001.png,image00...",101,reminders


In [16]:
# Number of labelled emails per category slug (most frequent first).
labelled_data['category_slug'].value_counts()

invoices                           5673
invoices-subject-line-pdf          4356
statement                          1417
demo                                684
reminders                           383
invoice-no-tags                     345
order-tracking                      240
no-action                           152
payments                            140
other-queries                       127
reminder-pdf                        120
credit-note                         116
order-acknowledgement               103
invoices-content                     99
new-supplier                         90
order-dispatch-and-tracking          86
invoices-subject-line-pdf-other      72
reminder-content                     61
bank-detail-change                   25
direct-debit-advice-reminder         19
backorder                            16
shipping-manifest                     4
invoices-subject-line-excel           3
invoices-subject-line-word            2
Name: category_slug, dtype: int64

In [17]:
# Count the rows that are exact duplicates of an earlier row. Series.sum() adds
# up the boolean mask in vectorised code instead of iterating it in Python.
print(labelled_data.duplicated().sum())

0


# Extract unlabelled emails

Set a limit on how many emails to extract per run, because there is a large number of unlabelled emails.

In [77]:
%%time
# Extraction window for this run. Other windows that are kept for reference:
#   limit = 15000, start_from = 20000
#   limit = 20000, start_from = 20000
limit = 50_000  # how many emails to return
start_from = 250_000  # rows to skip first: 0 starts at the beginning, 1000 skips the first 1000 rows

unlabelled_data = return_unlabelled_data(f_data, start_from, limit)

CPU times: user 25.1 s, sys: 52.4 s, total: 1min 17s
Wall time: 3min 6s


In [78]:
# Row count of the unlabelled set, followed by a preview of its first rows.
print(unlabelled_data.shape[0])
unlabelled_data.head()

21035


Unnamed: 0,id,subject,internal_date,from_address,to,bcc,cc,reply_to,html_body,plain_text_body,attachments,category_id,category_slug
0,441636,[Postmaster] Attachment Alert Notification,2021-12-03 13:03:19+00,postmaster@dorchestercollection.com,{Invoicepayments.UK@dorchestercollection.com},{},{},{},"<html><head>\n<meta http-equiv=""Content-Type"" ...",,,,
1,441637,Scanned document from TDLPurchasingPRN2,2021-12-03 12:56:57+00,dominic.lowry@dorchestercollection.com,{DocuWareDeliveryNotes.UK@dorchestercollection...,{},{},{},,Scanned document from TDLPurchasingPRN2\n-----...,TDLPurchasingPRN220211203130159.pdf,,
2,441638,R & J Yorkshire's Finest Farmers and Butchers,2021-12-03 12:44:01+00,kate@randjyorkshiresfinest.co.uk,{APInvoicequeries.UK@dorchestercollection.com},{},{},{},"<html><head>\n<meta http-equiv=""Content-Type"" ...",,"501556.pdf,image001.png,image002.png,image003....",,
3,441639,RE: Overdue account,2021-12-03 12:41:50+00,David.Oakley@theeastindiacompany.com,"{APInvoicequeries.UK@dorchestercollection.com,...",{},{Tejas.Gandhi@theeastindiacompany.com},{},"<html><head>\n<meta http-equiv=""Content-Type"" ...",,"image001.jpg,image320501.jpg",,
4,441640,RE: Overdue account,2021-12-03 12:41:50+00,David.Oakley@theeastindiacompany.com,"{APInvoicequeries.UK@dorchestercollection.com,...",{},{Tejas.Gandhi@theeastindiacompany.com},{},"<html><head>\n<meta http-equiv=""Content-Type"" ...",,"image001.jpg,image320501.jpg",,


In [79]:
# Sanity check: unlabelled emails should carry no category slug, so this is
# expected to come back as an empty Series.
unlabelled_data['category_slug'].value_counts()

Series([], Name: category_slug, dtype: int64)

In [80]:
# Count the rows that are exact duplicates of an earlier row. Series.sum() adds
# up the boolean mask in vectorised code instead of iterating it in Python.
print(unlabelled_data.duplicated().sum())

0


# Initial cleaning of data

This will:
- remove duplicate entries from the attachments column
- remove images and hyperlinks from the attachment types
- add a new feature indicating whether the email still has an attachment after cleaning

In [18]:
from ipynb.fs.full.Regex_html_Functions import remove_duplicates, clean_attachment_types

In [None]:
# Labelled data: run the attachments column through remove_duplicates and then
# clean_attachment_types, and flag the rows whose cleaned value is not None.
labelled_data['attachments'] = (
    labelled_data['attachments']
    .apply(remove_duplicates)
    .apply(clean_attachment_types)
)
labelled_data['has_attachment'] = [val is not None for val in labelled_data['attachments']]

In [81]:
# Unlabelled data: run the attachments column through remove_duplicates and then
# clean_attachment_types, and flag the rows whose cleaned value is not None.
unlabelled_data['attachments'] = (
    unlabelled_data['attachments']
    .apply(remove_duplicates)
    .apply(clean_attachment_types)
)
unlabelled_data['has_attachment'] = [val is not None for val in unlabelled_data['attachments']]

# HTML parsing

This converts each HTML email body to plain text and uses that text as the email body whenever no plain-text body is available.

In [26]:
from ipynb.fs.full.Regex_html_Functions import text_from_html

In [27]:
%%time
# Convert every HTML body with text_from_html, then build "body": start from
# the plain-text body and fall back to the converted HTML where it is null.
labelled_data["html_body_as_text"] = labelled_data["html_body"].map(text_from_html)
labelled_data["body"] = labelled_data["plain_text_body"]
missing_body = labelled_data["body"].isnull()
labelled_data.loc[missing_body, "body"] = labelled_data.loc[missing_body, "html_body_as_text"]

CPU times: user 1min 5s, sys: 0 ns, total: 1min 5s
Wall time: 1min 5s


In [82]:
%%time
# Convert every HTML body with text_from_html, then build "body": start from
# the plain-text body and fall back to the converted HTML where it is null.
unlabelled_data["html_body_as_text"] = unlabelled_data["html_body"].map(text_from_html)
unlabelled_data["body"] = unlabelled_data["plain_text_body"]
missing_body = unlabelled_data["body"].isnull()
unlabelled_data.loc[missing_body, "body"] = unlabelled_data.loc[missing_body, "html_body_as_text"]

CPU times: user 2min 24s, sys: 135 ms, total: 2min 25s
Wall time: 2min 25s


# Export to Pickle

In [None]:
# if only want bank-change-detail emails; 
# bankchange_emails = labelled_data[labelled_data.category_slug.str.contains("bank-detail-change") == True]
# bankchange_emails.to_pickle("data/Step0_Data/bank_change_emails_and_threads.pkl")
# print(len(bankchange_emails))

In [28]:
# labelled_data.to_pickle("data/Step0_Data/labelled_emails_and_threads.pkl")

In [83]:
# Save the cleaned unlabelled emails for this extraction window.
# NOTE(review): the "_run6" suffix presumably identifies the window chosen via
# limit/start_from, so update it before re-running with a different window.
unlabelled_data.to_pickle("data/Step0_Data/unlabelled_emails_and_threads_run6.pkl")

In [84]:
# del labelled_data, unlabelled_data

In [85]:
# Drop the notebook's reference to the unlabelled DataFrame now that it has
# been written to disk.
del unlabelled_data