# Tika Parsing and TTR (Text-to-Tag Ratio) Algorithm Implementation

## Tika Parsing

In [1]:
import tika

In [2]:
# Call tika's initVM() hook before the parser is used in the cells below.
tika.initVM()

In [3]:
import os
# Display the TIKA_VERSION environment variable (None when it is not set).
os.environ.get('TIKA_VERSION')

In [4]:
from tika import parser
# Parse the raw e-mail dump with Tika, asking for XML content (xmlContent=True).
# Later cells read the 'metadata' and 'content' entries of the result.
parsed = parser.from_file('../data/fradulent_emails.txt', xmlContent=True)

In [5]:
import json
# Serialise the Tika metadata to a JSON file alongside the parsed content.
with open("../data/tika-parsed/fradulent_emails_metadata.json", "w") as metadata_file:
    metadata_file.write(json.dumps(parsed['metadata']))

In [6]:
# Store the XML content returned by Tika as one combined .xhtml file.
combined_xhtml_path = "../data/tika-parsed/fradulent_emails.xhtml"
with open(combined_xhtml_path, "w") as xhtml_file:
    xhtml_file.write(parsed['content'])

In [7]:
import re
with open("../data/tika-parsed/fradulent_emails.xhtml") as f:
    # The capturing group keeps each closing </html> tag as its own list item,
    # so the list alternates: document body, closing tag, document body, ...
    splitted = re.split(r'(<\/html>)', f.read())
    # Re-attach every closing tag to the body that precedes it; a trailing
    # piece without a closing tag is left out.
    emails = [body + closing_tag for body, closing_tag in zip(splitted[0::2], splitted[1::2])]

In [8]:
# Number of documents recovered by splitting on the closing </html> tag.
len(emails)

4000

### XML Syntax Error Checking

In [9]:
from lxml import etree

# Verify that every split document is well-formed XML and report each one that is not.
error_found = False
for i, e in enumerate(emails):
    try:
        etree.fromstring(e)
    except etree.XMLSyntaxError:
        # The exception class is reached through the imported `etree` name; the bare
        # `lxml` package name is never bound by `from lxml import etree`.
        error_found = True
        print('Error checking failed.')
        print('Index', i, 'has XML syntax error')
# Only claim success when no document failed to parse.
if not error_found:
    print('Error checking passed.')

Error checking passed.


### Save Separated XMLs

In [10]:
import os, codecs
PATH = '../data/tika-parsed/separated/'
# Create the output directory if it is missing so the cell also works on a fresh checkout.
os.makedirs(PATH, exist_ok=True)
# Write each document to <position>.xhtml, explicitly encoded as UTF-8.
for i, e in enumerate(emails):
    with codecs.open(os.path.join(PATH, str(i) + '.xhtml'), 'w', 'utf-8') as f:
        f.write(e)

## Things to fix before TTR

### Remove no content rows from assignment 1

In [11]:
# NOTE(review): absolute, machine-specific path into the assignment 1 repository;
# it has to be adjusted when the notebook is run elsewhere.
PATH = '/Users/anthony/Documents/GitHub.nosync/DSCI-550-Assignment-1/data/separated_by_email'

# Collect the numeric file stems of all JSON files that have no 'X-TIKA:content' key.
assignment1_wo_content = []
for email in os.listdir(PATH):
    if email.endswith('json'):
        # Use a context manager so each file handle is closed right after loading.
        with open(os.path.join(PATH, email)) as f:
            j = json.load(f)
        if 'X-TIKA:content' not in j:
            # File names look like '<number>.json'; keep only the number.
            assignment1_wo_content.append(int(email.split('.')[0]))

In [12]:
import pandas as pd

# Load the assignment 1 feature table.
df1 = pd.read_csv('../data/additional-features-v2/additional_features.tsv', sep='\t')

# Keep only the rows whose index label is not listed in assignment1_wo_content.
rows_with_content = ~df1.index.isin(assignment1_wo_content)

# Renumber the remaining rows, then discard the 'level_0' and 'index' columns.
df1 = (
    df1.loc[rows_with_content]
    .reset_index()
    .drop(columns=['level_0', 'index'])
)

df1.to_csv('../data/additional-features-v2/new/additional_features_w_content.tsv', sep='\t')

In [13]:
# Row count after removing the rows without content.
len(df1)

3992

### Remove no content rows from assignment 2

In [15]:
from lxml.html import fromstring
# Relative path to the directory the separated .xhtml files were written to above,
# instead of an absolute, machine-specific location.
PATH = '../data/tika-parsed/separated/'

# Collect the numeric file stems of all .xhtml files whose text content is empty.
assignment2_wo_content = []
for email in os.listdir(PATH):
    if email.endswith('xhtml'):
        # The files were written as UTF-8, so read them back with the same encoding
        # rather than the platform default.
        with open(os.path.join(PATH, email), encoding='utf-8') as f:
            tree = fromstring(f.read())

        if not tree.text_content().strip():
            # File names look like '<number>.xhtml'; keep only the number.
            assignment2_wo_content.append(int(email.split('.')[0]))

In [16]:
import pandas as pd, os

# Relative path to the directory the separated .xhtml files were written to above,
# instead of an absolute, machine-specific location.
PATH = '../data/tika-parsed/separated/'
# Only the visible .xhtml files count; anything else in the directory would break
# the integer conversion of the file stem below.
emails = [f for f in os.listdir(PATH) if not f.startswith('.') and f.endswith('xhtml')]
# File names look like '<number>.xhtml'; keep only the number.
numbers = [int(name.split('.')[0]) for name in emails]

In [17]:
# One-column frame holding the e-mail numbers in ascending order.
df2 = pd.DataFrame(pd.Series(sorted(numbers)))

In [18]:
# Drop the rows whose index label is listed in assignment2_wo_content, then
# renumber; reset_index() keeps the previous index as an 'index' column.
rows_with_content = ~df2.index.isin(assignment2_wo_content)
df2 = df2.loc[rows_with_content].reset_index()

In [19]:
# Row count after removing the rows without content.
len(df2)

3992

### Mapping

In [20]:
# Remove the column labelled 0 from the frame.
df2 = df2.drop(columns=[0])

In [21]:
# Name the single remaining column 'mapping'.
df2.columns = ['mapping']

In [22]:
# Round-trip the frame through JSON to get a plain dict for the 'mapping' column:
# string row labels -> stored values.
row_to_email = json.loads(df2.to_json())['mapping']
# Swap keys and values. Explanation: because the indices no longer lined up, this
# dictionary translates between them. Its keys are the indices from the assignment 2
# tika-parsed, separated e-mails and its values are the common indices.
mapping = {email_idx: int(row) for row, email_idx in row_to_email.items()}

In [23]:
# Number of entries in the index mapping.
len(mapping)

3992

## TTR

The relevant paper is available [here](https://www3.nd.edu/~tweninge/pubs/WH_TIR08.pdf).

### Pseudocode

![TTR-Pseudocode](img/TTR-Pseudocode.png)

I have implemented `TTR` in the `utils` module (in `src`)

In [24]:
import sys, os

# Put the project's src directory on the import path so `utils` can be imported below.
sys.path.append('../src/')

In [25]:
from utils import TTR

In [26]:
import pandas as pd
# Reload the content-only feature table written earlier in this notebook.
# NOTE(review): that file was saved with its index, so an extra unnamed column is
# presumably read back here; confirm whether that is intended.
df = pd.read_csv('../data/additional-features-v2/new/additional_features_w_content.tsv',sep='\t')

In [27]:
from tqdm.notebook import tqdm
PATH = '../data/tika-parsed/separated/'
# Run TTR on every mapped e-mail file and store the extracted text in the row of
# `df` at the mapped position.
for email_number, row_position in tqdm(mapping.items()):
    xhtml_path = os.path.join(PATH, '{}.xhtml'.format(email_number))
    # Only the extracted content is kept; the first returned value is not used.
    ttr_values, content = TTR(xhtml_path, extract_content=True, threshold=3)
    target_row = df.index[row_position]
    df.loc[target_row, "TTR'ed Text"] = content

  0%|          | 0/3992 [00:00<?, ?it/s]

In [29]:
# Save the feature table, now including the "TTR'ed Text" column, as a TSV file.
df.to_csv('../data/additional-features-v2/new/additional_features_TTR.tsv',sep='\t')