# Tika Parsing and TTR (Text-to-Tag Ratio) Algorithm Implementation

## Tika Parsing

In [1]:
import tika

In [2]:
# NOTE(review): initVM() looks like a hold-over from the older JCC-based tika
# bindings and is presumably a no-op with the REST-based client — TODO confirm.
tika.initVM()

```bash
$ export TIKA_VERSION="2.0.0-ALPHA"
```

In [3]:
import os

# Show the Tika version taken from the process environment; the lookup
# evaluates to None when the variable is not set.
os.environ.get('TIKA_VERSION')

'2.0.0-ALPHA'

In [4]:
from tika import parser

# Run the raw e-mail dump through Tika. The result is indexed by 'metadata'
# and 'content' in the cells below; xmlContent=True presumably asks for XHTML
# instead of plain text (the saved output starts with an <html ...> tag).
parsed = parser.from_file(
    '../data/fradulent_emails.txt',
    xmlContent=True,
)

In [5]:
import json

# Persist the metadata part of the Tika result as a single JSON document.
with open("../data/tika-parsed/fradulent_emails_metadata.json", "w") as metadata_file:
    metadata_file.write(json.dumps(parsed['metadata']))

In [6]:
# Persist the content part of the Tika result; the later cells re-read this file.
with open("../data/tika-parsed/fradulent_emails.xml", "w") as xml_out:
    xml_out.write(parsed['content'])

In [7]:
import re

# Read the combined Tika output once; every step below works on this string,
# so no second read or rewind of the file handle is needed.
with open("../data/tika-parsed/fradulent_emails.xml") as f:
    xml_text = f.read()

# Every e-mail is expected to start with the same opening tag, e.g.
# <html xmlns="http://www.w3.org/1999/xhtml">
html_tags = re.findall(r'<html.*?>', xml_text)
if not html_tags:
    raise ValueError('no <html ...> opening tag found in the parsed XML')
if len(set(html_tags)) > 1:
    # Splitting on one literal tag would silently glue together e-mails that
    # use a different spelling of the tag, so stop instead of guessing.
    raise ValueError(
        'expected exactly one distinct <html ...> tag, found: %r' % sorted(set(html_tags))
    )

# The literal tag used as the separator (deterministic: the first match).
splitted = html_tags[0]
emails_wo_html_tag = xml_text.split(splitted)
# Put the tag back in front of every piece; the first piece is whatever
# precedes the first tag (not an e-mail), so it is dropped.
emails = [splitted + e for e in emails_wo_html_tag][1:]

In [8]:
len(emails)

4000

### XML Syntax Error Checking

In [91]:
from lxml import etree

# Try to parse every separated e-mail and report the ones that are not
# well-formed XML; a cell that prints nothing means all of them parsed.
for i, e in enumerate(emails):
    try:
        root = etree.fromstring(e)
    except etree.XMLSyntaxError:
        # The import above binds only `etree` (not `lxml`), so the exception
        # class must be reached through `etree`; otherwise the first malformed
        # e-mail would raise NameError instead of being reported.
        print('index', i, 'has XML syntax error')

### Save Separated XMLs

In [92]:
import os

PATH = '../data/tika-parsed/separated/'
# Create the output directory up front so the cell also runs on a fresh
# checkout; exist_ok=True makes re-running the cell harmless.
os.makedirs(PATH, exist_ok=True)

# Write each e-mail to its own file, named after its index in `emails`
# (0.xml, 1.xml, ...).
for i, e in enumerate(emails):
    with open(os.path.join(PATH, str(i) + '.xml'), 'w') as f:
        f.write(e)

## TTR

The relevant paper is available [here](https://www3.nd.edu/~tweninge/pubs/WH_TIR08.pdf).

### Pseudocode

![TTR-Pseudocode](img/TTR-Pseudocode.png)

In [26]:
with open("../data/tika-parsed/fradulent_emails.xml") as f:
    # Print the first line twice: rewinding to offset 0 before each pass lets
    # the same handle be iterated again from the top of the file.
    for pass_number in range(2):
        f.seek(0)
        for line in f:
            print(line)
            break

<html xmlns="http://www.w3.org/1999/xhtml">

<html xmlns="http://www.w3.org/1999/xhtml">

