# Preprocessing the VGKG Dataset

## Download the relevant data

- Download the HTML page behind each record's DocumentIdentifier URL, then extract and save its word tokens
- Download the image behind each record's ImageURL and classify it with YOLOv3-tiny (for now)

In [9]:
import sys
import urllib
import time
from gzip import GzipFile
import pandas as pd
import os
from ipywidgets import FloatProgress
from IPython.display import display
import re
from bs4 import BeautifulSoup
import nltk
import pickle
import threading
from multiprocessing.dummy import Pool # use threads for I/O bound tasks


**Use crawler.py instead of the following cell - it uses multiprocessing.**

In [12]:
%%time

# Crawl the first `data_count` VGKG records: for each one, download the article
# HTML, reduce it to a list of word tokens, pickle those tokens, and download
# the record's image. Both outputs are stored under directories named after
# the time this cell started, using the record's DATE value as the file name.

# `import urllib` alone does not guarantee that the `request` submodule is loaded.
import urllib.request

data_count = 5000
current_time = time.time()
article_path = "data/GDELT_VGKG/preprocessed/articles/%d/" % current_time
image_path = "data/GDELT_VGKG/preprocessed/images/%d/" % current_time

print(image_path)
print(article_path)

# Tags that aren't rendered; they are removed including their content.
# From https://www.w3schools.com/tags/ref_byfunc.asp
programming_tags = ['script', 'noscript', 'applet', 'embed', 'object', 'param']
meta_tags = ['head', 'meta', 'base', 'basefont']
other_tags = ['data', 'style']
unrendered_tags = programming_tags + meta_tags + other_tags

# Matches tokens that are either all caps, no caps, or start with a capital letter.
# Compiled once here instead of once per article.
pattern = re.compile("(^[A-Z]?[a-z]+$)|(^[A-Z]+$)")

# The archive is only needed while the rows are read, so close it before crawling.
with GzipFile('data/GDELT_VGKG/vgkg-20160427-part1.csv.gz') as gzipfile:
    df = pd.read_csv(gzipfile, nrows=data_count)

os.makedirs(article_path)
os.makedirs(image_path)

f = FloatProgress(min=0, max=data_count)
display(f)
for index, article in df.iterrows():
    f.value += 1
    try:
        # Close the HTTP response as soon as the body has been read.
        with urllib.request.urlopen(article.DocumentIdentifier) as response:
            doc = response.read()
        # NOTE(review): no parser is named, so bs4 chooses one itself - confirm
        # whether a fixed parser is wanted for reproducible results.
        bs_doc = BeautifulSoup(doc)

        # Remove the unrendered tags, including their content
        for tag in bs_doc.findAll(unrendered_tags):
            tag.extract()

        # Keep only the remaining text (removing all tags etc.)
        text = bs_doc.get_text()

        # Tokenize the text
        tokens = nltk.word_tokenize(text)

        # Keep only tokens that are words and more than a letter
        alpha_tokens = [token for token in tokens if token.isalpha() and len(token) > 1]

        # Keep only tokens with the capitalisation shapes accepted by `pattern`
        word_tokens = [token for token in alpha_tokens if pattern.match(token)]

        # Done preprocessing. Save tokens; the `with` block closes (and thereby
        # flushes) the file, which the original never did.
        # Date is unique(?) TODO find out
        token_file_path = os.path.join(article_path, str(article.DATE))
        with open(token_file_path, "wb") as token_file:
            pickle.dump(word_tokens, token_file)

        # Download the corresponding image
        # (for the whole dataset, we'll have to classify and then discard, unless I get proper storage)
        # NOTE(review): if this download fails, the tokens saved above remain
        # on disk without a matching image.
        urllib.request.urlretrieve(article.ImageURL, os.path.join(image_path, str(article.DATE)))

    except Exception as e:
        # Best-effort crawl: report the failing URL and continue with the next record.
        print(e, article.DocumentIdentifier)

        # TODO create number of URLS (including splitting by error type)
        # and Number of average Characters at each stage plots
        # TODO look into the errors in more detail (especially 403's etc.)
        # TODO store and publicise raw html/preprocessed/images

## Classify images

In [3]:
%%time

# Select the crawl to classify and prepare a fresh output directory for this run.
# set this to which version of the collected data is to be used
dataset_time = 1535141748
preprocessed_root = "data/GDELT_VGKG/preprocessed"
image_path = "%s/images/%d/" % (preprocessed_root, dataset_time)

# Each classification run writes into its own directory, named after its start time.
run_started = int(time.time())
image_classification_path = "%s/image_classifications/yoloV3-tiny/%d/%d/" % (
    preprocessed_root,
    dataset_time,
    run_started,
)
# TODO make the testing of different classifiers DRY

os.makedirs(image_classification_path)

# Put the working directory on the import path and point the dynamic loader
# at the CUDA libraries before the darknet wrapper is run.
sys.path.append(os.getcwd())
os.environ['DYLD_LIBRARY_PATH'] = "/usr/local/cuda/lib"
%run darknet_wrapper.py

# Run the detector over every downloaded file in `image_path` and pickle, per
# image, a dict mapping each detected label to the number of times it was found.
# `initialize_classifier` and `detect` are presumably provided by
# darknet_wrapper.py, which is run just before this code.
MODEL="yolov3-tiny"
net, meta = initialize_classifier(config="cfg/%s.cfg"%MODEL, weights="weights/%s.weights"%MODEL, data="cfg/coco.data")

images = os.listdir(image_path)
f = FloatProgress(min=0, max=len(images))
display(f)
for image in images:
    f.value += 1
    try:
        image_file = os.path.join(image_path, image)
        result = detect(net, meta, image_file)

        # Count how often each label occurs; probability and coordinates are not stored.
        labels = dict()
        for label, probability, coordinates in result:
            labels[label] = labels.get(label, 0) + 1
        print(labels)

        # Save classification result under the same file name as the image.
        # The `with` block closes (and thereby flushes) the file.
        result_path = os.path.join(image_classification_path, image)
        with open(result_path, "wb") as result_file:
            pickle.dump(labels, result_file)

    except Exception as e:
        # Some of the files are not actually images; report which one failed
        # and continue with the next file.
        print(e, image)
            # Some of the files are not actually images


/usr/local/cuda/lib


FloatProgress(value=0.0, max=41.0)

{}
{b'person': 5}
{}
{b'person': 3, b'tie': 1}
{b'person': 1}
{b'person': 1}
{}
{b'person': 1}
{b'person': 2}
{b'person': 2}
{b'person': 3, b'chair': 1}
{b'person': 3}
{b'person': 2}
{}
{b'person': 1}
{}
{}
{}
{}
{b'person': 1}
{b'person': 4}
{}
{b'person': 1}
{}
{}
{b'car': 3}
{b'person': 2}
{b'person': 1}
{b'person': 2}
{b'person': 1, b'bicycle': 1}
{}
{b'person': 3}
{}
{}
{}
{b'person': 1}
{b'person': 1}
{b'tie': 1}
{}
{b'person': 2}
{b'person': 1}
CPU times: user 4.44 s, sys: 450 ms, total: 4.89 s
Wall time: 5.72 s
