# Preprocessing the VGKG Dataset

In [1]:
import os
import pickle
import re
import sys
import threading
import time
import urllib
import urllib.request  # `import urllib` alone does not guarantee the `request` submodule is loaded
from gzip import GzipFile
from multiprocessing.dummy import Pool # use threads for I/O bound tasks

import nltk
import pandas as pd
from bs4 import BeautifulSoup
from IPython.display import display
from ipywidgets import FloatProgress


  return f(*args, **kwds)
  return f(*args, **kwds)


## Download the relevant data

- Download the HTML from the DocumentIdentifier and extract/save its tokens
- Download the Images and classify them with tinyYoloV3 (for now)

**Use crawler.py instead of the following cell - it uses multiprocessing.**

In [12]:
%%time

data_count = 5000
current_time = time.time()
article_path = "data/GDELT_VGKG/preprocessed/articles/%d/" % current_time
image_path = "data/GDELT_VGKG/preprocessed/images/%d/" % current_time

print(image_path)
print(article_path)

with GzipFile('data/GDELT_VGKG/vgkg-20160427-part1.csv.gz') as gzipfile:
    df = pd.read_csv(gzipfile, nrows=data_count)
    
    os.makedirs(article_path)
    os.makedirs(image_path)
    
    f = FloatProgress(min=0, max=data_count)
    display(f)
    for index, article in df.iterrows():
        f.value += 1
        try:
            doc = urllib.request.urlopen(article.DocumentIdentifier).read()
            bs_doc = BeautifulSoup(doc)

            # remove some tags that aren't rendered, including their content
            # From https://www.w3schools.com/tags/ref_byfunc.asp
            programming_tags = ['script', 'noscript', 'applet', 'embed', 'object', 'param']
            meta_tags = ['head', 'meta', 'base', 'basefont']
            other_tags = ['data', 'style']
            [x.extract() for x in bs_doc.findAll(programming_tags + meta_tags + other_tags)]

            # Keep only the remaining text (removing all tags etc.)
            text = bs_doc.get_text() #re.sub('<[^<]+?>', '', str(doc))[:100]

            # Tokenize the text
            tokens = nltk.word_tokenize(text)

            # Keep only tokens that are words and more than a letter
            alpha_tokens = [token for token in tokens if token.isalpha() and len(token) > 1]

            # Keep only tokens that are either all caps or no caps or start with a capital letter
            pattern = re.compile("(^[A-Z]?[a-z]+$)|(^[A-Z]+$)")
            word_tokens = [token for token in alpha_tokens if pattern.match(token)]


            # Done preprocessing. Save tokens
            file = open("%s/%s" % (article_path, article.DATE), "wb+")
            pickle.dump(word_tokens, file) # Date is unique(?) TODO find out
            
            # Download the corresponding image 
            # (for the whole dataset, we'll have to classify and then discard, unless I get proper storage)
            urllib.request.urlretrieve(article.ImageURL, "%s/%s" % (image_path, article.DATE))

            
        except Exception as e:
            print(e, article.DocumentIdentifier)
            
            
            # TODO create number of URLS (including splitting by error type) 
            # and Number of average Characters at each stage plots
            # TODO look into the errors in more detail (especially 403's etc.)
            # TODO store and publicise raw html/preprocessed/images

## Classify images

### Yolo

In [3]:
%%time

dataset_time = 1535141748 # set this to which version of the collected data is to be used
image_path = "data/GDELT_VGKG/preprocessed/images/%d/" % dataset_time
image_classification_path = "data/GDELT_VGKG/preprocessed/image_classifications/yoloV3-tiny/%d/%d/" % (dataset_time, time.time())
# TODO make the testing of different classifiers DRY

os.makedirs(image_classification_path)

sys.path += [os.getcwd()]
os.environ['DYLD_LIBRARY_PATH'] = "/usr/local/cuda/lib"
%run darknet_wrapper.py

MODEL="yolov3-tiny"
net, meta = initialize_classifier(config="cfg/%s.cfg"%MODEL, weights="weights/%s.weights"%MODEL, data="cfg/coco.data")

images = os.listdir(image_path)
f = FloatProgress(min=0, max=len(images))
display(f)
for image in images:
    f.value += 1
    try:
        image_dir = image_path + "/" + image
        result = detect(net, meta, image_dir)
        labels = dict()
        for label, probability, coordinates in result:
            if label in labels: 
                labels[label]+=1 
            else:
                labels[label] = 1
        # Save classification result
        file = open("%s/%s" % (image_classification_path, image), "wb+")
        pickle.dump(labels, file)
        
    except Exception as e:
            print(e)
            # Some of the files are not actually images


/usr/local/cuda/lib


FloatProgress(value=0.0, max=41.0)

{}
{b'person': 5}
{}
{b'person': 3, b'tie': 1}
{b'person': 1}
{b'person': 1}
{}
{b'person': 1}
{b'person': 2}
{b'person': 2}
{b'person': 3, b'chair': 1}
{b'person': 3}
{b'person': 2}
{}
{b'person': 1}
{}
{}
{}
{}
{b'person': 1}
{b'person': 4}
{}
{b'person': 1}
{}
{}
{b'car': 3}
{b'person': 2}
{b'person': 1}
{b'person': 2}
{b'person': 1, b'bicycle': 1}
{}
{b'person': 3}
{}
{}
{}
{b'person': 1}
{b'person': 1}
{b'tie': 1}
{}
{b'person': 2}
{b'person': 1}
CPU times: user 4.44 s, sys: 450 ms, total: 4.89 s
Wall time: 5.72 s


### Mask R-CNN

#### Loading the Classifier and the Images

In [29]:
# Make the Mask R-CNN checkout and its COCO sample importable. Only missing
# entries are added, so re-running the cell does not keep growing sys.path.
for extra_path in (os.getcwd() + "/Mask_RCNN", os.getcwd(), os.getcwd() + "/Mask_RCNN/samples/coco/"):
    if extra_path not in sys.path:
        sys.path += [extra_path]

# These imports rely on the sys.path entries above, so they have to stay in this cell.
from mrcnn import utils
import mrcnn.model as modellib
from mrcnn import visualize
import coco

MODEL_DIR="Mask_RCNN/logs"
COCO_MODEL_PATH="Mask_RCNN/mask_rcnn_coco.h5"

DETECTION_THRESHOLD = .5 # .5 is the default threshold of YOLO so we're using that here as well

dataset_time = 1535141748 # set this to which version of the collected data is to be used
image_path = "data/GDELT_VGKG/preprocessed/images/%d/" % dataset_time
image_classification_path = "data/GDELT_VGKG/preprocessed/image_classifications/maskrcnn/%d/%d/" % (dataset_time, time.time())
# TODO make the testing of different classifiers DRY

os.makedirs(image_classification_path)

# os.listdir returns entries in arbitrary order; sort before slicing so that the
# same ten images are selected on every run.
image_names = sorted(os.listdir(image_path))[:10]

class InferenceConfig(coco.CocoConfig):
    # Batch size = GPU_COUNT * IMAGES_PER_GPU.
    # All selected images are handed to the model in a single detect() call, so
    # the batch is sized from `image_names` (the files chosen for this run) and
    # not from the unrelated `images` variable left behind by another cell.
    GPU_COUNT = 1
    IMAGES_PER_GPU = len(image_names)

# Instantiate the inference configuration defined above.
config = InferenceConfig()

# Build the Mask R-CNN model object in inference mode.
model = modellib.MaskRCNN(
    mode="inference",
    model_dir=MODEL_DIR,
    config=config,
)

# Load the weights trained on MS-COCO.
model.load_weights(
    COCO_MODEL_PATH,
    by_name=True,
)

# COCO class names, ordered so that a name's position in the list is its class ID.
# Example: class_names.index('teddy bear') gives the ID of the teddy bear class.
# The inline group comments below are only a reading aid.
class_names = [
    # index 0
    'BG',
    # people and vehicles
    'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat',
    # street objects
    'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench',
    # animals
    'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe',
    # accessories
    'backpack', 'umbrella', 'handbag', 'tie', 'suitcase',
    # sports equipment
    'frisbee', 'skis', 'snowboard', 'sports ball', 'kite',
    'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    # tableware
    'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    # food
    'banana', 'apple', 'sandwich', 'orange', 'broccoli',
    'carrot', 'hot dog', 'pizza', 'donut', 'cake',
    # furniture
    'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet',
    # electronics
    'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    # appliances
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator',
    # other indoor objects
    'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]

#### Classifying the images

In [30]:
%%time
import skimage

try:
    images = [skimage.io.imread(image_path + "/" + image) for image in image_names]
    results = model.detect(images, verbose=1)
except Exception as e:
        print(e)
        # Some of the files are not actually images


Processing 10 images
image                    shape: (346, 607, 3)         min:    0.00000  max:  255.00000  uint8
image                    shape: (324, 576, 3)         min:    0.00000  max:  255.00000  uint8
image                    shape: (360, 420, 3)         min:   85.00000  max:  255.00000  uint8
image                    shape: (551, 600, 3)         min:    0.00000  max:  255.00000  uint8
image                    shape: (699, 306, 3)         min:    0.00000  max:  255.00000  uint8
image                    shape: (400, 581, 3)         min:    0.00000  max:  255.00000  uint8
image                    shape: (337, 598, 3)         min:    0.00000  max:  255.00000  uint8
image                    shape: (409, 600, 3)         min:    0.00000  max:  255.00000  uint8
image                    shape: (720, 1200, 3)        min:    0.00000  max:  255.00000  uint8
image                    shape: (466, 965, 3)         min:    0.00000  max:  255.00000  uint8


  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


molded_images            shape: (10, 1024, 1024, 3)   min: -123.70000  max:  151.10000  float64
image_metas              shape: (10, 93)              min:    0.00000  max: 1200.00000  float64
anchors                  shape: (10, 261888, 4)       min:   -0.35390  max:    1.29134  float32
CPU times: user 14min 41s, sys: 32.8 s, total: 15min 13s
Wall time: 2min 7s


#### Saving the results

In [41]:
# Bring the results into the same format: one dict per image that maps each
# detected label to the number of detections scoring above DETECTION_THRESHOLD.
# Each result is paired with its own file name so every image gets its own
# output file.
for image_name, result in zip(image_names, results):
    labels = dict()
    for class_id, score in zip(result['class_ids'], result['scores']):
        if score > DETECTION_THRESHOLD:
            label = class_names[int(class_id)]
            labels[label] = labels.get(label, 0) + 1
    print(image_name + ": " + str(labels))

    # Save classification result; `with` closes the file even if pickling fails
    with open(os.path.join(image_classification_path, image_name), "wb") as file:
        pickle.dump(labels, file)

20160213120000: {'person': 2, 'bicycle': 1}
20160215040000: {'person': 7, 'tie': 1, 'potted plant': 1}
20160215200000: {'person': 2}
20160218080000: {'person': 8, 'tie': 3}
20160219200000: {'person': 1, 'car': 1, 'handbag': 1}
20160220234500: {'tie': 1, 'person': 18}
20160222080000: {'clock': 1}
20160223040000: {'person': 1}
20160223200000: {'person': 1}
20160229120000: {'person': 5}
