# Preprocessing

## Settings

In [1]:
import random
import os
from os import path

from PIL import Image

from data_management import data_utils
from data_management.image_manipulations import image_to_dataloader_folders, crop_bottom_and_sides

# Single seed shared by every sampling step in this notebook.
seed = 1
# Seed the stdlib RNG as well so a Restart & Run All is repeatable.
# NOTE(review): assumes the data_utils split helpers (calculate_splits,
# determine_split) draw from the `random` module -- confirm in data_utils.
random.seed(seed)

In [2]:
# Working area for this run; the dataloader-ready tree is written to
# `output_dir` underneath it.
root_dir = '/media/alex/A4A034E0A034BB1E/incidents-thesis/test-run/'
output_dir = os.path.join(root_dir, 'incidents_cleaned')

# Relative weight of each split (the three values sum to 100). Passed to the
# data_utils split helpers in the cells below.
split_probabilities = {
    'train': 70,
    'val': 20,
    'test': 10,
}

## Positives

In [3]:
# Image database shared by all positive-image tables, and the name of the
# column in which each record's train/val/test assignment is stored.
database = '/media/alex/A4A034E0A034BB1E/incidents-thesis/correctimgs'
split_field_name = 'split'

**Geograph positives**

In [4]:
# Geograph positives: assign a split to every record of the table, then copy
# each image into the <split>/<class> dataloader folder structure.
image_table_name = 'incidents_geograph'
handler = data_utils.ImgDatabaseHandler(database)
geograph_root = '/media/alex/A4A034E0A034BB1E/Geograph'

# Everything that follows runs under try/finally so the database connection
# is closed even when a step fails part-way through.
try:
    # The split column only has to be added once. On later runs the call
    # fails (recorded output: "duplicate column name: split"); that error is
    # printed and deliberately ignored.
    try:
        handler.add_field(image_table_name, split_field_name)
    except Exception as e:
        print(e)

    # NOTE(review): presumably this re-draws the split of every record on each
    # run; images copied by an earlier run would then remain in their old
    # split folder -- clear output_dir before re-running. Confirm in data_utils.
    handler.calculate_splits(image_table_name, split_field_name, split_probabilities)
    records = handler.get_all_records(image_table_name)

    # record[4] is the image class; one folder per distinct class is created.
    all_recorded_classes = [str(record[4]) for record in records]
    unique_classes = set(all_recorded_classes)
    data_utils.create_dataloader_folders(root_dir, output_dir, unique_classes)

    for record in records:
        # record[0] is the image path relative to the Geograph root.
        filepath = os.path.join(geograph_root, record[0])
        # Arguments: dataloader_root, img_class, img_split (last field), img_path
        image_to_dataloader_folders(output_dir, record[4], record[-1], filepath, output_img_width=500)
finally:
    handler.db.close()

duplicate column name: split


**Scraped data**

In [5]:
# Scraped positives: assign a split to every record of the table, then copy
# each image into the <split>/<class> dataloader folder structure.
image_table_name = 'incidents'
handler = data_utils.ImgDatabaseHandler(database)
scraped_root = '/media/alex/A4A034E0A034BB1E/incidents-thesis/data'

# Everything that follows runs under try/finally so the database connection
# is closed even when a step fails part-way through.
try:
    # The split column only has to be added once. On later runs the call
    # fails (recorded output: "duplicate column name: split"); that error is
    # printed and deliberately ignored.
    try:
        handler.add_field(image_table_name, split_field_name)
    except Exception as e:
        print(e)

    # NOTE(review): presumably this re-draws the split of every record on each
    # run; images copied by an earlier run would then remain in their old
    # split folder -- clear output_dir before re-running. Confirm in data_utils.
    handler.calculate_splits(image_table_name, split_field_name, split_probabilities)
    records = handler.get_all_records(image_table_name)

    # record[4] is the image class; one folder per distinct class is created.
    all_recorded_classes = [str(record[4]) for record in records]
    unique_classes = set(all_recorded_classes)
    data_utils.create_dataloader_folders(root_dir, output_dir, unique_classes)

    for record in records:
        # record[0] is the image path relative to the scraped-data root.
        # The recorded run printed "file not found" for records whose file is
        # missing on disk and carried on with the next record.
        filepath = os.path.join(scraped_root, record[0])
        # Arguments: dataloader_root, img_class, img_split (last field), img_path
        image_to_dataloader_folders(output_dir, record[4], record[-1], filepath, output_img_width=500)
finally:
    handler.db.close()

duplicate column name: split
file not found: [Errno 2] No such file or directory: '/media/alex/A4A034E0A034BB1E/incidents-thesis/data/flooding/bing/overflowed road/1817AC6AE7A81D67A07AA0F0F738E4334838ACBF.jpg'

file not found: [Errno 2] No such file or directory: '/media/alex/A4A034E0A034BB1E/incidents-thesis/data/flooding/bing/overflowed road/E829B16BD3C0CE410C85B9BF8921A8F1215E4E74.jpg'

file not found: [Errno 2] No such file or directory: '/media/alex/A4A034E0A034BB1E/incidents-thesis/data/flooding/bing/overflowed route/E2E1AE4A03A0D839B93EF054A2035EA8237C8793.jpg'

file not found: [Errno 2] No such file or directory: '/media/alex/A4A034E0A034BB1E/incidents-thesis/data/flooding/google/flooding on highway/d6FDN5HDZ3.jpg'

file not found: [Errno 2] No such file or directory: '/media/alex/A4A034E0A034BB1E/incidents-thesis/data/flooding/google/flooding on highway/wp2gj2GSFn.jpg'

file not found: [Errno 2] No such file or directory: '/media/alex/A4A034E0A034BB1E/incidents-thesis/data/flo

**Multilingual scraped data**

In [6]:
# Multilingual positives: assign a split to every record of the table, then
# copy each image into the <split>/<class> dataloader folder structure.
image_table_name = 'incidents_multilingual'
handler = data_utils.ImgDatabaseHandler(database)
multilang_root = '/media/alex/A4A034E0A034BB1E/incidents-thesis/data/multilingual'

# Everything that follows runs under try/finally so the database connection
# is closed even when a step fails part-way through.
try:
    # The split column only has to be added once. On later runs the call
    # fails (recorded output: "duplicate column name: split"); that error is
    # printed and deliberately ignored.
    try:
        handler.add_field(image_table_name, split_field_name)
    except Exception as e:
        print(e)

    # NOTE(review): presumably this re-draws the split of every record on each
    # run; images copied by an earlier run would then remain in their old
    # split folder -- clear output_dir before re-running. Confirm in data_utils.
    handler.calculate_splits(image_table_name, split_field_name, split_probabilities)
    records = handler.get_all_records(image_table_name)

    # record[4] is the image class; one folder per distinct class is created.
    all_recorded_classes = [str(record[4]) for record in records]
    unique_classes = set(all_recorded_classes)
    data_utils.create_dataloader_folders(root_dir, output_dir, unique_classes)

    for record in records:
        # record[0] is the image path relative to the multilingual root.
        # The recorded run printed "file not found" for records whose file is
        # missing on disk and carried on with the next record.
        filepath = os.path.join(multilang_root, record[0])
        # Arguments: dataloader_root, img_class, img_split (last field), img_path
        image_to_dataloader_folders(output_dir, record[4], record[-1], filepath, output_img_width=500)
finally:
    handler.db.close()

duplicate column name: split
file not found: [Errno 2] No such file or directory: '/media/alex/A4A034E0A034BB1E/incidents-thesis/data/multilingual/landlide/bing/公路 塌方/04162DC6EB58A9AF7F567A895C5EBE8126C23835.jpg'

file not found: [Errno 2] No such file or directory: '/media/alex/A4A034E0A034BB1E/incidents-thesis/data/multilingual/landlide/bing/公路 塌方/087A5379C510D6D1EAFEF722F7494B579C5FBC1F.jpg'

file not found: [Errno 2] No such file or directory: '/media/alex/A4A034E0A034BB1E/incidents-thesis/data/multilingual/landlide/bing/公路 塌方/CEEB8B5F47029667206C476AC875791FCFE7C31B.jpg'

file not found: [Errno 2] No such file or directory: '/media/alex/A4A034E0A034BB1E/incidents-thesis/data/multilingual/landlide/bing/公路 塌方/D64C659B7F6A03E8C55E829B78754E98D5F05B67.jpg'

file not found: [Errno 2] No such file or directory: '/media/alex/A4A034E0A034BB1E/incidents-thesis/data/multilingual/landlide/bing/公路 塌方/D64C659B7F6A03E8C55EC409BC8CBF57C8AFC66B.jpg'

file not found: [Errno 2] No such file or dire



## Negatives

**Geograph negatives:** 9981 photos

In [4]:
# All negative images share one class; create its dataloader folders once,
# before any of the negative sources below are copied.
negatives_class = ['negative']
data_utils.create_dataloader_folders(
    root_dir,
    output_dir,
    negatives_class,
)

In [9]:
geograph_negatives = '/media/alex/A4A034E0A034BB1E/incidents-thesis/true-negatives/geograph/road-incidents-all/geophotos'

# Ask for 10000 images; on the recorded run the helper reported that the
# directory held fewer and kept all of them.
images = data_utils.sample_n_images_from_dir(geograph_negatives, seed, 10000)

# Each image gets its split drawn from the global 70/20/10 weights.
for count, image in enumerate(images, start=1):
    split = data_utils.determine_split(split_probabilities)
    image_to_dataloader_folders(output_dir, 'negative', split, image, output_img_width=500)
    # Progress marker every 1000 images.
    if count % 1000 == 0:
        print(count)

Not enough images in the set, retaining all (9981)
999
1999
2999
3999
4999
5999
6999
7999
8999


**Cityscapes:** 10k photos

In [10]:
cityscape_negatives = '/media/alex/A4A034E0A034BB1E/incidents-thesis/true-negatives/cityscapes'

# Sample 10000 images from the Cityscapes directory using the shared seed.
images = data_utils.sample_n_images_from_dir(cityscape_negatives, seed, 10000)

# Each image gets its split drawn from the global 70/20/10 weights.
for count, image in enumerate(images, start=1):
    split = data_utils.determine_split(split_probabilities)
    image_to_dataloader_folders(output_dir, 'negative', split, image, output_img_width=500)
    # Progress marker every 1000 images.
    if count % 1000 == 0:
        print(count)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000


**Berkeley DeepDrive (BDD100K):** 20k photos (14k train, 4k val, 2k test)

In [11]:
bdd_train_negatives = '/media/alex/A4A034E0A034BB1E/incidents-thesis/true-negatives/bdd100k/images/100k/train'

# Sample 14000 images from BDD100K's own train partition.
images = data_utils.sample_n_images_from_dir(bdd_train_negatives, seed, 14000)

# All weight on 'train': images from the BDD train folder are only ever
# offered the train split.
train = {
    'train': 100,
    'val': 0,
    'test': 0,
}

for count, image in enumerate(images, start=1):
    split = data_utils.determine_split(train)
    # crop_bottom=0.25 is forwarded to the helper; presumably it trims the
    # bottom quarter of each frame -- confirm in image_manipulations.
    image_to_dataloader_folders(output_dir, 'negative', split, image, output_img_width=500, crop_bottom=0.25)
    # Progress marker every 1000 images.
    if count % 1000 == 0:
        print(count)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000


In [12]:
bdd_val_negatives = '/media/alex/A4A034E0A034BB1E/incidents-thesis/true-negatives/bdd100k/images/100k/val'

# Sample 4000 images from BDD100K's own val partition.
images = data_utils.sample_n_images_from_dir(bdd_val_negatives, seed, 4000)

# All weight on 'val': images from the BDD val folder are only ever offered
# the val split.
val = {
    'train': 0,
    'val': 100,
    'test': 0,
}

for count, image in enumerate(images, start=1):
    split = data_utils.determine_split(val)
    # crop_bottom=0.25 is forwarded to the helper; presumably it trims the
    # bottom quarter of each frame -- confirm in image_manipulations.
    image_to_dataloader_folders(output_dir, 'negative', split, image, output_img_width=500, crop_bottom=0.25)
    # Progress marker every 1000 images.
    if count % 1000 == 0:
        print(count)

1000
2000
3000
4000


In [13]:
bdd_test_negatives = '/media/alex/A4A034E0A034BB1E/incidents-thesis/true-negatives/bdd100k/images/100k/test'

# Sample 2000 images from BDD100K's own test partition.
images = data_utils.sample_n_images_from_dir(bdd_test_negatives, seed, 2000)

# All weight on 'test': images from the BDD test folder are only ever offered
# the test split.
test = {
    'train': 0,
    'val': 0,
    'test': 100,
}

for count, image in enumerate(images, start=1):
    split = data_utils.determine_split(test)
    # crop_bottom=0.25 is forwarded to the helper; presumably it trims the
    # bottom quarter of each frame -- confirm in image_manipulations.
    image_to_dataloader_folders(output_dir, 'negative', split, image, output_img_width=500, crop_bottom=0.25)
    # Progress marker every 1000 images.
    if count % 1000 == 0:
        print(count)

1000
2000


### Replace table in case of faulty splits

https://stackoverflow.com/questions/8442147/how-to-delete-or-add-column-in-sqlite

-- Rebuild the Geograph table without the split column: create a fresh table
-- with the original five columns and copy the rows across.
CREATE TABLE incidents_geo (
 img_id VARCHAR PRIMARY KEY,
 lat REAL,
 long REAL,
 datetime VARCHAR,
 -- TEXT rather than STRING: SQLite gives a column declared STRING NUMERIC
 -- affinity, so numeric-looking class values would be stored as numbers.
 class TEXT
);

INSERT INTO incidents_geo
SELECT img_id, lat, long, datetime, class FROM incidents_geograph;

-- NOTE(review): the notebook reads from 'incidents_geograph', so the
-- replacement presumably still needs the swap below once the copy has been
-- verified -- confirm before running, the DROP is irreversible:
-- DROP TABLE incidents_geograph;
-- ALTER TABLE incidents_geo RENAME TO incidents_geograph;