In [1]:
from pyproj import Proj, transform
import xml.etree.ElementTree as ET
import tqdm
import urllib.request
from PIL import Image
import os

# Image quisation from Open Bayern Data
https://atlas.bayern.de/?c=670229,5335010&z=10&l=vt_luftbild,https%3A%2F%2Fgeodaten.bayern.de%2Fodd%2Fa%2Fdop40%2Fmeta%2Fkml%2Fgemeinde.kml&t=ba

In this 

## Downloading the images
We iterate through the metalink.meta4 file and make each urllink in a list. Afterwards this list is use to dowload all the images which are in 1km x 1km of munich in a 2500px x 2500px resolution which means that one pixel covers 0.4m in real.

In [None]:

# add folder path to project
folder_path = "C:\Projekte\TDS\TDS2324-TrafficAccidents\Data\Images\\1km_img"

# create folder if not exists
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

# getting list of all urls
tree = ET.parse('C:\Projekte\TDS\TDS2324-TrafficAccidents\Data\metalink.meta4')
urls = []

for file_element in tree.findall('.//{urn:ietf:params:xml:ns:metalink}url'):
    if any(file_element.text.split("data")[1] in s for s in urls):
      continue
    else:
      urls.append(file_element.text)

# downloading all images
for url in tqdm.tqdm(urls):
  file_name = url.split("data")[1]
  if os.path.isfile(folder_path + file_name):
    continue
  else:
    urllib.request.urlretrieve(url, folder_path + file_name)

## Rename Files 
First we rename the files in the epsg:25832 formatsince they are not correctly in this format yet

In [None]:
filepath = 'C:\\Projekte\\TDS\\TDS2324-TrafficAccidents\\Data\\Images\\1km_img'
# method to get coordinates from filename
def get_coordinates(filename):
  x = filename.split("_")[0]
  y = filename.split("_")[1].split(".")[0]

  # remove leading '32' from x and add 3 zeros
  x = x[2:]
  x = x + "000"

  # add 3 zeros to y
  y = y + "000"
  return x, y

# Iterate through files in filepath and rename them to coordinates
for filename in os.listdir(filepath):
  x, y = get_coordinates(filename)
  old_filepath = filepath + "\\" + filename
  new_filepath = filepath + "\\" + x + "_" + y + ".tif"

  os.rename(old_filepath, new_filepath)


## Slicing in 40m x 40m images
Now we slice the images in a 40m x 40m image

In [48]:
folder_path_1km = 'C:\\Projekte\\TDS\\TDS2324-TrafficAccidents\\Data\\Images\\1km_img'

# Create folder for sliced pictures
folder_path_40m = 'C:\\Projekte\\TDS\\TDS2324-TrafficAccidents\\Data\\Images\\40m_img'
os.makedirs(folder_path_40m, exist_ok=True)


# method to get coordinates from a given filename
def get_coordinates(filename):
    # Splitting using underscores and removing the file extension
    parts = filename.split("_")
    
    # Extracting x and y coordinates
    x = parts[0]
    y = parts[1].split(".")[0]
    
    return x, y

# Iterate through files in the folder
for filename in tqdm.tqdm(os.listdir(folder_path_1km)):
  
    # subfolder in which the sliced pictures will be saved, named after original picture so they are sorted
    new_foldername = folder_path_40m + "\\" + filename.rsplit(".")[0]
    os.makedirs(new_foldername, exist_ok=True)

    # get coordinates from filename
    x, y = get_coordinates(filename)

    # convert x,y in int
    x = int(x)
    y = int(y)

    # Since we start with the upper left corner, we need to add 1000 to y
    y = y + 1000

    # open image
    img = Image.open(folder_path_1km + "\\" + filename)

    # get width and height of image
    width, height = img.size

    y2 = y
    # iterate through image and slice it
    for i in range(0, width, 100):
        for j in range(0, height, 100):
            y2 = int(y2) - 40 
            # get bounding box for slicing
            box = (i, j, i+100, j+100)
            # slice image
            a = img.crop(box)
            # save image
            image_name = str(x) + "_" + str(y2) + ".tif" 
            a.save(new_foldername + "\\" + image_name)
            
        x = x + 40 
        y2 = y

  0%|          | 0/374 [00:00<?, ?it/s]

100%|██████████| 374/374 [13:03<00:00,  2.09s/it]


## Categorize 40m Images

1. Get Munich data
2. Add feature: epsg:25832 
    - Coordinates in epsg:25832 of the accident
3. Add feature: image
    - Adding the image of the accident
4. Create new DF 'accident_count' 
    - Includes amount of accident foreach picture
5. Show distribution of the accident_amount per picture
6. Categorize each picture and rename it according its category

### 1: Get Munich data

In [12]:
import pandas as pd

# Load the CSV file
data = pd.read_csv('C:\Projekte\TDS\TDS2324-TrafficAccidents\Data\\all_16_22.csv', dtype={'ags': str})

# saving the data for the city of munich and make new index
munich = data[data['ags'] == '09162000'].reset_index(drop=True)

# drop all columns except for the ones we need
munich = munich[['xgcswgs84', 'ygcswgs84']]

# Show the resulting dataframe
print(munich.head(10))


   xgcswgs84  ygcswgs84
0  11.615121  48.112151
1  11.524172  48.127725
2  11.536799  48.146921
3  11.695126  48.115499
4  11.613230  48.186879
5  11.477048  48.115608
6  11.487404  48.086448
7  11.671945  48.119598
8  11.527980  48.079697
9  11.441178  48.145657
(33806, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33806 entries, 0 to 33805
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   xgcswgs84  33806 non-null  float64
 1   ygcswgs84  33806 non-null  float64
dtypes: float64(2)
memory usage: 528.3 KB
None


### 2: Add epsg:25832 coordinates
We add feature x_epsg:25832 and y_epsg:25832, so we can check in which picture our accident happened.

In [14]:
from pyproj import Proj, transform

# Define the coordinate systems
wgs84 = Proj(init='epsg:4326')
epsg25832 = Proj(init='epsg:25832')

# Convert the coordinates from WGS84 to epsg:25832
munich['x_epsg:25832'], munich['y_epsg:25832'] = transform(wgs84, epsg25832, munich['xgcswgs84'], munich['ygcswgs84'])

# Show the updated dataframe
print(munich.head())



   xgcswgs84  ygcswgs84   x_epsg:25832  y_epsg:25832
0  11.615121  48.112151  694645.808823  5.332073e+06
1  11.524172  48.127725  687820.079094  5.333578e+06
2  11.536799  48.146921  688689.196965  5.335742e+06
3  11.695126  48.115499  700587.159319  5.332651e+06
4  11.613230  48.186879  694222.709996  5.340374e+06


  in_crs_string = _prepare_from_proj_string(in_crs_string)
  in_crs_string = _prepare_from_proj_string(in_crs_string)
  munich['x_epsg:25832'], munich['y_epsg:25832'] = transform(wgs84, epsg25832, munich['xgcswgs84'], munich['ygcswgs84'])


### 3: Add image to accident
We add to our munich file the image in which the accident happened.

In [25]:
folder_path_40m = 'C:\\Projekte\\TDS\\TDS2324-TrafficAccidents\\Data\\Images\\40m_img'

def in_img(x_crd_acc, y_crd_acc, x_crd_img, y_crd_img, crd_range):
    return (x_crd_acc >= x_crd_img) and (x_crd_acc <= x_crd_img + crd_range) and (y_crd_acc >= y_crd_img) and (y_crd_acc <= y_crd_img + crd_range)

def get_coordinates_subfolder(filename):
    parts = filename.split("_")
    x = int(parts[0])
    y = int(parts[1])
    return x, y

def get_coordinates(filename):
    parts = filename.split("_")
    x = int(parts[0])
    y = int(parts[1].split(".")[0])
    
    return x, y

munich['image_name'] = ""

for index, row in tqdm.tqdm(munich.iterrows()):
    for folder in os.listdir(folder_path_40m):
        x_crd_img, y_crd_img = get_coordinates_subfolder(folder)
        if in_img(row['x_epsg:25832'], row['y_epsg:25832'], x_crd_img, y_crd_img, 1000):
            for img in os.listdir(os.path.join(folder_path_40m, folder)):
                x_crd_img, y_crd_img = get_coordinates(img)
                if in_img(row['x_epsg:25832'], row['y_epsg:25832'], x_crd_img, y_crd_img, 40):
                    munich.at[index, 'image_name'] = os.path.join(img)
                    break
            else:
                continue
            break

33806it [02:43, 206.96it/s]


In [26]:
munich.sample(10)

Unnamed: 0,xgcswgs84,ygcswgs84,x_epsg:25832,y_epsg:25832,image_name
1585,11.646075,48.138678,696848.106324,5335100.0,696840_5335080.tif
23670,11.559967,48.125659,690491.020802,5333437.0,690480_5333400.tif
9025,11.562784,48.141619,690641.458003,5335217.0,690640_5335200.tif
32743,11.559102,48.147515,690345.806833,5335863.0,690320_5335840.tif
48,11.479691,48.088583,684650.745936,5329120.0,684640_5329120.tif
27923,11.565904,48.188016,690701.405508,5340381.0,690680_5340360.tif
23276,11.57893,48.145205,691829.103917,5335656.0,691800_5335640.tif
20087,11.45782,48.206111,682604.609886,5342130.0,682600_5342120.tif
1558,11.479283,48.123307,684496.022227,5332979.0,684480_5332960.tif
28240,11.590973,48.132637,692771.921909,5334289.0,692760_5334280.tif


### 4: Count accidents per image
We check how many accidents happened in each image so we have a basis to categorize the images.

In [32]:
# Create a new dataframe to store the count of each image
image_count = pd.DataFrame(munich['image_name'].value_counts())

# Rename the column to 'count'
image_count.columns = ['count']

# Show the resulting dataframe
print(image_count.head(10))


                    count
image_name               
687440_5336040.tif     51
686200_5334360.tif     50
690120_5335240.tif     48
686240_5331760.tif     47
689680_5331880.tif     46
694960_5332960.tif     43
690120_5334720.tif     41
690680_5338120.tif     41
691800_5335640.tif     39
691640_5332760.tif     37


Since we want to train our model also with images that do not have any accidents at all we add those to our dataset image_count aswell.

In [34]:
folder_path_40m = 'C:\\Projekte\\TDS\\TDS2324-TrafficAccidents\\Data\\Images\\40m_img'

# Iterate through subfolders
for folder in tqdm.tqdm(os.listdir(folder_path_40m)):
    subfolder_path = os.path.join(folder_path_40m, folder)
    
    # Iterate through images in subfolder
    for image in os.listdir(subfolder_path):
        image_path = os.path.join(subfolder_path, image)
        
        # Check if image is not in munich
        if image not in image_count.index:
            # Add image to image_count with count 0
            image_count.loc[image] = 0
           

# Show the updated image_count dataframe
print(image_count)


100%|██████████| 374/374 [1:06:48<00:00, 10.72s/it]

                    count
image_name               
687440_5336040.tif     51
686200_5334360.tif     50
690120_5335240.tif     48
686240_5331760.tif     47
689680_5331880.tif     46
...                   ...
702960_5335800.tif      0
702960_5335840.tif      0
702960_5335880.tif      0
702960_5335920.tif      0
702960_5335960.tif      0

[233751 rows x 1 columns]





### 5: Categorize the images

In [38]:
count_frequency = image_count['count'].value_counts()
print(count_frequency)


count
0     219992
1       7618
2       2544
3       1213
4        694
5        439
6        315
7        221
8        147
9        125
10        85
12        64
11        55
13        37
14        32
15        26
16        24
17        16
18        16
19        15
20        11
22         9
21         7
24         6
25         4
26         4
30         4
23         4
27         3
28         3
29         3
32         2
33         2
41         2
43         1
48         1
47         1
46         1
50         1
39         1
37         1
31         1
51         1
Name: count, dtype: int64


Creating categories on count fequency

In [39]:
for index, row in image_count.iterrows():
    image_name = index
    count = row['count']
    # add feature category to image_count dataframe
    if count == 0:
        image_count.at[image_name, 'category'] = '0'
    elif count == 1:
        image_count.at[image_name, 'category'] = '1'
    elif count == 2:
        image_count.at[image_name, 'category'] = '2'
    elif count == 3 or count == 4:
        image_count.at[image_name, 'category'] = '3'
    else :
        image_count.at[image_name, 'category'] = '4'

    

In [49]:
accidents_per_category = image_count['category'].value_counts()
print(accidents_per_category)


category
0    219992
1      7618
2      2544
3      1907
4      1690
Name: count, dtype: int64


### 6: Imageselection
We choose foreach category 1500 images and make them in seperated folders.

In [53]:
import os
import random
import shutil

# Define the main folder path
main_folder_path = 'C:\\Projekte\\TDS\\TDS2324-TrafficAccidents\\Data\\Images\\model_img'
folder_path_40m = 'C:\\Projekte\\TDS\\TDS2324-TrafficAccidents\\Data\\Images\\40m_img'

# Create the main folder if it doesn't exist
if not os.path.exists(main_folder_path):
    os.makedirs(main_folder_path)


def find_subfolder(image_name):
    try:
        parts = image_name.split("_")
        x = parts[0]
        y = parts[1].split(".")[0]

        # replace last 3 digits with 000
        x = x[:-3] + "000"
        y = y[:-3] + "000"

        subfolder = x + "_" + y
        return subfolder
    except Exception as e:
        print(f"An error occurred with image_name: {image_name}")
        return None

# Iterate through each category
for category in image_count['category'].unique():
    # Create the subfolder path for the category
    subfolder_path = os.path.join(main_folder_path, category)
    
    # Create the subfolder if it doesn't exist
    if not os.path.exists(subfolder_path):
        os.makedirs(subfolder_path)
    
    # Get the images for the category
    category_images = image_count[image_count['category'] == category].index.tolist()
    
    # Randomly select 1500 images from the category
    selected_images = random.sample(category_images, 1500)
    
    # Move the selected images to the subfolder
    for image in selected_images:
        try:
            image_path = os.path.join(folder_path_40m, find_subfolder(image), image)
            new_image_path = os.path.join(subfolder_path, image)
            shutil.copy(image_path, new_image_path)
        except Exception as e:
            print(f"An error occurred with image: {image}, folder_path_40m: {folder_path_40m}, subfolder_path: {subfolder_path}")
            continue

