# Approved Crop KMZ to CSV

<a target="_blank" href="https://colab.research.google.com/github/nasaharvest/street2sat/blob/main/notebooks/CropKMZtoCSV.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

**Author**: Ivan Zvonkov

**Last Modified**: Apr 4, 2025

**Description**: Converts approved KMZ to csv.

In [1]:
from bs4 import BeautifulSoup
from google.colab import drive
from pathlib import Path
from tqdm import tqdm

import json
import pandas as pd
import xml.etree.ElementTree as ET
import zipfile


drive.mount('/content/drive')

Mounted at /content/drive


## 1. Convert each KMZ into a CSV file

In [7]:
SRC_KMZ_FOLDER = "/content/drive/MyDrive/Street2Sat/KMZ/Checked"
!ls "{SRC_KMZ_FOLDER}"

Senegal_ADM1_1377_ADM2_1649_95_background_0_100.kmz
Senegal_ADM1_1377_ADM2_1649_95_background_1000_1100.kmz
Senegal_ADM1_2644_ADM2_25345_95_background_0_100.kmz
Senegal_ADM1_2644_ADM2_25345_95_background_100_135.kmz


In [8]:
SRC_KMZ_FILES = list(Path(SRC_KMZ_FOLDER).glob("*.kmz"))

In [25]:
from bs4 import BeautifulSoup
from google.colab import drive
from pathlib import Path
from tqdm import tqdm

import json
import pandas as pd
import xml.etree.ElementTree as ET
import zipfile


drive.mount('/content/drive')

def get_points_from_kmz(kmz_file_path):
    with zipfile.ZipFile(kmz_file_path, 'r') as kmz:
        kml_filename = [name for name in kmz.namelist() if name.endswith('.kml')][0]
        kml_data = kmz.read(kml_filename)

    # Convert KMZ file to KMZ points list
    namespace = {'kml': 'http://www.opengis.net/kml/2.2'}
    kmz_points = []

    root = ET.fromstring(kml_data)
    for placemark in root.findall('.//kml:Placemark', namespace):
        kmz_element = {}
        for child in placemark.iter():
            key = child.tag.replace('{http://www.opengis.net/kml/2.2}', '')
            kmz_element[key] = child.text

        kmz_points.append(kmz_element)

    return kmz_points

def kmz_points_to_dataframe(kmz_points):
    points = []

    for kmz_point in tqdm(kmz_points): # Added tqdm here to track progress and see which point is causing the error

        row = {}

        soup = BeautifulSoup(kmz_point["description"], 'html.parser')
        row["capture_info"] = soup.find_all('h2')[0].text
        row["capture_time"] = soup.find_all('p')[0].text.split(": ")[1]
        row["image_url"] = soup.find('a')['href']

        # Driving direction details
        direction_data = soup.find_all('h2')[2].next_siblings
        direction_data = [item for item in direction_data if item.name == 'p']

        row["driving_northing"] = float(direction_data[0].text.split(": ")[1])
        row["driving_easting"] = float(direction_data[1].text.split(": ")[1])
        row["is_right_hand_drive"] = direction_data[2].text.split(": ")[1] == 'True'

        # Location details
        location_data = soup.find_all('h2')[1].next_siblings
        location_data = [item for item in location_data if item.name == 'p']
        row["adm1"] = location_data[0].text.split(": ")[1]
        row["adm2"] = location_data[1].text.split(": ")[1]

        def lat_lon_parse(lat_lon_str_element):
            # Extract the part after the colon and split by comma
            lat_lon_str = lat_lon_str_element.text.split(": ")[1].strip()
            print(f"Parsing string: {lat_lon_str}")
            # Remove 'np.float64' and any leading/trailing whitespace, then replace parentheses and brackets
            lat_lon_values = lat_lon_str.replace('np.float64', '').strip().replace('(', '').replace(')', '').replace('[', '').replace(']', '').split(',')
            # Convert to floats and return as a list
            return [float(val.strip()) for val in lat_lon_values]

        road_lat_lon = lat_lon_parse(location_data[2])
        field_lat_lon = lat_lon_parse(location_data[3])

        crop_type = kmz_point["name"].strip().lower().replace("\u200b", "")
        crop_point = {
            "latitude": field_lat_lon[0],
            "longitude": field_lat_lon[1],
            "is_crop": 1,
            "is_maize": int(crop_type == "maize"),
            "crop_type": crop_type,

            **row,
        }

        non_crop_point = {
            "latitude": road_lat_lon[0],
            "longitude": road_lat_lon[1],
            "is_crop": 0,
            "is_maize": 0,
            "crop_type": "",
            **row,
        }

        points.append(crop_point)
        points.append(non_crop_point)

    df = pd.DataFrame(points)
    df["gcloud_folder"] = df["image_url"].str.extract(r'street2sat-uploaded/([^/]+/[^/]+)')
    return df

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [27]:
dfs = []
for kmz_file_path in tqdm(SRC_KMZ_FILES):
    kmz_points = get_points_from_kmz(kmz_file_path)
    df = kmz_points_to_dataframe(kmz_points)
    dfs.append(df)

  0%|          | 0/4 [00:00<?, ?it/s]
100%|██████████| 22/22 [00:00<00:00, 1039.02it/s]


Parsing string: 13.649130999999999, -14.2918199
Parsing string: (np.float64(13.64913875025419), np.float64(-14.291988787823726))
Parsing string: 13.6577868, -14.2905884
Parsing string: (np.float64(13.657815212732384), np.float64(-14.290760939473442))
Parsing string: 13.648949999972222, -14.2918149
Parsing string: (np.float64(13.648957919037585), np.float64(-14.291989327040818))
Parsing string: 13.636690399999999, -14.291642599972223
Parsing string: (np.float64(13.636659808990913), np.float64(-14.291471275550043))
Parsing string: 13.649503299972222, -14.291818199972223
Parsing string: (np.float64(13.649500412679453), np.float64(-14.291987709366104))
Parsing string: 13.634639199999999, -14.2914417
Parsing string: (np.float64(13.634670136461928), np.float64(-14.291292306807852))
Parsing string: 15.0493255, -16.724997499999997
Parsing string: (np.float64(15.049492549772546), np.float64(-16.72493433987417))
Parsing string: 13.6548893, -14.2917296
Parsing string: (np.float64(13.6548336129002


100%|██████████| 19/19 [00:00<00:00, 1007.06it/s]


Parsing string: 13.650225, -14.291833199972222
Parsing string: (np.float64(13.650223737498171), np.float64(-14.291985552356895))
Parsing string: 13.657282999972223, -14.2906712
Parsing string: (np.float64(13.657363399381167), np.float64(-14.290854748752157))
Parsing string: 13.648207499972221, -14.2917632
Parsing string: (np.float64(13.648234594144654), np.float64(-14.291991483830799))
Parsing string: 13.633662299972222, -14.291244799972223
Parsing string: (np.float64(13.633494995925988), np.float64(-14.291388259905915))
Parsing string: 13.635517499999999, -14.2915782
Parsing string: (np.float64(13.635574557069754), np.float64(-14.291382060473714))
Parsing string: 13.648323899972223, -14.291770099999999
Parsing string: (np.float64(13.64832500975859), np.float64(-14.291991214238893))
Parsing string: 15.049197899972222, -16.724627799999997
Parsing string: (np.float64(15.049405005370172), np.float64(-16.724561629532655))
Parsing string: 13.6560959, -14.2912116
Parsing string: (np.float64(


100%|██████████| 17/17 [00:00<00:00, 1055.19it/s]


Parsing string: 15.0455524, -16.787483699972224
Parsing string: (np.float64(15.045392917906558), np.float64(-16.787582262942134))
Parsing string: 14.9487337, -16.75219829997222
Parsing string: (np.float64(14.948878039507425), np.float64(-16.752014977355344))
Parsing string: 14.948915299972223, -16.7523686
Parsing string: (np.float64(14.949057353646362), np.float64(-16.752202346371952))
Parsing string: 14.948852699972223, -16.752306999972223
Parsing string: (np.float64(14.948966983480293), np.float64(-16.752201612863754))
Parsing string: 15.0377322, -16.788817
Parsing string: (np.float64(15.037793247619652), np.float64(-16.788634849170766))
Parsing string: 14.9488309, -16.7522868
Parsing string: (np.float64(14.9490566404837), np.float64(-16.752295297444856))
Parsing string: 14.948625099972222, -16.75209339997222
Parsing string: (np.float64(14.948788382366637), np.float64(-16.751921292963605))
Parsing string: 15.046270899972223, -16.787287899972224
Parsing string: (np.float64(15.04620988


100%|██████████| 9/9 [00:00<00:00, 834.58it/s]
100%|██████████| 4/4 [00:00<00:00, 33.09it/s]

Parsing string: 14.948819999972223, -16.7522759
Parsing string: (np.float64(14.948967696600667), np.float64(-16.75210866182477))
Parsing string: 14.9487944, -16.75225239997222
Parsing string: (np.float64(14.948877326429342), np.float64(-16.752107928360388))
Parsing string: 14.948603399972223, -16.7520735
Parsing string: (np.float64(14.948698012184803), np.float64(-16.751920559586853))
Parsing string: 15.037328999972223, -16.7887875
Parsing string: (np.float64(15.037341409226151), np.float64(-16.788631082183425))
Parsing string: 15.0375931, -16.7887981
Parsing string: (np.float64(15.037612512264516), np.float64(-16.788633342360644))
Parsing string: 15.0455521, -16.787483399972224
Parsing string: (np.float64(15.045392917906558), np.float64(-16.787582262942134))
Parsing string: 14.9487026, -16.7521686
Parsing string: (np.float64(14.948787669330846), np.float64(-16.752014243934752))
Parsing string: 14.9488652, -16.75231899997222
Parsing string: (np.float64(14.948966983480293), np.float64(-




## 2. Merge CSV files into single file

In [28]:
df = pd.concat(dfs, ignore_index=True)

In [29]:
df.head()

Unnamed: 0,latitude,longitude,is_crop,is_maize,crop_type,capture_info,capture_time,image_url,driving_northing,driving_easting,is_right_hand_drive,adm1,adm2,gcloud_folder
0,13.649139,-14.291989,1,0,sorghum,2024-10-23-G0046689,2024-10-23 17:18:41,https://storage.cloud.google.com/street2sat-up...,3.594264,-0.086196,False,1377,1649,SENEGAL/2024-10-23
1,13.649131,-14.29182,0,0,,2024-10-23-G0046689,2024-10-23 17:18:41,https://storage.cloud.google.com/street2sat-up...,3.594264,-0.086196,False,1377,1649,SENEGAL/2024-10-23
2,13.657815,-14.290761,1,1,maize,2024-10-23-G0047004,2024-10-23 17:21:18,https://storage.cloud.google.com/street2sat-up...,8.112104,1.739239,False,1377,1649,SENEGAL/2024-10-23
3,13.657787,-14.290588,0,0,,2024-10-23-G0047004,2024-10-23 17:21:18,https://storage.cloud.google.com/street2sat-up...,8.112104,1.739239,False,1377,1649,SENEGAL/2024-10-23
4,13.648958,-14.291989,1,0,sorghum,2024-10-23-G0046683,2024-10-23 17:18:38,https://storage.cloud.google.com/street2sat-up...,6.015127,-0.525902,False,1377,1649,SENEGAL/2024-10-23


In [30]:
df[df["is_crop"] == 1]["gcloud_folder"].value_counts()

Unnamed: 0_level_0,count
gcloud_folder,Unnamed: 1_level_1
SENEGAL/2024-10-23,41
SENEGAL/2024-10-22,26


In [38]:
df_2024 = df[df["gcloud_folder"].str.contains("2024")]
df_2023 = df[df["gcloud_folder"].str.contains("2023")]

In [39]:
df_2024[["is_crop", "crop_type"]].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
is_crop,crop_type,Unnamed: 2_level_1
0,,67
1,sorghum,25
1,millet,21
1,cassava,11
1,maize,10


In [40]:
df_2023[["is_crop", "crop_type"]].value_counts()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
is_crop,crop_type,Unnamed: 2_level_1


In [43]:
df_2024.to_csv(f"/content/drive/MyDrive/Street2Sat/KMZ/CSV/Senegal_2024_batch202401.csv", index=False)