In [4]:
# only execute this when running on GPU Hub
%pip install pandas folium matplotlib

Collecting matplotlib
  Downloading matplotlib-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.54.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (163 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.7-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.3 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-11.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (9.1 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Downloading pyparsing-3.2.0-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.9.2-cp311-cp311-manylinux_2_17_x86_64.manyli

In [1]:
import pandas as pd
import folium
import os.path

from folium.plugins import MarkerCluster
from tqdm import tqdm

In [2]:
# Load the raw xeno-canto parquet file and remember the initial row count so the
# share of removed recordings can be reported at the end.
file_location = "../data/raw/xeno-canto.parquet"
df = pd.read_parquet(path=file_location)
df_count_begin = df.shape[0]

## Drop entries with an unknown species

In [3]:
# count the rows whose "en" value is the placeholder "Identity unknown"
is_unknown = df["en"].eq("Identity unknown")
unknown_species = df.loc[is_unknown]
print(f"Number of rows with unknown species: {len(unknown_species)}")

Number of rows with unknown species: 3853


In [4]:
# keep only the rows whose "en" value is not "Identity unknown"
df = df.loc[df["en"].ne("Identity unknown")]

## Drop recordings with missing location

In [5]:
# check number of recordings with missing coordinates
# A coordinate counts as missing when it is null or an empty string
# (isnull() is an alias of isna(), so one null check per column is enough).
lat_missing = df["lat"].isna() | (df["lat"].str.len() == 0)
lng_missing = df["lng"].isna() | (df["lng"].str.len() == 0)
missing_coordinates = df[lat_missing | lng_missing]
print(f"Number of rows with missing coordinates: {len(missing_coordinates)}")

Number of rows with missing coordinates: 980


In [6]:
# keep only the rows where both coordinates are present and non-empty
has_lat = df["lat"].notna() & (df["lat"].str.len() > 0)
has_lng = df["lng"].notna() & (df["lng"].str.len() > 0)
df = df[has_lat & has_lng]

In [7]:
# convert both coordinate columns to floats in a single step
df = df.astype({"lng": float, "lat": float})

## Drop recordings from exotic locations

In [8]:
def plot_map(df):
    """Build a folium map with one clustered marker per recording.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "lat" and "lng" columns. Rows where either value is
        missing are skipped.

    Returns
    -------
    folium.Map
        Map containing a single MarkerCluster layer with all markers.
    """
    df_map = df.dropna(subset=["lat", "lng"])

    m = folium.Map()
    marker_cluster = MarkerCluster().add_to(m)

    # Walk the two coordinate columns directly: iterrows() builds a Series for
    # every row, which is needlessly slow for tens of thousands of markers.
    for lat, lng in zip(df_map["lat"], df_map["lng"]):
        folium.Marker(location=[lat, lng]).add_to(marker_cluster)

    return m

In [9]:
# plot all recordings on a world map
# (named recordings_map so the builtin `map` is not shadowed)
recordings_map = plot_map(df)
recordings_map.save("map.html")

In [10]:
# count the recordings whose coordinates fall outside the bounding box used for
# mainland Europe (longitude -60 .. 51.83, latitude 0 .. 70.9)
values_too_east = df.loc[df["lng"].gt(51.83)]
values_too_west = df.loc[df["lng"].lt(-60)]
values_too_south = df.loc[df["lat"].lt(0)]
values_too_north = df.loc[df["lat"].gt(70.9)]

print(f"Values that are too east: {len(values_too_east)}")
print(f"Values that are too west: {len(values_too_west)}")
print(f"Values that are too south: {len(values_too_south)}")
print(f"Values that are too north: {len(values_too_north)}")

Values that are too east: 211
Values that are too west: 1
Values that are too south: 6
Values that are too north: 59


In [11]:
# drop those recordings
# Merge the four index sets first: a recording can violate two bounds at once
# (e.g. too far east and too far north), and chained .drop() calls raise a
# KeyError as soon as a label has already been removed by an earlier call.
out_of_bounds_index = (
    values_too_east.index
    .union(values_too_west.index)
    .union(values_too_south.index)
    .union(values_too_north.index)
)
df = df.drop(out_of_bounds_index)

In [12]:
# plot the map again
# (named filtered_map so the builtin `map` is not shadowed)
filtered_map = plot_map(df)
filtered_map.save("map_filtered.html")

## Drop species with fewer than 25 recordings

In [13]:
# number of recordings per "en" value, most frequent first
recordings_per_species = df.loc[:, "en"].value_counts()

In [14]:
# identify species with fewer than the minimum number of recordings
MIN_RECORDINGS_PER_SPECIES = 25
species_with_few_recordings = recordings_per_species[
    recordings_per_species < MIN_RECORDINGS_PER_SPECIES
].index
# The message reports the species *below* the threshold, which is what is counted here.
print(f"Species with fewer than {MIN_RECORDINGS_PER_SPECIES} recordings: {len(species_with_few_recordings)}")

Species with more than 25 recordings: 275


In [15]:
# keep only the recordings whose species is not in the few-recordings set
is_rare_species = df["en"].isin(species_with_few_recordings)
df = df.loc[~is_rare_species]

## Drop recordings without an audio file

In [16]:
# count the recordings whose file url is an empty string
df["file"].str.len().eq(0).sum()

np.int64(0)

In [17]:
# same check, this time for null file urls
df["file"].isna().sum()

np.int64(0)

In [18]:
# loop through all recordings and check if the audio file exists

def check_audio_file_exists(file_name, audio_dir="../data/raw/audio/xeno_canto") -> bool:
    """Return whether `file_name` exists inside `audio_dir`.

    Parameters
    ----------
    file_name : str
        Name of the audio file, e.g. "<recording id>.<extension>".
    audio_dir : str, optional
        Directory that is searched. Defaults to the xeno-canto audio folder
        used by this notebook, so existing one-argument calls are unchanged.

    Returns
    -------
    bool
        True if the path exists, False otherwise.
    """
    return os.path.exists(f"{audio_dir}/{file_name}")

# Collect the ids of recordings whose audio file is not on disk.
# Files are looked up as "<id>.<extension of the original file name>".
missing_audio_files = []
# Iterate over the two needed columns only; iterrows() would build a Series per row.
for recording_id, original_audio_file_name in tqdm(
    zip(df["id"], df["file-name"]), total=len(df)
):
    file_extension = original_audio_file_name.split(".")[-1]
    new_audio_file_name = f"{recording_id}.{file_extension}"

    if not check_audio_file_exists(new_audio_file_name):
        missing_audio_files.append(recording_id)

print(f"Number of missing audio files: {len(missing_audio_files)}")
missing_audio_files

100%|██████████| 82691/82691 [00:04<00:00, 17639.88it/s]

Number of missing audio files: 5





['357351', '441473', '516953', '246962', '825922']

In [19]:
# remove the recordings whose audio file was not found
has_audio_file = ~df["id"].isin(missing_audio_files)
df = df.loc[has_audio_file]

In [22]:
# summarize how much of the original data set the cleaning steps removed
n_remaining = len(df)
n_removed = df_count_begin - n_remaining
print(f"Number of recordings remaining: {n_remaining}")
print(f"Number of recordings removed: {n_removed}")
print(f"Percentage of recordings removed: {n_removed / df_count_begin * 100:.2f}%")

Number of recordings remaining: 82686
Number of recordings removed: 6860
Percentage of recordings removed: 7.66%


## Drop unused features

In [27]:
# only keep the columns we need: id, "en" label and the two coordinates
df = df.loc[:, ["id", "en", "lat", "lng"]]

In [28]:
# Make sure the output directory exists so the write does not fail on a fresh checkout.
os.makedirs('../data/cleaned', exist_ok=True)
df.to_parquet('../data/cleaned/cleaned_data.parquet', engine='pyarrow', index=False)