In [37]:
# Approximate Western Ghats bounding box, in decimal degrees.
# Latitude limits (south, north):
WG_LAT_MIN = 8.0
WG_LAT_MAX = 21.0
# Longitude limits (west, east):
WG_LON_MIN = 73.0
WG_LON_MAX = 77.5

# Echo the limits so the notebook output records which box was used.
print("üìç Western Ghats Bounding Box:")
print(f"Latitude: {WG_LAT_MIN} to {WG_LAT_MAX}")
print(f"Longitude: {WG_LON_MIN} to {WG_LON_MAX}")


üìç Western Ghats Bounding Box:
Latitude: 8.0 to 21.0
Longitude: 73.0 to 77.5


In [38]:
import pandas as pd

# Tab-separated occurrence export for the IFP herbarium dataset.
ifp_occurrence_path = "/kaggle/input/gbif-herbarium-french-institute-of-pondicherry/0004138-260126135527185.csv"

# low_memory=False makes pandas read the file in one pass instead of
# chunk-wise, which avoids mixed-dtype guesses on sparse columns.
gbif = pd.read_csv(ifp_occurrence_path, sep="\t", low_memory=False)

# Quick sanity check: table dimensions and available columns.
print("üåø IFP Herbarium shape:", gbif.shape)
print(gbif.columns)


üåø IFP Herbarium shape: (25023, 50)
Index(['gbifID', 'datasetKey', 'occurrenceID', 'kingdom', 'phylum', 'class',
       'order', 'family', 'genus', 'species', 'infraspecificEpithet',
       'taxonRank', 'scientificName', 'verbatimScientificName',
       'verbatimScientificNameAuthorship', 'countryCode', 'locality',
       'stateProvince', 'occurrenceStatus', 'individualCount',
       'publishingOrgKey', 'decimalLatitude', 'decimalLongitude',
       'coordinateUncertaintyInMeters', 'coordinatePrecision', 'elevation',
       'elevationAccuracy', 'depth', 'depthAccuracy', 'eventDate', 'day',
       'month', 'year', 'taxonKey', 'speciesKey', 'basisOfRecord',
       'institutionCode', 'collectionCode', 'catalogNumber', 'recordNumber',
       'identifiedBy', 'dateIdentified', 'license', 'rightsHolder',
       'recordedBy', 'typeStatus', 'establishmentMeans', 'lastInterpreted',
       'mediaType', 'issue'],
      dtype='object')


In [39]:
# Print every column name as a plain Python list (no Index wrapper or truncation).
print(list(gbif.columns))


['gbifID', 'datasetKey', 'occurrenceID', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'infraspecificEpithet', 'taxonRank', 'scientificName', 'verbatimScientificName', 'verbatimScientificNameAuthorship', 'countryCode', 'locality', 'stateProvince', 'occurrenceStatus', 'individualCount', 'publishingOrgKey', 'decimalLatitude', 'decimalLongitude', 'coordinateUncertaintyInMeters', 'coordinatePrecision', 'elevation', 'elevationAccuracy', 'depth', 'depthAccuracy', 'eventDate', 'day', 'month', 'year', 'taxonKey', 'speciesKey', 'basisOfRecord', 'institutionCode', 'collectionCode', 'catalogNumber', 'recordNumber', 'identifiedBy', 'dateIdentified', 'license', 'rightsHolder', 'recordedBy', 'typeStatus', 'establishmentMeans', 'lastInterpreted', 'mediaType', 'issue']


In [40]:
# Coerce the coordinate and year columns to numbers; anything that cannot
# be parsed becomes NaN and is removed by the dropna below.
for numeric_col in ("decimalLatitude", "decimalLongitude", "year"):
    gbif[numeric_col] = pd.to_numeric(gbif[numeric_col], errors="coerce")

# Keep only records that have a year, a species name and both coordinates.
required_cols = ["year", "species", "decimalLatitude", "decimalLongitude"]
gbif = gbif.dropna(subset=required_cols)

# Western Ghats bounding box
WG_LAT_MIN, WG_LAT_MAX = 8.0, 21.0
WG_LON_MIN, WG_LON_MAX = 73.0, 77.5

# Series.between is inclusive on both ends by default, so records lying
# exactly on a boundary (or dated 1990 / 2024) are retained.
in_lat_range = gbif["decimalLatitude"].between(WG_LAT_MIN, WG_LAT_MAX)
in_lon_range = gbif["decimalLongitude"].between(WG_LON_MIN, WG_LON_MAX)
in_year_range = gbif["year"].between(1990, 2024)

gbif_wg = gbif[in_lat_range & in_lon_range & in_year_range]

print("üå± Western Ghats records:", gbif_wg.shape)


üå± Western Ghats records: (5337, 50)


In [41]:
# Group the Western Ghats records by year once and derive both yearly
# series from that same grouping, so their year order is identical.
records_by_year = gbif_wg.groupby("year")
species_richness = records_by_year["species"].nunique()  # distinct species per year
occurrences = records_by_year.size()                     # record count per year

# Assemble the per-year table; the .values arrays line up row for row
# because both series come from the same sorted grouping.
biodiversity = pd.DataFrame({
    "year": species_richness.index,
    "species_richness": species_richness.values,
    "occurrences": occurrences.values,
})

# Distinct species per 1000 records: richness divided by the yearly record count.
biodiversity["species_per_1000_occ"] = (
    biodiversity["species_richness"]
    .div(biodiversity["occurrences"])
    .mul(1000)
)

biodiversity.head()


Unnamed: 0,year,species_richness,occurrences,species_per_1000_occ
0,1990.0,175,573,305.410122
1,1991.0,21,39,538.461538
2,1992.0,361,1123,321.460374
3,1993.0,363,1498,242.323097
4,1994.0,137,423,323.877069


In [42]:
# Remove years with too few observations (noise control).
# Years with fewer than this many records are excluded from the yearly table.
MIN_OCCURRENCES_PER_YEAR = 20

# .copy() turns the filtered result into an independent DataFrame. Without
# it the result is a slice of the original table, and adding a column to it
# afterwards can raise pandas' SettingWithCopyWarning.
biodiversity = biodiversity[
    biodiversity["occurrences"] >= MIN_OCCURRENCES_PER_YEAR
].copy()

print("üìâ After filtering low-sample years:", biodiversity.shape)
biodiversity.head()


üìâ After filtering low-sample years: (16, 4)


Unnamed: 0,year,species_richness,occurrences,species_per_1000_occ
0,1990.0,175,573,305.410122
1,1991.0,21,39,538.461538
2,1992.0,361,1123,321.460374
3,1993.0,363,1498,242.323097
4,1994.0,137,423,323.877069


In [43]:
# Smooth the yearly rate with a trailing 3-row moving average.
# min_periods=1 lets the first rows average over however many rows exist
# instead of producing NaN.
# NOTE(review): the window counts table rows, not calendar years, so a
# window may span a gap wherever a year is absent from the table.
raw_rate = biodiversity["species_per_1000_occ"]
biodiversity["species_per_1000_occ_smooth"] = raw_rate.rolling(3, min_periods=1).mean()


In [44]:
# Write the yearly biodiversity table to the working directory,
# leaving out the DataFrame index column.
biodiversity_csv = "gbif_biodiversity_yearly_WESTERN_GHATS.csv"
biodiversity.to_csv(biodiversity_csv, index=False)

print("‚úÖ Saved Western Ghats biodiversity dataset")


‚úÖ Saved Western Ghats biodiversity dataset


In [45]:
import os
# Show the entries found under the Kaggle input directory,
# i.e. the datasets attached to this notebook.
attached_datasets = os.listdir('/kaggle/input/')
print(attached_datasets)

['gbif-herbarium-french-institute-of-pondicherry', 'gbif-species-occurrence-records', 'birdclef24-pretraining-train-model', 'gbif-western-ghat', 'birdclef-2024', 'birdcall-recognition-data']


In [46]:
# Path of the BirdCLEF 2024 training metadata table.
AUDIO_META = "/kaggle/input/birdclef-2024/train_metadata.csv"

# Load the metadata and report its dimensions.
df_audio = pd.read_csv(filepath_or_buffer=AUDIO_META)
print("BirdCLEF metadata:", df_audio.shape)

# Preview the first five rows.
df_audio.head(n=5)


BirdCLEF metadata: (24459, 12)


Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename
0,asbfly,[],['call'],39.2297,118.1987,Muscicapa dauurica,Asian Brown Flycatcher,Matt Slaymaker,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/134896,asbfly/XC134896.ogg
1,asbfly,[],['song'],51.403,104.6401,Muscicapa dauurica,Asian Brown Flycatcher,Magnus Hellström,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/164848,asbfly/XC164848.ogg
2,asbfly,[],['song'],36.3319,127.3555,Muscicapa dauurica,Asian Brown Flycatcher,Stuart Fisher,Creative Commons Attribution-NonCommercial-Sha...,2.5,https://www.xeno-canto.org/175797,asbfly/XC175797.ogg
3,asbfly,[],['call'],21.1697,70.6005,Muscicapa dauurica,Asian Brown Flycatcher,vir joshi,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/207738,asbfly/XC207738.ogg
4,asbfly,[],['call'],15.5442,73.7733,Muscicapa dauurica,Asian Brown Flycatcher,Albert Lastukhin & Sergei Karpeev,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/209218,asbfly/XC209218.ogg


In [47]:
from pathlib import Path


In [48]:
# Western Ghats bounding box in decimal degrees, declared again here so
# the audio cells that follow have the limits in scope.
WG_LAT_MIN = 8.0
WG_LAT_MAX = 21.0
WG_LON_MIN = 73.0
WG_LON_MAX = 77.5


In [49]:
# Coerce both coordinate columns to numbers; unparseable values become NaN.
for coord_col in ("latitude", "longitude"):
    df_audio[coord_col] = pd.to_numeric(df_audio[coord_col], errors="coerce")

# Discard recordings that lack a coordinate or a species label.
df_audio = df_audio.dropna(subset=["latitude", "longitude", "primary_label"])

# Western Ghats filter for audio: keep recordings whose coordinates fall
# inside the bounding box (between() includes both boundaries).
inside_lat = df_audio["latitude"].between(WG_LAT_MIN, WG_LAT_MAX)
inside_lon = df_audio["longitude"].between(WG_LON_MIN, WG_LON_MAX)
audio_wg = df_audio[inside_lat & inside_lon]

print("üîä Western Ghats BirdCLEF records:", audio_wg.shape)
audio_wg.head()


üîä Western Ghats BirdCLEF records: (2175, 12)


Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename
4,asbfly,[],['call'],15.5442,73.7733,Muscicapa dauurica,Asian Brown Flycatcher,Albert Lastukhin & Sergei Karpeev,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/209218,asbfly/XC209218.ogg
5,asbfly,[],['call'],15.5442,73.7733,Muscicapa dauurica,Asian Brown Flycatcher,Albert Lastukhin & Sergei Karpeev,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/209219,asbfly/XC209219.ogg
11,asbfly,[],['call'],10.1653,76.5476,Muscicapa dauurica,Asian Brown Flycatcher,Dilip KG,Creative Commons Attribution-NonCommercial-Sha...,3.0,https://www.xeno-canto.org/291450,asbfly/XC291450.ogg
116,ashdro1,[],['call'],18.8897,73.1232,Dicrurus leucophaeus,Ashy Drongo,Saurabh Sawant,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/120586,ashdro1/XC120586.ogg
123,ashdro1,[],['call'],11.3362,76.1102,Dicrurus leucophaeus,Ashy Drongo,Vivek Puliyeri,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/214278,ashdro1/XC214278.ogg


In [50]:
# Count the Western Ghats recordings for each primary species label, then
# turn the counts into a two-column table (primary_label, num_recordings).
recordings_per_label = audio_wg.groupby("primary_label").size()
audio_species_richness = recordings_per_label.reset_index(name="num_recordings")

# One row per label, so the row count is the number of distinct species labels.
print("ü¶ú Western Ghats unique bird species:", audio_species_richness.shape[0])
audio_species_richness.head()


ü¶ú Western Ghats unique bird species: 163


Unnamed: 0,primary_label,num_recordings
0,asbfly,3
1,ashdro1,7
2,ashpri1,32
3,ashwoo2,4
4,asikoe2,44


In [51]:
# Scale each label's recording count by the largest count, so the
# most-recorded label gets 1.0 and every other label a fraction of it.
recording_counts = audio_species_richness["num_recordings"]
audio_species_richness["normalized_audio_strength"] = (
    recording_counts / recording_counts.max()
)


In [52]:
# Write the per-label recording table to the working directory,
# leaving out the DataFrame index column.
audio_richness_csv = "audio_species_richness_WESTERN_GHATS.csv"
audio_species_richness.to_csv(audio_richness_csv, index=False)

print("‚úÖ Saved Western Ghats audio species richness dataset")


‚úÖ Saved Western Ghats audio species richness dataset


In [53]:
# Collapse the per-label normalized strengths into one static number:
# their arithmetic mean across all labels.
audio_signal_strength = audio_species_richness["normalized_audio_strength"].mean()

# Wrap the scalar in a one-row table so it can be written out as CSV.
audio_summary = pd.DataFrame({"audio_signal_strength": [audio_signal_strength]})
audio_summary.to_csv("audio_signal_summary_WESTERN_GHATS.csv", index=False)

print("‚úÖ Saved Western Ghats audio summary")


‚úÖ Saved Western Ghats audio summary
