# Small analysis of tags included in the dataset

### Morning tag analysis
Visualize the distribution of anthropogenic and natural tags in the dataset.

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np


In [2]:
# Display labels for the anthropogenic tag columns: maps each raw `antro_*`
# column name to the English label used in figures and summaries.
translations_dict = {
    "antro_aviao": "airplane",
    "antro_carro": "car",
    "antro_veiculo": "vehicle",
    "antro_motor": "motor",
    "antro_ni": "unknown anthropogenic",
    "antro_serra": "chainsaw",
    "antro_caminhao": "truck",
    "antro_voz": "voice",
    "antro_martelada": "hammering",
    "antro_moto": "motorcycle",
    "antro_humano": "human",
    "antro_assobio_humano": "whistling",
    "antro_trator": "tractor",
    "antro_turbina": "turbine",
    "antro_musica": "music",
    "antro_passos": "footsteps",
    "antro_buzina": "horn",
    "antro_facao": "machete",
    "antro_sirene": "siren",
}

In [3]:
# Load the semicolon-separated wide tag matrix (one row per recording) and
# display it as the cell output.
csv_path = "./00_matriz_wide_NOISES_biodiversity_morning.csv"
data = pd.read_csv(csv_path, sep=";")
data

Unnamed: 0.1,Unnamed: 0,filename,antro_assobio_humano,antro_aviao,antro_buzina,antro_caminhao,antro_carro,antro_facao,antro_humano,antro_martelada,...,biof_primata_callicebus_nigrifrons,geof_chuva_forte,geof_chuva_fraca,geof_correnteza,geof_ni,geof_trovao,geof_vento,geof_vento_forte,ruido_ni,ruido_defundo
0,1,col01_LEEC02__0__20161204_064400_ma,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,col01_LEEC02__0__20161211_071600_ma,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,col01_LEEC02__0__20161231_081400_br,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,col01_LEEC02__0__20170127_080100_br,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,col01_LEEC03__0__20161022_050100_ma,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2578,2579,col20_LEEC26__0__20161209_054600_aa,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2579,2580,col20_LEEC26__0__20161215_055900_aa,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2580,2581,col22_LEEC26__0__20161203_064400_aa,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2581,2582,col22_LEEC26__0__20161224_072900_aa,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
def _totals_descending(columns):
    """Sum each of `columns` over all rows of `data`, largest total first."""
    return data[columns].sum().sort_values(ascending=False)


# Split the columns into three tag families by name. `like=` is a substring
# match, so any column whose name contains the marker is selected.
anthro_tags = data.filter(like="antro_").columns
natural_tags = data.filter(like="biof_").columns
# Everything that is neither anthropogenic nor biophonic, minus the two
# bookkeeping columns (this is where e.g. the `geof_*` and `ruido_*` columns go).
other_tags = data.filter(regex="^(?!.*(biof_|antro_)).*$").columns.drop(
    ["Unnamed: 0", "filename"]
)

# Per-tag totals for each family
anthro_counts = _totals_descending(anthro_tags)
natural_counts = _totals_descending(natural_tags)
other_counts = _totals_descending(other_tags)

In [5]:
# Relabel the anthropogenic totals with their English display names; labels
# without an entry in `translations_dict` are left as they are.
anthro_counts = anthro_counts.rename(index=translations_dict)
# Several raw tags could share one label, so merge equal labels before
# re-sorting from largest to smallest.
anthro_counts = (
    anthro_counts.groupby(level=0).sum().sort_values(ascending=False)
)


In [17]:
# Horizontal bar charts of the per-tag totals, one stacked panel per tag family.
# `make_subplots` and `go` are provided by the import cell at the top of the
# notebook, so they are not re-imported here.

# Each panel gets a share of the figure height proportional to its number of tags.
pixels_per_tag = 20
anthro_height = len(anthro_counts) * pixels_per_tag
natural_height = len(natural_counts) * pixels_per_tag
other_height = len(other_counts) * pixels_per_tag
total_height = anthro_height + natural_height + other_height

# Row heights expressed as fractions of the total
row_heights = [
    panel_height / total_height
    for panel_height in (anthro_height, natural_height, other_height)
]

fig = make_subplots(
    rows=3,
    cols=1,
    subplot_titles=("Anthropogenic Tags", "Natural Tags", "Other Tags"),
    vertical_spacing=0.05,
    row_heights=row_heights,
)

# One bar trace per family, in the same order as the subplot titles
for row_number, counts in enumerate(
    (anthro_counts, natural_counts, other_counts), start=1
):
    fig.add_trace(
        go.Bar(
            x=counts.values,
            y=counts.index,
            orientation="h",
            showlegend=False,
        ),
        row=row_number,
        col=1,
    )

# Height budget of the bars plus fixed padding for the titles
dynamic_height = total_height + 200

fig.update_layout(height=dynamic_height, width=800, title_text="Tag Distribution", font=dict(size=14))

fig.show()
fig.write_image("tag_distribution.png")

### Distribution of tags

In [None]:
# Pie chart of how the tag occurrences split across the three tag families
anthro_total = anthro_counts.sum()
natural_total = natural_counts.sum()
other_total = other_counts.sum()
total_tags = anthro_total + natural_total + other_total

noise_data = {
    "Type": ["Anthropogenic", "Natural", "Other"],
    "Count": [anthro_total, natural_total, other_total],
}

# Fixed colour per family
slice_colors = {
    "Anthropogenic": "#EF553B",
    "Natural": "#00CC96",
    "Other": "#636EFA",
}

fig = px.pie(
    noise_data,
    names="Type",
    values="Count",
    title="Distribution of Tag Occurrences",
    color="Type",
    color_discrete_map=slice_colors,
    hole=0.0,
    width=700,
)

# Only the anthropogenic slice is pulled away from the centre
pull_values = [
    0.1 if slice_name == "Anthropogenic" else 0
    for slice_name in noise_data["Type"]
]

fig.update_traces(
    textinfo="percent+label+value",
    pull=pull_values,
    marker_line=dict(color="white", width=2),
    textfont_size=15,
)
fig.update_layout(title_x=0.5, font=dict(size=16))

fig.show()

# Text summary of the same numbers
print("\nTag Occurrences:")
for family, family_total in zip(noise_data["Type"], noise_data["Count"]):
    print(f"  {family}: {int(family_total)} ({family_total/total_tags*100:.1f}%)")
print(f"  Total tag occurrences: {int(total_tags)}")

fig.write_image("tag_occurrences_distribution.png")

### Recordings: anthropogenic vs other recordings
Compare recordings containing anthropogenic tags against all other recordings in the dataset.

In [None]:
# Recording-level view: how many recordings carry at least one anthropogenic
# tag versus none (counts recordings, not individual tag occurrences).

total_files = len(data)

# True for every row with a non-zero value in any anthropogenic column
has_anthro = data[anthro_tags].any(axis=1)
num_with_anthro = int(has_anthro.sum())
num_without_anthro = total_files - num_with_anthro

anthro_pie = go.Pie(
    labels=["Recordings with Anthropogenic", "Recordings without Anthropogenic"],
    values=[num_with_anthro, num_without_anthro],
    textinfo="label+percent+value",
    hole=0.3,
    pull=[0.1, 0],  # offset the "with anthropogenic" slice
    textfont_size=15,
    marker=dict(
        colors=["#EF553B", "#636EFA"],
        line=dict(color="white", width=2),
    ),
)
fig = go.Figure(anthro_pie)

fig.update_layout(
    title="Proportion of Recordings Containing Anthropogenic Tags",
    width=800,
    height=550,
    showlegend=False,
    font=dict(size=14),
)

fig.show()

# Text summary of the same numbers
print("\nRecordings Breakdown:")
print(f"  With anthropogenic tags: {num_with_anthro} ({num_with_anthro/total_files*100:.1f}%)")
print(f"  Without anthropogenic tags: {num_without_anthro} ({num_without_anthro/total_files*100:.1f}%)")
print(f"  Total recordings: {total_files}")

fig.write_image("recordings_anthropogenic_distribution.png")

In [None]:
# Compare, per tag family, the number of tag occurrences with the number of
# recordings that carry at least one tag of that family.
# Uses `has_anthro`, which is defined in an earlier cell.


def _tags_per_recording(occurrences, recordings):
    """Return occurrences / recordings, or NaN when `recordings` is zero."""
    return occurrences / recordings if recordings else float("nan")


print("=" * 70)
print("COMPARISON: Tag Occurrences vs Recordings")
print("=" * 70)

# Tag occurrences (sum of the per-tag totals)
anthro_tag_occurrences = int(anthro_counts.sum())
natural_tag_occurrences = int(natural_counts.sum())
other_tag_occurrences = int(other_counts.sum())

has_natural = data[natural_tags].any(axis=1)
has_other = data[other_tags].any(axis=1)
# Recordings carrying at least one tag of any family
num_tagged = int((has_anthro | has_natural | has_other).sum())

# Recordings with at least one tag of each family
anthro_recordings = int(has_anthro.sum())
natural_recordings = int(has_natural.sum())
other_recordings = int(has_other.sum())

# (header, word used in the "Recordings with ... tags" line, occurrences, recordings)
family_summaries = [
    ("\n📊 ANTHROPOGENIC:", "anthro", anthro_tag_occurrences, anthro_recordings),
    ("\n🌿 NATURAL (BIOF):", "natural", natural_tag_occurrences, natural_recordings),
    ("\n🔊 OTHER:", "other", other_tag_occurrences, other_recordings),
]
for header, family_word, occurrences, recordings in family_summaries:
    print(header)
    print(f"  Total tag occurrences: {occurrences}")
    print(f"  Recordings with {family_word} tags: {recordings}")
    # The mean is taken over recordings that contain the family, not over all recordings
    print(f"  Average tags per recording: {_tags_per_recording(occurrences, recordings):.2f}")

print("\n" + "=" * 70)
print(f"TOTAL tag occurrences: {anthro_tag_occurrences + natural_tag_occurrences + other_tag_occurrences}")
print(f"TOTAL recordings with tags: {num_tagged} (each counted once)")
print("=" * 70)

In [None]:
# Partition the recordings by which tag families they contain
has_anthro = data[anthro_tags].any(axis=1)
has_natural = data[natural_tags].any(axis=1)
has_other = data[other_tags].any(axis=1)

# Negated masks, computed once and reused below
no_anthro = ~has_anthro
no_natural = ~has_natural
no_other = ~has_other

# Exactly one family present
num_anthro_only = int((has_anthro & no_natural & no_other).sum())
num_natural_only = int((no_anthro & has_natural & no_other).sum())
num_other_only = int((no_anthro & no_natural & has_other).sum())

# Exactly two families present, or all three
num_anthro_natural = int((has_anthro & has_natural & no_other).sum())
num_anthro_other = int((has_anthro & no_natural & has_other).sum())
num_natural_other = int((no_anthro & has_natural & has_other).sum())
num_all_three = int((has_anthro & has_natural & has_other).sum())

# Recordings with at least one tag of any family, and the remainder
num_tagged = int((has_anthro | has_natural | has_other).sum())
num_untagged = total_files - num_tagged

print(f"Total files: {total_files}")
print(f"Tagged files: {num_tagged}")
print(f"Untagged files: {num_untagged}")
print("\nBreakdown:")
breakdown_rows = [
    ("Anthropogenic only", num_anthro_only),
    ("Natural only", num_natural_only),
    ("Other only", num_other_only),
    ("Anthro + Natural", num_anthro_natural),
    ("Anthro + Other", num_anthro_other),
    ("Natural + Other", num_natural_other),
    ("All three", num_all_three),
]
for combination, recording_count in breakdown_rows:
    print(f"  {combination}: {recording_count}")

### Visualizations: Tag Occurrences vs Recordings
Interactive visualizations comparing tag occurrences with the number of recordings containing those tags.

In [None]:
# Grouped bars: for each tag family, tag occurrences next to the number of
# recordings that contain the family.
comparison_data = pd.DataFrame({
    'Category': ['Anthropogenic', 'Natural', 'Other'],
    'Tag Occurrences': [anthro_tag_occurrences, natural_tag_occurrences, other_tag_occurrences],
    'Recordings with Tags': [anthro_recordings, natural_recordings, other_recordings]
})

fig = go.Figure()

# One bar series per measure; the series name doubles as the column name
bar_series = (
    ('Tag Occurrences', '#636EFA'),
    ('Recordings with Tags', '#00CC96'),
)
for series_name, bar_color in bar_series:
    fig.add_trace(go.Bar(
        name=series_name,
        x=comparison_data['Category'],
        y=comparison_data[series_name],
        text=comparison_data[series_name],
        textposition='auto',
        marker_color=bar_color,
    ))

# Legend laid out horizontally just above the plot area, right-aligned
legend_layout = dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1,
)
fig.update_layout(
    title='Tag Occurrences vs Recordings with Tags',
    xaxis_title='Category',
    yaxis_title='Count',
    barmode='group',
    width=800,
    height=500,
    showlegend=True,
    legend=legend_layout,
    font=dict(size=14),
)

fig.show()

# Ratio of the two measures for each family
print("\nAverage Tags per Recording:")
for _, row in comparison_data.iterrows():
    avg = row['Tag Occurrences'] / row['Recordings with Tags']
    print(f"  {row['Category']}: {avg:.2f}")

fig.write_image("tag_occurrences_vs_recordings.png")

In [None]:
# Sunburst of recordings by tag-family combination.
# The hierarchy is declared as one table and then split into the parallel
# lists that `go.Sunburst` expects.
num_mixed = num_anthro_natural + num_anthro_other + num_natural_other + num_all_three

sunburst_nodes = [
    # (label, parent, value, colour)
    # Root
    ("All Recordings", "", total_files, "#CCCCCC"),
    # First level: tagged vs untagged
    ("Tagged", "All Recordings", num_tagged, "#636EFA"),
    ("Untagged", "All Recordings", num_untagged, "#E8E8E8"),
    # Second level: single-family recordings and the mixed bucket
    ("Anthro Only", "Tagged", num_anthro_only, "#EF553B"),
    ("Natural Only", "Tagged", num_natural_only, "#00CC96"),
    ("Other Only", "Tagged", num_other_only, "#AB63FA"),
    ("Mixed", "Tagged", num_mixed, "#FFA15A"),
    # Third level: the individual mixed combinations
    ("Anthro + Natural", "Mixed", num_anthro_natural, "#FFAA00"),
    ("Anthro + Other", "Mixed", num_anthro_other, "#FF6692"),
    ("Natural + Other", "Mixed", num_natural_other, "#19D3F3"),
    ("All Three", "Mixed", num_all_three, "#B6E880"),
]
categories, parents, values, colors_map = (
    list(column) for column in zip(*sunburst_nodes)
)

sunburst_trace = go.Sunburst(
    labels=categories,
    parents=parents,
    values=values,
    branchvalues="total",
    marker=dict(colors=colors_map, line=dict(color='white', width=2)),
    textinfo='label+value+percent parent',
    hovertemplate='<b>%{label}</b><br>Count: %{value}<br>Percent: %{percentParent}<extra></extra>',
)
fig = go.Figure(sunburst_trace)

fig.update_layout(
    title="Recording Distribution by Tag Type (Sunburst)",
    width=900,
    height=900,
    font=dict(size=14),
)

fig.show()
fig.write_image("recording_tag_sunburst.png")

In [None]:
# Sankey diagram: tagged recordings split into single-family vs multi-family,
# then into each concrete combination.
# `go` is provided by the import cell at the top of the notebook.

# Node labels; a link refers to a node by its position in this list
node_labels = [
    "All Tagged<br>Recordings",     # 0
    "Single Tag<br>Type",            # 1
    "Multiple Tag<br>Types",         # 2
    "Anthro<br>Only",                # 3
    "Natural<br>Only",               # 4
    "Other<br>Only",                 # 5
    "Anthro +<br>Natural",           # 6
    "Anthro +<br>Other",             # 7
    "Natural +<br>Other",            # 8
    "All Three<br>Types",            # 9
]

single_total = num_anthro_only + num_natural_only + num_other_only
mixed_total = num_anthro_natural + num_anthro_other + num_natural_other + num_all_three

# The three lists are parallel: link i goes from source[i] to target[i] and
# carries value[i] recordings.
links = {
    'source': [0, 0, 1, 1, 1, 2, 2, 2, 2],
    'target': [1, 2, 3, 4, 5, 6, 7, 8, 9],
    'value': [
        single_total,
        mixed_total,
        num_anthro_only,
        num_natural_only,
        num_other_only,
        num_anthro_natural,
        num_anthro_other,
        num_natural_other,
        num_all_three
    ]
}

# One colour per node, in the same order as `node_labels`
node_colors = [
    '#636EFA',  # All Tagged
    '#AB63FA',  # Single Type
    '#FFA15A',  # Multiple Types
    '#EF553B',  # Anthro Only
    '#00CC96',  # Natural Only
    '#B6E880',  # Other Only  (NOTE(review): same colour as "All Three" below)
    '#FFAA00',  # Anthro + Natural
    '#FF6692',  # Anthro + Other
    '#19D3F3',  # Natural + Other
    '#B6E880',  # All Three
]

fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=35,  # generous vertical gap between nodes
        thickness=20,
        line=dict(color="white", width=2),
        label=node_labels,
        color=node_colors
    ),
    link=dict(
        source=links['source'],
        target=links['target'],
        value=links['value'],
        color='rgba(200, 200, 200, 0.4)'
    )
)])

fig.update_layout(
    title="Flow of Recordings by Tag Type Combinations",
    font_size=14,
    width=1000,
    height=800
)

fig.show()
fig.write_image("recording_tag_sankey.png")

## Event noises vs background noises

In [18]:
# Pick one example recording for each anthropogenic tag: the last row in
# `data` whose value for that tag equals 1.
anthro_example_files = {}
for tag in anthro_tags:
    matching_files = data.loc[data[tag] == 1, "filename"]
    if matching_files.empty:
        # No recording carries this tag; skip it rather than let .iloc[-1]
        # raise IndexError.
        continue
    anthro_example_files[tag] = matching_files.iloc[-1]

# Re-key the examples by English label (raw tag name kept when there is no
# translation) and display the result as the cell output.
anthro_example_files_translated = {
    translations_dict.get(tag, tag): filename
    for tag, filename in anthro_example_files.items()
}
anthro_example_files_translated

{'whistling': 'col06_LEEC09__0__20161202_072900_aa',
 'airplane': 'col12_LEEC49__0__20161102_072900_ma',
 'horn': 'col03_LEEC36__0__20170110_064400_br',
 'truck': 'col12_LEEC42__0__20161201_080100_aa',
 'car': 'col12_LEEC42__0__20161217_051400_aa',
 'machete': 'col06_LEEC45__0__20170123_064400_br',
 'human': 'col01_LEEC41__0__20161206_071600_aa',
 'hammering': 'col12_LEEC05__0__20161206_081400_aa',
 'motorcycle': 'col12_LEEC42__0__20170110_071600_br',
 'motor': 'col12_LEEC42__0__20161122_051400_aa',
 'music': 'col06_LEEC09__0__20170127_080100_br',
 'unknown anthropogenic': 'col09_LEEC06__0__20161218_081400_aa',
 'footsteps': 'col06_LEEC45__0__20170123_064400_br',
 'chainsaw': 'col12_LEEC49__0__20161207_055900_aa',
 'siren': 'col04_LEEC08__0__20161123_071600_aa',
 'tractor': 'col09_LEEC21__0__20161125_080100_aa',
 'turbine': 'col06_LEEC05__0__20161031_063100_ma',
 'vehicle': 'col12_LEEC36__0__20161127_064400_aa',
 'voice': 'col12_LEEC40__0__20161220_080100_aa'}

In [20]:
# Draw a small, reproducible sample of recordings tagged as airplane
import random

plane_files = data.loc[data["antro_aviao"] == 1, "filename"].tolist()
# A dedicated seeded generator returns the same sample on every run; the size
# is capped so fewer than 5 matching recordings does not raise ValueError.
random.Random(42).sample(plane_files, min(5, len(plane_files)))

['col03_LEEC40__0__20170115_050100_br',
 'col12_LEEC49__0__20161102_072900_ma',
 'col04_LEEC03__0__20161111_054600_ma',
 'col09_LEEC45__0__20161230_054600_br',
 'col06_LEEC21__0__20161123_063100_aa']