In [None]:
import gdown
import pandas as pd

In [None]:
import pandas as pd

# Fetch the cleaned species table from Google Drive into the working directory,
# then load it. (This download replaces the earlier local-file load.)
file_id = "1uJYNuUFlKgmUuMnSpDtAKhNYaWSZ6cje"  # Google Drive file ID of the dataset
url = f"https://drive.google.com/uc?id={file_id}"
output = "new_final_species_cleaned.csv"
gdown.download(url, output, quiet=False)

df = pd.read_csv(output)

# Preview the first rows; as the cell's last expression this renders as a rich table.
df.head()


Downloading...
From: https://drive.google.com/uc?id=1uJYNuUFlKgmUuMnSpDtAKhNYaWSZ6cje
To: /content/new_final_species_cleaned.csv
100%|██████████| 5.32M/5.32M [00:00<00:00, 185MB/s]

   aphia_id  ncbi_taxon_id          species_canonical all_synonyms  \
0  101170.0         7769.0           Myxine glutinosa      Unknown   
1  101171.0       975066.8                 Myxine ios   Myxine ios   
2  101172.0         7748.0       Lampetra fluviatilis      Unknown   
3  101173.0       980415.0  Lethenteron camtschaticum      Unknown   
4  101174.0         7757.0         Petromyzon marinus      Unknown   

          family_x         authority             modified    status     rank  \
0        Myxinidae    Linnaeus, 1758  2008-01-15 17:27:08  accepted  Species   
1        Myxinidae    Fernholm, 1981  2008-01-15 17:27:08  accepted  Species   
2  Petromyzontidae  (Linnaeus, 1758)  2017-08-09 06:21:34  accepted  Species   
3  Petromyzontidae  (Tilesius, 1811)  2010-05-06 09:57:44  accepted  Species   
4  Petromyzontidae    Linnaeus, 1758  2008-01-15 17:27:08  accepted  Species   

                order  ...                fbname    demerspelag  \
0        Myxiniformes  ...     




In [None]:
import plotly.express as px

# Headline figure: number of distinct species names in the table.
total_species = df['species_canonical'].nunique()
print("Total unique species:", total_species)

# Distinct species per taxonomic order, flattened back to columns for plotting.
order_counts = (
    df.groupby('order')['species_canonical']
      .nunique()
      .reset_index()
)

# One bar per order; bar height is the distinct-species count computed above.
fig_order = px.bar(
    order_counts,
    x='order',
    y='species_canonical',
    labels=dict(species_canonical='Number of Species', order='Order'),
    title='Species Count per Order',
)
fig_order.show()


Total unique species: 19685


In [None]:
# Reshape the four habitat flag columns into long (species, Habitat, Presence)
# rows, then keep only the rows whose flag equals 1.
habitat_df = (
    df.melt(
        id_vars=['species_canonical'],
        value_vars=['ismarine', 'isbrackish', 'isfreshwater', 'isterrestrial'],
        var_name='Habitat',
        value_name='Presence',
    )
    .loc[lambda long_df: long_df['Presence'] == 1]
)

# Each remaining row is one (species, habitat) pair; the pie shows their share per habitat.
fig_habitat = px.pie(
    habitat_df,
    names='Habitat',
    title='Habitat Distribution of Species',
)
fig_habitat.show()


In [None]:
%pip install dash

Collecting dash
  Downloading dash-3.2.0-py3-none-any.whl.metadata (10 kB)
Collecting retrying (from dash)
  Downloading retrying-1.4.2-py3-none-any.whl.metadata (5.5 kB)
Downloading dash-3.2.0-py3-none-any.whl (7.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading retrying-1.4.2-py3-none-any.whl (10 kB)
Installing collected packages: retrying, dash
Successfully installed dash-3.2.0 retrying-1.4.2


In [None]:
import plotly.figure_factory as ff
import numpy as np

# Pairwise correlation between the depth, length and weight trait columns.
corr_df = df.loc[
    :,
    [
        'depth_min_in_m',
        'depth_max_in_m',
        'length_max_in_cm',
        'common_length_in_cm',
        'weight_max_in_g',
    ],
].corr()

# Heatmap of the correlation matrix with the coefficients printed in each cell.
fig = px.imshow(
    corr_df,
    text_auto=True,
    color_continuous_scale='RdBu_r',
    title='Correlation of Traits',
)
fig.show()

# Example question this view helps with: are deeper-living species heavier or longer?


In [None]:
from dash import Dash, dcc, html
from dash.dependencies import Input, Output
import pandas as pd
import plotly.graph_objects as go

# Load the species table saved by the earlier download cell. A relative path is
# used so the app is not tied to the Colab-specific /content directory.
df = pd.read_csv('new_final_species_cleaned.csv')

app = Dash(__name__)

# Choices for the Order dropdown: every non-missing order, sorted alphabetically.
# Missing values are dropped so they never become a dropdown label/value.
orders = sorted(df['order'].dropna().unique())

# Page layout: three cascading dropdowns (Order -> Family -> Genus) above the graph.
# Family and Genus start without options; callbacks fill them in.
app.layout = html.Div([
    html.H2("Taxonomy Explorer"),

    html.Label("Select Order:"),
    dcc.Dropdown(id='order_dropdown', options=[{'label': o, 'value': o} for o in orders]),

    html.Label("Select Family:"),
    dcc.Dropdown(id='family_dropdown'),

    html.Label("Select Genus:"),
    dcc.Dropdown(id='genus_dropdown'),

    html.H3("Species Flowchart:"),
    dcc.Graph(id='species_flowchart')
])

# Callbacks for cascading dropdowns
@app.callback(
    Output('family_dropdown', 'options'),
    Input('order_dropdown', 'value')
)
def set_family_options(selected_order):
    """Return the Family dropdown options for the chosen order.

    Args:
        selected_order: Current value of the Order dropdown, or None when
            nothing is selected.

    Returns:
        A list of ``{'label': ..., 'value': ...}`` dicts, one per family found
        in rows of that order (sorted, missing values skipped). Empty when no
        order is selected.
    """
    if selected_order is None:
        return []
    in_order = df['order'] == selected_order
    # dropna() keeps NaN out of the options; sorting gives a stable, scannable list.
    families = sorted(df.loc[in_order, 'family_x'].dropna().unique())
    return [{'label': f, 'value': f} for f in families]

@app.callback(
    Output('genus_dropdown', 'options'),
    Input('family_dropdown', 'value')
)
def set_genus_options(selected_family):
    """Return the Genus dropdown options for the chosen family.

    Args:
        selected_family: Current value of the Family dropdown, or None when
            nothing is selected.

    Returns:
        A list of ``{'label': ..., 'value': ...}`` dicts, one per genus found
        in rows of that family (sorted, missing values skipped). Empty when no
        family is selected.
    """
    if selected_family is None:
        return []
    in_family = df['family_x'] == selected_family
    # dropna() keeps NaN out of the options; sorting gives a stable, scannable list.
    genera = sorted(df.loc[in_family, 'genus'].dropna().unique())
    return [{'label': g, 'value': g} for g in genera]

# Callback to generate flowchart
@app.callback(
    Output('species_flowchart', 'figure'),
    Input('order_dropdown', 'value'),
    Input('family_dropdown', 'value'),
    Input('genus_dropdown', 'value')
)
def update_flowchart(order, family, genus):
    """Build the Order -> Family -> Genus -> species Sankey for the selection.

    Args:
        order: Selected order, or None/empty when unset.
        family: Selected family, or None/empty when unset.
        genus: Selected genus, or None/empty when unset.

    Returns:
        A ``go.Figure`` holding the Sankey diagram. An empty figure is returned
        while any of the three selections is missing, or when no row matches
        the combination.
    """
    if not order or not family or not genus:
        return go.Figure()

    # Filter data
    filtered = df[(df['order'] == order) &
                  (df['family_x'] == family) &
                  (df['genus'] == genus)]

    species_list = list(filtered['species_canonical'].unique())
    n_species = len(species_list)
    if n_species == 0:
        return go.Figure()

    # Nodes are addressed by position rather than looked up by label: two
    # taxonomy levels may share a name, and a label-keyed lookup would then
    # alias them and create a self-referencing link.
    order_node, family_node, genus_node = 0, 1, 2
    nodes = [order, family, genus] + species_list
    species_nodes = list(range(3, 3 + n_species))

    # One link per level, then one link from the genus to each species. The
    # upper links carry the species count so flow in equals flow out at the genus.
    sources = [order_node, family_node] + [genus_node] * n_species
    targets = [family_node, genus_node] + species_nodes
    values = [n_species, n_species] + [1] * n_species

    # Create figure
    fig = go.Figure(go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=nodes
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values
        )
    ))

    fig.update_layout(title_text="Species Flowchart", font_size=12)
    return fig

# Start the Dash server when this code runs as the main module.
if __name__ == '__main__':
    app.run(debug=True)


<IPython.core.display.Javascript object>

In [None]:
# Bubble chart of maximum length against maximum weight: marker size follows the
# common length, colour follows the genus, and hovering shows the species name.
scatter_encoding = dict(
    x='length_max_in_cm',
    y='weight_max_in_g',
    size='common_length_in_cm',
    color='genus',
    hover_name='species_canonical',
    title='Length vs Weight by Species',
)
fig_scatter = px.scatter(df, **scatter_encoding)
fig_scatter.show()


In [None]:
import pandas as pd
import plotly.express as px

# Fetch the variant of the species table that includes a `sequencing_status`
# column. It is saved under its own file name so it does not overwrite the base
# dataset file downloaded earlier in the notebook.
file_id = "1Av-bUCv73-orYrt-veT5fVF5LvKmnN-W"  # Google Drive file ID of the dataset
url = f"https://drive.google.com/uc?id={file_id}"
output = "new_final_species_cleaned_with_seq.csv"
gdown.download(url, output, quiet=False)

df = pd.read_csv(output)

# Total unique species
total_species = df['species_canonical'].nunique()
print("Total unique species:", total_species)

# Distinct species per (order, sequencing status) pair, flattened for plotting.
order_seq_counts = (
    df.groupby(['order', 'sequencing_status'])['species_canonical']
      .nunique()
      .reset_index()
)

# Grouped bar chart: one cluster per order, one bar per sequencing status.
fig_order_seq = px.bar(
    order_seq_counts,
    x='order',
    y='species_canonical',
    color='sequencing_status',
    barmode='group',   # 'stack' for stacked bars
    labels={'species_canonical': 'Number of Species', 'order': 'Order'},
    title='Species Count per Order grouped by Sequencing Status'
)

# Adjust bar thickness
fig_order_seq.update_layout(bargap=0.1)   # smaller value = thicker bars

fig_order_seq.show()


Downloading...
From: https://drive.google.com/uc?id=1Av-bUCv73-orYrt-veT5fVF5LvKmnN-W
To: /content/new_final_species_cleaned.csv
100%|██████████| 5.36M/5.36M [00:00<00:00, 165MB/s]


Total unique species: 19685


In [None]:
import pandas as pd
import plotly.express as px

# Fetch the variant of the species table that includes a `sequencing_status`
# column. It is saved under its own file name so it does not overwrite the base
# dataset file downloaded earlier in the notebook.
file_id = "1Av-bUCv73-orYrt-veT5fVF5LvKmnN-W"  # Google Drive file ID of the dataset
url = f"https://drive.google.com/uc?id={file_id}"
output = "new_final_species_cleaned_with_seq.csv"
gdown.download(url, output, quiet=False)

df = pd.read_csv(output)

# Total unique species (also embedded in the chart subtitle below)
total_species = df['species_canonical'].nunique()
print("Total unique species:", total_species)

# Distinct species per (order, sequencing status) pair, flattened for plotting.
order_seq_counts = (
    df.groupby(['order', 'sequencing_status'])['species_canonical']
      .nunique()
      .reset_index()
)

# Grouped bar chart: one cluster per order, one bar per sequencing status.
fig_order_seq = px.bar(
    order_seq_counts,
    x='order',
    y='species_canonical',
    color='sequencing_status',
    barmode='group',   # or 'stack'
    labels={
        'species_canonical': 'Number of Species',
        'order': 'Order',
        'sequencing_status': 'Sequencing Status'
    },
    title=f'Species Count per Order (Grouped by Sequencing Status)<br><sup>Total unique species: {total_species}</sup>',
    color_discrete_sequence=px.colors.qualitative.Set2  # qualitative palette for the status categories
)

# Presentation tweaks only; none of these change the plotted data.
fig_order_seq.update_layout(
    bargap=0.05,                # thicker bars
    plot_bgcolor='white',       # clean white background
    paper_bgcolor='white',
    title_font=dict(size=20, family='Arial', color='black'),
    xaxis=dict(
        tickangle=45,           # rotate x labels for readability
        title_font=dict(size=16),
        tickfont=dict(size=12)
    ),
    yaxis=dict(
        title_font=dict(size=16),
        tickfont=dict(size=12),
        gridcolor='lightgrey'
    ),
    legend=dict(
        title='Sequencing Status',
        font=dict(size=12),
        orientation="h",        # horizontal legend
        yanchor="bottom",       # anchored just above the plot area, right-aligned
        y=1.02,
        xanchor="right",
        x=1
    )
)

fig_order_seq.show()


Downloading...
From: https://drive.google.com/uc?id=1Av-bUCv73-orYrt-veT5fVF5LvKmnN-W
To: /content/new_final_species_cleaned.csv
100%|██████████| 5.36M/5.36M [00:00<00:00, 112MB/s]


Total unique species: 19685


In [None]:
pip install dash

Collecting dash
  Downloading dash-3.2.0-py3-none-any.whl.metadata (10 kB)
Collecting retrying (from dash)
  Downloading retrying-1.4.2-py3-none-any.whl.metadata (5.5 kB)
Downloading dash-3.2.0-py3-none-any.whl (7.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m74.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading retrying-1.4.2-py3-none-any.whl (10 kB)
Installing collected packages: retrying, dash
Successfully installed dash-3.2.0 retrying-1.4.2


In [None]:
import pandas as pd
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display, clear_output

# Fetch the cleaned species table from Google Drive and load it.
# (This download replaces the earlier local-file load.)
file_id = "1uJYNuUFlKgmUuMnSpDtAKhNYaWSZ6cje"  # Google Drive file ID of the dataset
url = f"https://drive.google.com/uc?id={file_id}"
output = "new_final_species_cleaned.csv"
gdown.download(url, output, quiet=False)

df = pd.read_csv(output)

# --- Create dropdown widgets ---
# Each dropdown starts with a placeholder entry; the placeholder strings are also
# what the observer and plotting functions test for to detect "nothing selected".
order_dropdown = widgets.Dropdown(
    options=['Select Order'] + sorted(df['order'].dropna().unique()),
    description='Order:'
)

# Family and Genus hold only their placeholder until a parent level is chosen.
family_dropdown = widgets.Dropdown(
    options=['Select Family'],
    description='Family:'
)

genus_dropdown = widgets.Dropdown(
    options=['Select Genus'],
    description='Genus:'
)

# --- Update Family dropdown based on selected Order ---
def update_family_options(change):
    """Refill the Family dropdown after the Order dropdown's value changes.

    `change` is the change dict handed to the observer; only its 'new' entry
    (the newly selected order) is read. With the placeholder selected, the
    Family dropdown is reduced to its own placeholder.
    """
    selected_order = change['new']
    family_choices = ['Select Family']
    if selected_order != 'Select Order':
        belongs_to_order = df['order'] == selected_order
        family_choices += sorted(df.loc[belongs_to_order, 'family_x'].dropna().unique())
    family_dropdown.options = family_choices

order_dropdown.observe(update_family_options, names='value')

# --- Update Genus dropdown based on selected Family ---
def update_genus_options(change):
    """Refill the Genus dropdown after the Family dropdown's value changes.

    `change` is the change dict handed to the observer; only its 'new' entry
    (the newly selected family) is read. With the placeholder selected, the
    Genus dropdown is reduced to its own placeholder.
    """
    selected_family = change['new']
    genus_choices = ['Select Genus']
    if selected_family != 'Select Family':
        belongs_to_family = df['family_x'] == selected_family
        genus_choices += sorted(df.loc[belongs_to_family, 'genus'].dropna().unique())
    genus_dropdown.options = genus_choices

family_dropdown.observe(update_genus_options, names='value')

# --- Function to create Sankey diagram ---
def create_sankey(order, family, genus):
    """Build the Order -> Family -> Genus -> species Sankey for one selection.

    Args:
        order: Selected order, or the 'Select Order' placeholder / None.
        family: Selected family, or the 'Select Family' placeholder / None.
        genus: Selected genus, or the 'Select Genus' placeholder / None.

    Returns:
        A ``go.Figure``. It contains the Sankey diagram when all three levels
        are chosen and at least one row matches; otherwise it is an empty
        figure whose title explains what is missing.
    """
    if order in ['Select Order', None] or family in ['Select Family', None] or genus in ['Select Genus', None]:
        fig = go.Figure()
        fig.update_layout(title_text="Please select Order, Family, and Genus", font_size=12)
        return fig

    filtered = df[(df['order'] == order) &
                  (df['family_x'] == family) &
                  (df['genus'] == genus)]

    species_list = list(filtered['species_canonical'].unique())
    n_species = len(species_list)
    if n_species == 0:
        fig = go.Figure()
        fig.update_layout(title_text="No species found for selection", font_size=12)
        return fig

    # Nodes are addressed by position rather than looked up by label: two
    # taxonomy levels may share a name, and a label-keyed lookup would then
    # alias them and create a self-referencing link.
    order_node, family_node, genus_node = 0, 1, 2
    nodes = [order, family, genus] + species_list
    species_nodes = list(range(3, 3 + n_species))

    # One link per level, then one link from the genus to each species. The
    # upper links carry the species count so flow in equals flow out at the genus.
    sources = [order_node, family_node] + [genus_node] * n_species
    targets = [family_node, genus_node] + species_nodes
    values = [n_species, n_species] + [1] * n_species

    # Sankey figure
    fig = go.Figure(go.Sankey(
        node=dict(
            pad=20,
            thickness=25,
            line=dict(color="black", width=0.7),
            label=nodes,
            color="rgba(58, 71, 80, 0.8)"
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            color="rgba(63, 81, 181, 0.4)"
        )
    ))

    fig.update_layout(
        title_text=f"Species Flowchart for {order} → {family} → {genus}",
        font=dict(size=14, color="black"),
        plot_bgcolor="white",
        paper_bgcolor="white"
    )
    return fig

# --- Output widget to display Sankey ---
out = widgets.Output()

def update_plot(*args):
    """Redraw the Sankey inside `out` from the dropdowns' current values.

    Registered as an observer, so it accepts and ignores whatever arguments
    the widget framework passes.
    """
    selection = (order_dropdown.value, family_dropdown.value, genus_dropdown.value)
    with out:
        # wait=True defers clearing until the replacement output arrives.
        clear_output(wait=True)
        create_sankey(*selection).show()

# Any change to any of the three dropdowns triggers a redraw.
for dropdown in (order_dropdown, family_dropdown, genus_dropdown):
    dropdown.observe(update_plot, names='value')

# --- Display the widgets and output ---
display(order_dropdown, family_dropdown, genus_dropdown, out)


In [None]:
import dash
from dash import dcc, html
import pandas as pd
import plotly.graph_objects as go

# Load dataset (same as in Colab, via gdown or small CSV in repo)
df = pd.read_csv("new_final_species_cleaned.csv")

# Create Dash app
app = dash.Dash(__name__)

# Distinct orders, computed once and with missing values removed so NaN never
# becomes a dropdown option or the default selection.
order_choices = df['order'].dropna().unique()

# Layout: a single Order dropdown driving the bar chart below it.
app.layout = html.Div([
    html.H1("Species Visualization"),
    dcc.Dropdown(
        id='order_dropdown',
        options=[{'label': o, 'value': o} for o in order_choices],
        # Preselect the first order; fall back to no selection on an empty table.
        value=order_choices[0] if len(order_choices) else None
    ),
    dcc.Graph(id='species_plot')
])

@app.callback(
    dash.Output('species_plot', 'figure'),
    [dash.Input('order_dropdown', 'value')]
)
def update_plot(selected_order):
    """Return a bar chart of distinct species per family for the selected order.

    Args:
        selected_order: Current value of the Order dropdown.

    Returns:
        A ``go.Figure`` with one bar per family (tallest first). The figure has
        no bars when no row matches the selection.
    """
    filtered = df[df['order'] == selected_order]

    # The notebook's other cells read the family from `family_x` and derive
    # counts by grouping, so the per-family species count is computed here
    # rather than read from precomputed 'family'/'count' columns.
    # NOTE(review): assumes the table has no such columns — confirm against the CSV.
    species_per_family = (
        filtered.groupby('family_x')['species_canonical']
                .nunique()
                .sort_values(ascending=False)
    )

    fig = go.Figure()
    fig.add_bar(x=species_per_family.index, y=species_per_family.values)
    fig.update_layout(
        title="Species Count per Family",
        xaxis_title="Family",
        yaxis_title="Number of Species",
    )
    return fig

server = app.server  # <-- required for Hugging Face

ModuleNotFoundError: No module named 'dash'

In [None]:
import pandas as pd
import plotly.graph_objects as go
import ipywidgets as widgets
from IPython.display import display

# NOTE: this cell does not load data itself; it works on the `df` left in the
# kernel by an earlier cell.

# Clean the taxonomy columns: drop rows missing any level, then cast each level
# to string and trim surrounding whitespace.
taxonomy_columns = ["order", "family_x", "genus", "species_canonical"]
df = df.dropna(subset=taxonomy_columns)
for column in taxonomy_columns:
    df[column] = df[column].astype(str).str.strip()

# Build hierarchy
# Three lookup tables used by the dropdown observers and the Sankey builder:
#   order_to_family:               order                  -> set of families
#   family_to_genus:               (order, family)        -> set of genera
#   order_family_genus_to_species: (order, family, genus) -> set of species
order_to_family = {}
family_to_genus = {}
order_family_genus_to_species = {}

# Iterate plain tuples of the distinct lineages instead of df.iterrows():
# repeated rows add nothing to the sets, and tuple iteration avoids building a
# Series object per row.
lineages = df[["order", "family_x", "genus", "species_canonical"]].drop_duplicates()
for order, family, genus, species in lineages.itertuples(index=False, name=None):
    order_to_family.setdefault(order, set()).add(family)
    family_to_genus.setdefault((order, family), set()).add(genus)
    order_family_genus_to_species.setdefault((order, family, genus), set()).add(species)

# Widgets
# Order lists every known order after its placeholder; Family and Genus start
# with only their placeholder and are filled in by the observers.
order_dropdown = widgets.Dropdown(
    description="Order",
    options=["Select Order", *sorted(order_to_family)],
)
family_dropdown = widgets.Dropdown(
    description="Family",
    options=["Select Family"],
)
genus_dropdown = widgets.Dropdown(
    description="Genus",
    options=["Select Genus"],
)
# Unchecked: species are summarised in one node; checked: one node per species.
detailed_view = widgets.Checkbox(description="Show all species", value=False)

def update_family_options(change):
    """Refill the Family dropdown for the newly selected order and reset Genus.

    `change` is the observer's change dict; only its "new" entry is read. An
    order with no entry in `order_to_family` (e.g. the placeholder) leaves just
    the Family placeholder.
    """
    known_families = order_to_family.get(change["new"], ())
    family_dropdown.options = ["Select Family", *sorted(known_families)]
    # A new order invalidates any previously listed genera.
    genus_dropdown.options = ["Select Genus"]

def update_genus_options(change):
    """Refill the Genus dropdown for the current order and newly selected family.

    `change` is the observer's change dict; only its "new" entry is read. A
    (order, family) pair with no entry in `family_to_genus` leaves just the
    Genus placeholder.
    """
    lineage = (order_dropdown.value, change["new"])
    known_genera = family_to_genus.get(lineage, ())
    genus_dropdown.options = ["Select Genus", *sorted(known_genera)]

# Cascade: Order changes refresh Family; Family changes refresh Genus.
order_dropdown.observe(update_family_options, names="value")
family_dropdown.observe(update_genus_options, names="value")

def create_sankey(*args):
    """Draw the Order -> Family -> Genus -> species Sankey for the current selection.

    Used as the button's click handler, so it accepts and ignores any
    positional arguments and reads the selection from the dropdown widgets.
    Nothing is drawn while any dropdown still shows its placeholder.

    With "Show all species" checked (and at least one species known) each
    species gets its own node; otherwise the species are summarised in a single
    node labelled with their count.
    """
    order, family, genus = order_dropdown.value, family_dropdown.value, genus_dropdown.value
    if order in ["Select Order", None] or family in ["Select Family", None] or genus in ["Select Genus", None]:
        return

    species_list = sorted(order_family_genus_to_species.get((order, family, genus), []))
    species_count = len(species_list)

    # Nodes are addressed by position rather than looked up by label: two
    # levels may share a name, and a label-keyed lookup would then alias them
    # and create a self-referencing link.
    order_node, family_node, genus_node = 0, 1, 2
    nodes = [order, family, genus]

    # Upper links carry the species count so flow in equals flow out at the genus.
    sources = [order_node, family_node]
    targets = [family_node, genus_node]
    values = [species_count, species_count]

    if detailed_view.value and species_list:
        # One leaf node and one unit-weight link per species.
        first_leaf = len(nodes)
        nodes.extend(species_list)
        sources.extend([genus_node] * species_count)
        targets.extend(range(first_leaf, first_leaf + species_count))
        values.extend([1] * species_count)
    else:
        # Single summary leaf carrying the whole species count.
        nodes.append(f"{genus} Species ({species_count})")
        sources.append(genus_node)
        targets.append(len(nodes) - 1)
        values.append(species_count)

    fig = go.Figure(go.Sankey(
        node=dict(label=nodes, pad=15, thickness=20),
        link=dict(source=sources, target=targets, value=values)
    ))
    fig.update_layout(title=f"Sankey for {order} → {family} → {genus}")
    fig.show()

# The diagram is drawn on demand when the button is clicked.
button = widgets.Button(description="Generate Sankey")
button.on_click(create_sankey)

display(order_dropdown, family_dropdown, genus_dropdown, detailed_view, button)


Dropdown(description='Order', options=('Select Order', 'Acanthuriformes', 'Accipitriformes', 'Acipenseriformes…

Dropdown(description='Family', options=('Select Family',), value='Select Family')

Dropdown(description='Genus', options=('Select Genus',), value='Select Genus')

Checkbox(value=False, description='Show all species')

Button(description='Generate Sankey', style=ButtonStyle())