In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the harmonized phytoplankton occurrence table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so mixed-type columns get one consistent dtype.
# NOTE(review): the echoed source line in this cell's saved output looks like
# a pandas DtypeWarning (mixed types) — confirm, and consider passing an
# explicit dtype= mapping for the affected columns.
df = pd.read_csv(
    'Phytoplankton_harmonized_database_revised.csv',
    sep=",",
    encoding='latin1',
    low_memory=False,
)

  df = pd.read_csv('Phytoplankton_harmonized_database_revised.csv', sep=",", encoding='latin1')


In [3]:
# Region configuration used by the India-priority filtering below.
# When True, plots are restricted to the India region if any rows fall in it.
prioritize_india = True
# One of: 'india_region' (bounding box), 'india_land' (polygon), 'global'.
region_mode = 'india_region'
# Bounding box in decimal degrees: latitude 6.0–37.5, longitude 68.0–98.0
# (Arabian Sea to Bay of Bengal coasts; Andaman/Nicobar latitudes only partly).
INDIA_BBOX = dict(
    lat_min=6.0,
    lat_max=37.5,
    lon_min=68.0,
    lon_max=98.0,
)

def filter_to_region(df_in, mode='india_region', bbox=None):
    """
    Filter a DataFrame of occurrence records by geographic region.

    Parameters
    ----------
    df_in : pandas.DataFrame or None
        Records with 'decimalLatitude' and 'decimalLongitude' columns.
        ``None`` or an empty frame is returned unchanged.
    mode : str, default 'india_region'
        - 'global': no spatial filter (coordinates are still cleaned).
        - 'india_region': fast bounding-box filter (coasts + nearby offshore).
        - 'india_land': point-in-polygon test against the Natural Earth
          polygon for India via geopandas; falls back to the bounding box
          if geopandas, the dataset, or the polygon is unavailable.
        Any other value is treated like 'india_region'.
    bbox : dict, optional
        Mapping with keys 'lat_min', 'lat_max', 'lon_min', 'lon_max'
        (bounds are inclusive). Defaults to the module-level INDIA_BBOX.

    Returns
    -------
    pandas.DataFrame
        A filtered copy of ``df_in`` with both coordinate columns coerced to
        numeric and rows lacking a usable coordinate dropped. The input frame
        is never modified.

    Raises
    ------
    KeyError
        If a non-empty ``df_in`` lacks one of the coordinate columns.
    """
    if df_in is None or len(df_in) == 0:
        return df_in

    coord_cols = ['decimalLatitude', 'decimalLongitude']
    # Fail early with a readable message; previously a missing column only
    # surfaced as an opaque KeyError from dropna() further down.
    missing = [col for col in coord_cols if col not in df_in.columns]
    if missing:
        raise KeyError(f"filter_to_region requires coordinate column(s): {missing}")

    df1 = df_in.copy()
    # ensure coordinate columns are numeric; unparseable values become NaN
    for col in coord_cols:
        df1[col] = pd.to_numeric(df1[col], errors='coerce')
    df1 = df1.dropna(subset=coord_cols)

    if mode == 'global':
        return df1

    if mode == 'india_land':
        # Best-effort polygon filter: any failure (geopandas not installed,
        # bundled dataset unavailable, ...) is reported and the bounding-box
        # filter below is used instead.
        try:
            import geopandas as gpd
            gdf = gpd.GeoDataFrame(
                df1,
                geometry=gpd.points_from_xy(df1['decimalLongitude'], df1['decimalLatitude']),
                crs='EPSG:4326'
            )
            world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
            india = world[world['name'] == 'India']
            if india.empty:
                print("India polygon not found in Natural Earth; falling back to bbox.")
            else:
                try:
                    joined = gpd.sjoin(gdf, india[['geometry']], how='inner', predicate='within')
                except TypeError:  # older geopandas uses 'op'
                    joined = gpd.sjoin(gdf, india[['geometry']], how='inner', op='within')
                # drop geometry to return a pandas DataFrame-like structure
                result = joined.drop(columns=['geometry'], errors='ignore')
                # keep only the original columns (sjoin adds bookkeeping columns)
                return result[df1.columns.intersection(result.columns)]
        except Exception as e:
            print(f"Geopandas-based land filter failed ({e}); falling back to bbox.")

    # 'india_region', an unknown mode, or a failed 'india_land' attempt:
    # inclusive bounding-box filter.
    b = bbox or INDIA_BBOX
    lat_ok = df1['decimalLatitude'].between(b['lat_min'], b['lat_max'], inclusive='both')
    lon_ok = df1['decimalLongitude'].between(b['lon_min'], b['lon_max'], inclusive='both')
    return df1[lat_ok & lon_ok]

# Echo the active region settings so the saved output records which
# configuration the cells below were run with.
print(f"Config -> prioritize_india={prioritize_india}, region_mode='{region_mode}'")

Config -> prioritize_india=True, region_mode='india_region'


In [4]:
# Preview the first rows of the raw table to check that the CSV parsed as expected.
df.head()

Unnamed: 0,scientificName,decimalLongitude,decimalLatitude,year,month,day,depth,depthAccuracy,taxonRank,occurrenceStatus,...,scientificNameOriginal_gbif,scientificNameOriginal_obis,scientificNameOriginal_maredat,scientificNameOriginal_villar,scientificNameOriginal_sal,organismQuantity,organismQuantityType,individualCount,yearOfDataAccess,flag
0,Carteria marina,-79.215,-9.482,2000,2,11.0,10.0,,SPECIES,PRESENT,...,,Carteria marina,,,,,,,2015,NONE
1,Coccopterum labyrinthus,-11.6042,40.6832,1993,5,11.0,0.0,,SPECIES,PRESENT,...,Coccopterum labyrinthus,,,,,,,,2017,S
2,Coccopterum labyrinthus,-11.6042,40.6832,1993,5,11.0,139.0,,SPECIES,PRESENT,...,Coccopterum labyrinthus,,,,,,,,2017,S
3,Coccopterum labyrinthus,-11.6042,40.6832,1993,5,11.0,181.0,,SPECIES,PRESENT,...,Coccopterum labyrinthus,,,,,,,,2017,S
4,Coccopterum labyrinthus,-11.6042,40.6832,1993,5,11.0,22.0,,SPECIES,PRESENT,...,Coccopterum labyrinthus,,,,,,,,2017,S


In [5]:
# Distinct values of scientificName, in order of first appearance.
df['scientificName'].unique()

array(['Carteria marina', 'Coccopterum labyrinthus',
       'Dunaliella tertiolecta', ..., 'Fibrocapsa japonica',
       'Olisthodiscus luteus', 'Picoeukaryotes'], dtype=object)

In [6]:
# Distinct taxonomic classes, including NaN for rows without one.
# Bracket access is required here: `class` is a Python keyword.
df['class'].unique()

array(['Chlorophyceae', 'Prasinophyceae', 'Pyramimonadophyceae',
       'Chlorodendrophyceae', 'Chlorophyta incertae sedis',
       'Cryptophyceae', 'Cryptophyta incertae sedis', 'Telonemea',
       'Cyanophyceae', 'Euglenoidea', 'Prymnesiophyceae',
       'Haptophyta incertae sedis', 'Coccolithophyceae', 'Dinophyceae',
       'Bacillariophyceae', 'Chrysophyceae', 'Dictyochophyceae',
       'Pelagophyceae', 'Raphidophyceae', nan], dtype=object)

In [7]:
# Distinct phyla, including NaN for rows without one.
df['phylum'].unique()

array(['Chlorophyta', 'Cryptophyta', 'Cyanobacteria', 'Euglenozoa',
       'Haptophyta', 'Myzozoa', 'Ochrophyta', nan], dtype=object)

In [8]:
# Drop columns that are not used in this analysis.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: the in-place version raised KeyError on a second
# execution because the columns had already been removed.
df = df.drop(columns=['depthAccuracy', 'occurrenceStatus'], errors='ignore')

In [9]:
# Display the frame after the column drop (pandas shows a truncated view:
# first/last rows with elided rows and columns).
df

Unnamed: 0,scientificName,decimalLongitude,decimalLatitude,year,month,day,depth,taxonRank,phylum,class,...,scientificNameOriginal_gbif,scientificNameOriginal_obis,scientificNameOriginal_maredat,scientificNameOriginal_villar,scientificNameOriginal_sal,organismQuantity,organismQuantityType,individualCount,yearOfDataAccess,flag
0,Carteria marina,-79.2150,-9.4820,2000,2,11.0,10.0,SPECIES,Chlorophyta,Chlorophyceae,...,,Carteria marina,,,,,,,2015,NONE
1,Coccopterum labyrinthus,-11.6042,40.6832,1993,5,11.0,0.0,SPECIES,Chlorophyta,Prasinophyceae,...,Coccopterum labyrinthus,,,,,,,,2017,S
2,Coccopterum labyrinthus,-11.6042,40.6832,1993,5,11.0,139.0,SPECIES,Chlorophyta,Prasinophyceae,...,Coccopterum labyrinthus,,,,,,,,2017,S
3,Coccopterum labyrinthus,-11.6042,40.6832,1993,5,11.0,181.0,SPECIES,Chlorophyta,Prasinophyceae,...,Coccopterum labyrinthus,,,,,,,,2017,S
4,Coccopterum labyrinthus,-11.6042,40.6832,1993,5,11.0,22.0,SPECIES,Chlorophyta,Prasinophyceae,...,Coccopterum labyrinthus,,,,,,,,2017,S
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1360616,Picoeukaryotes,95.0100,-39.9900,1995,9,29.0,51.0,,,,...,,,Picoeukaryotes,,,10463000.0,number_of_cells_per_L,,2015_2017,NONE
1360617,Picoeukaryotes,95.0100,-43.0000,1995,9,28.0,0.0,,,,...,,,Picoeukaryotes,,,5163000.0,number_of_cells_per_L,,2015_2017,NONE
1360618,Picoeukaryotes,95.0100,-43.0000,1995,9,28.0,48.0,,,,...,,,Picoeukaryotes,,,5134000.0,number_of_cells_per_L,,2015_2017,NONE
1360619,Picoeukaryotes,95.0000,-31.7500,1995,10,2.0,0.0,,,,...,,,Picoeukaryotes,,,10949000.0,number_of_cells_per_L,,2015_2017,NONE


In [10]:
# Build the frame used for hotspot plots: drop the per-source original-name
# columns and other bookkeeping columns that the plots do not need.
dfhot = df.drop(columns=[
    'scientificNameOriginal_gbif',
    'scientificNameOriginal_obis',
    'scientificNameOriginal_maredat',
    'scientificNameOriginal_villar',
    'scientificNameOriginal_sal',
    'individualCount',
    'yearOfDataAccess',
])

# Coerce to numeric FIRST, then flag, then fill. Doing the coercion last (as
# before) left values that could not be parsed as NaN without flagging them
# in 'organismQuantity_was_na' or replacing them.
dfhot['organismQuantity'] = pd.to_numeric(dfhot['organismQuantity'], errors='coerce')
dfhot['organismQuantity_was_na'] = dfhot['organismQuantity'].isna()
# Records without a usable quantity are given a nominal quantity of 1.
dfhot['organismQuantity'] = dfhot['organismQuantity'].fillna(1)


In [11]:
# Inspect the first 500 rows of the plotting frame, including the filled
# organismQuantity values and the organismQuantity_was_na flag.
dfhot.head(500)

Unnamed: 0,scientificName,decimalLongitude,decimalLatitude,year,month,day,depth,taxonRank,phylum,class,...,originDatabase_maredat,cruiseOrStationID_maredat,taraStation_villar,cruise_sal,sampleID_sal,MLD_villar_sal,organismQuantity,organismQuantityType,flag,organismQuantity_was_na
0,Carteria marina,-79.2150,-9.48200,2000,2,11.0,10.0,SPECIES,Chlorophyta,Chlorophyceae,...,,,,,,,1.0,,NONE,True
1,Coccopterum labyrinthus,-11.6042,40.68320,1993,5,11.0,0.0,SPECIES,Chlorophyta,Prasinophyceae,...,,,,,,,1.0,,S,True
2,Coccopterum labyrinthus,-11.6042,40.68320,1993,5,11.0,139.0,SPECIES,Chlorophyta,Prasinophyceae,...,,,,,,,1.0,,S,True
3,Coccopterum labyrinthus,-11.6042,40.68320,1993,5,11.0,181.0,SPECIES,Chlorophyta,Prasinophyceae,...,,,,,,,1.0,,S,True
4,Coccopterum labyrinthus,-11.6042,40.68320,1993,5,11.0,22.0,SPECIES,Chlorophyta,Prasinophyceae,...,,,,,,,1.0,,S,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,Halosphaera viridis,-115.7167,29.48333,1987,3,10.0,50.0,SPECIES,Chlorophyta,Pyramimonadophyceae,...,,,,,,,1.0,,NONE,True
496,Halosphaera viridis,-115.7167,29.48333,1987,3,21.0,0.0,SPECIES,Chlorophyta,Pyramimonadophyceae,...,,,,,,,1.0,,NONE,True
497,Halosphaera viridis,-115.7167,29.48333,1987,3,21.0,10.0,SPECIES,Chlorophyta,Pyramimonadophyceae,...,,,,,,,1.0,,NONE,True
498,Halosphaera viridis,-115.7167,29.48333,1987,3,21.0,175.0,SPECIES,Chlorophyta,Pyramimonadophyceae,...,,,,,,,1.0,,NONE,True


In [12]:
import plotly.express as px
# Keep only rows where quantity, quantity type and both coordinates are all
# present (same rows as dropna(subset=...) on these four columns).
# NOTE(review): hotspot_df is not referenced by the later cells visible
# here — confirm whether it is still needed.
hotspot_df = dfhot[
    dfhot[['organismQuantity', 'organismQuantityType', 'decimalLatitude', 'decimalLongitude']]
    .notna()
    .all(axis=1)
]


In [14]:
import plotly.express as px

# Interactive hotspot map for one taxon:
#   1. rebuild dfhot from df,
#   2. ask for a scientific name and find exact, then partial, matches,
#   3. optionally restrict to the India region (with global fallback),
#   4. aggregate quantities per rounded coordinate and plot the largest ones.

# prepare dfhot (drop extras) and keep a flag for filled quantities
dfhot = df.drop(columns=[
    'scientificNameOriginal_gbif', 'scientificNameOriginal_obis',
    'scientificNameOriginal_maredat', 'scientificNameOriginal_villar',
    'scientificNameOriginal_sal', 'individualCount', 'yearOfDataAccess'
])

# Coerce first, then flag, then fill, so that values which cannot be parsed
# as numbers are flagged and filled too (coercing last left them as NaN).
dfhot['organismQuantity'] = pd.to_numeric(dfhot['organismQuantity'], errors='coerce')
dfhot['organismQuantity_was_na'] = dfhot['organismQuantity'].isna()
dfhot['organismQuantity'] = dfhot['organismQuantity'].fillna(1)

# maximum number of aggregated locations handed to plotly (keeps the figure responsive)
top_n = 10000

# Ask user
scientific_name = input("Enter the scientific name to plot: ").strip()
if not scientific_name:
    # An empty query would otherwise "partially match" every row.
    print("No scientific name entered; nothing to plot.")
    raise SystemExit

# find matches globally first (exact, then partial)
# Filter on the raw column: astype(str) turns NaN into the string 'nan', so
# the previous astype(str).notna() check never removed anything.
base = dfhot[dfhot['scientificName'].notna()].copy()
names = base['scientificName'].astype(str).str.strip()
exact_matches = base[names == scientific_name]

if exact_matches.empty:
    # case-insensitive substring match; regex=False so characters such as
    # '.', '(' or '+' in the typed name are matched literally
    candidates = base[names.str.contains(scientific_name, case=False, na=False, regex=False)]
    if candidates.empty:
        # suggest the most frequent names instead
        suggestions = names.value_counts().head(20)
        print("No exact or partial matches found for:", scientific_name)
        print("Top 20 most frequent scientificName values (use one of these or try a partial):")
        print(suggestions)
        raise SystemExit
    else:
        print(f"Found {len(candidates)} partial matches for '{scientific_name}'.")
        species_df_global = candidates.copy()
else:
    species_df_global = exact_matches.copy()
    print(f"Found {len(species_df_global)} exact matches for '{scientific_name}'.")

# India-priority filtering
if prioritize_india:
    species_df_region = filter_to_region(species_df_global, mode=region_mode)
    if species_df_region.empty and region_mode == 'india_land':
        # fallback to bbox if land polygon produced none
        species_df_region = filter_to_region(species_df_global, mode='india_region')
    if species_df_region.empty:
        print("No rows in selected India region; falling back to global for plotting.")
        species_df = species_df_global.copy()
        title_prefix = f"{scientific_name} — global (India-priority fallback)"
    else:
        species_df = species_df_region.copy()
        title_prefix = f"{scientific_name} — India-priority ({region_mode})"
else:
    species_df = species_df_global.copy()
    title_prefix = f"{scientific_name} — global"

# ensure numeric coords and drop rows missing coords
# (the global paths above do not go through filter_to_region)
species_df['decimalLatitude'] = pd.to_numeric(species_df['decimalLatitude'], errors='coerce')
species_df['decimalLongitude'] = pd.to_numeric(species_df['decimalLongitude'], errors='coerce')
species_df = species_df.dropna(subset=['decimalLatitude', 'decimalLongitude', 'organismQuantity'])

if species_df.empty:
    print("No rows with valid coordinates and organismQuantity to plot for this selection.")
    raise SystemExit

# aggregate by coordinates rounded to 4 decimals to reduce overlap / size
# NOTE(review): quantities are summed regardless of organismQuantityType, and
# rows whose quantity was missing contribute 1 — confirm this mix is intended.
species_df = species_df.assign(
    lat_round=species_df['decimalLatitude'].round(4),
    lon_round=species_df['decimalLongitude'].round(4),
)
agg = species_df.groupby(['lat_round', 'lon_round']).agg({
    'organismQuantity': 'sum',
    'scientificName': 'first'  # with partial matches this is just the first name seen at that location
}).reset_index().rename(columns={'lat_round': 'decimalLatitude', 'lon_round': 'decimalLongitude'})

# take top N to avoid lag
agg = agg.sort_values('organismQuantity', ascending=False).head(top_n)

fig = px.scatter_geo(
    agg,
    lat='decimalLatitude',
    lon='decimalLongitude',
    color='organismQuantity',
    size='organismQuantity',
    hover_name='scientificName',
    projection='natural earth',
    title=f'Top hotspots for {title_prefix} (aggregated, top {top_n})'
)
fig.update_layout(height=600)
fig.show()

Found 33011 partial matches for 'leptocylindrus'.
