In [9]:
!pip install openai

Collecting openai
  Downloading openai-2.0.0-py3-none-any.whl.metadata (29 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Using cached distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Using cached jiter-0.11.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (5.2 kB)
Collecting tqdm>4 (from openai)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading openai-2.0.0-py3-none-any.whl (955 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m955.5/955.5 kB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached distro-1.9.0-py3-none-any.whl (20 kB)
Using cached jiter-0.11.0-cp312-cp312-macosx_11_0_arm64.whl (316 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm, jiter, distro, openai
Successfully installed distro-1.9.0 jiter-0.11.0 openai-2.0.0 tqdm-4.67.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [

In [14]:
import pandas as pd

In [15]:
# Load occurrence_parsed.csv into a DataFrame for the cells below.
occurrence_csv_path = '/Users/shawnpana/Documents/GitHub/sushihacks2025/backend/data/occurrence_parsed.csv'
df = pd.read_csv(occurrence_csv_path)

In [16]:

# Write the unique, non-missing values of the 'scientificName' column to output.csv,
# one per line under a 'scientificName' header row.
# dropna() matters here: drop_duplicates() on its own keeps one NaN entry, which is
# written as a blank field and comes back from read_csv as a float NaN instead of a
# string, breaking any consumer that slices or formats the name.
unique_scientific_names = df['scientificName'].dropna().drop_duplicates()
unique_scientific_names.to_csv('/Users/shawnpana/Documents/GitHub/sushihacks2025/backend/data/output.csv', index=False, header=True)

In [None]:
# First install openai if not already installed
# Run this in terminal: pip install openai

# Fish Classification using OpenAI API
import pandas as pd
import os
import time
from datetime import datetime

# Check if openai is installed; re-raise so the cell stops when it is missing.
try:
    from openai import OpenAI
except ImportError:
    print("❌ OpenAI library not found. Please install it with: pip install openai")
    print("After installing, restart the kernel and run this cell again")
    raise
else:
    print("✅ OpenAI library is installed")

# Initialize OpenAI client.
# Fail fast when the key is missing or blank: without a key the client constructor
# raises anyway, and with a blank key every later request would fail one by one.
# Stopping here gives a single, actionable error instead.
api_key = os.getenv("OPENAI_API_KEY", "").strip()
if not api_key:
    print("❌ OPENAI_API_KEY environment variable not set")
    print("Set it with: export OPENAI_API_KEY='your-key-here'")
    raise RuntimeError("OPENAI_API_KEY environment variable is not set")
print("✅ API key found")

client = OpenAI(api_key=api_key)

def classify_fish(scientific_name: str, api_client=None):
    """Classify a single fish species with the OpenAI chat completions API.

    Sends one request to the ``gpt-4o-mini`` model asking for six
    pipe-separated values, then normalises the reply.

    Args:
        scientific_name: Species name interpolated into the prompt.
        api_client: Optional object exposing ``chat.completions.create``.
            When omitted, the notebook-level ``client`` is used. Passing a
            stub here allows the parsing to be exercised without network access.

    Returns:
        A six-item list
        ``[cleaning_difficulty, commonality, availability, peak_season, is_edible, source]``.
        The first three items are lower-cased strings, ``peak_season`` is
        ``"Month-Month"`` (each part capitalised) or ``"Year-Round"``, and
        ``is_edible`` is a bool.

        If the reply does not split into exactly six fields the result is
        ``['unknown', 'unknown', 'unknown', 'unknown', False, 'GPT-4 Estimation']``.
        If anything raises, the result is
        ``['error', 'error', 'error', 'error', False, 'Error']``.
        In both fallbacks ``is_edible`` is only a placeholder, not a finding.
    """
    try:
        # Resolved inside the try so that a missing notebook-level client is
        # reported through the same error row as any other failure.
        active_client = client if api_client is None else api_client
        response = active_client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {
                    "role": "system",
                    "content": "You are a marine biology and culinary expert. Classify fish species accurately based on scientific knowledge and fisheries data. Respond with EXACTLY 6 values separated by pipe symbols (|). No extra text."
                },
                {
                    "role": "user",
                    "content": f"""Classify the fish species '{scientific_name}':
1. Cleaning Difficulty: easy, medium, or hard
2. Market Commonality: common, uncommon, or rare
3. General Availability: year-round, seasonal, or rarely-available
4. Peak Season: ONLY format as Month-Month (e.g., "June-August" or "December-February" or "Year-Round" if available all year)
5. Is Edible: yes or no (consider if safe for human consumption)
6. Primary Data Source: The main source for this information (e.g., "FAO Fisheries", "NOAA Database", "FishBase", "SeaLifeBase", "Marine Biology Research", "General Knowledge")

Format EXACTLY as: difficulty|commonality|availability|peak_season|edible|source
Example: medium|common|seasonal|June-September|yes|FAO Fisheries"""
                }
            ],
            temperature=0.3,
            max_tokens=60
        )

        result = response.choices[0].message.content.strip()
        # The prompt shows quoted examples, so replies sometimes wrap fields in
        # quotes or backticks; remove them along with surrounding whitespace.
        parts = [field.strip().strip('"\'`').strip() for field in result.split('|')]

        if len(parts) != 6:
            print(f"  ⚠️ Unexpected format for {scientific_name}: {result}")
            return ['unknown', 'unknown', 'unknown', 'unknown', False, 'GPT-4 Estimation']

        # Normalise the peak season: treat en/em dashes as hyphens, accept
        # spelling variants of "year-round" ("Year Round", "year-round", ...),
        # and strip each month before capitalising so "june - august"
        # becomes "June-August" rather than "June- august".
        peak_season = parts[3].replace('–', '-').replace('—', '-')
        if peak_season.lower().replace(' ', '').replace('-', '') == 'yearround':
            peak_season = 'Year-Round'
        else:
            peak_season = '-'.join(month.strip().capitalize() for month in peak_season.split('-'))

        # Convert edibility to boolean; tolerate trailing punctuation ("Yes.").
        is_edible = parts[4].lower().rstrip('.!') in ('yes', 'edible', 'true', '1')

        # Fall back to a generic label when the model left the source empty.
        source = parts[5] if parts[5] else 'GPT-4 Knowledge Base'

        return [parts[0].lower(), parts[1].lower(), parts[2].lower(), peak_season, is_edible, source]

    except Exception as e:
        # Best-effort: one failed species must not abort a long batch run.
        print(f"  ❌ Error with {scientific_name}: {str(e)[:50]}")
        return ['error', 'error', 'error', 'error', False, 'Error']

# Process fish species: classify every name in output.csv and write the results to CSV.
output_rows = []
csv_path = '/Users/shawnpana/Documents/GitHub/sushihacks2025/backend/data/output.csv'
# Every result file (checkpoint, final, partial, error) is written next to the input CSV.
data_dir = os.path.dirname(csv_path)


def _save_rows(rows, filename):
    """Write the classification rows to ``filename`` inside ``data_dir`` and return the DataFrame."""
    frame = pd.DataFrame(rows)
    frame.to_csv(os.path.join(data_dir, filename), index=False)
    return frame


try:
    # Read species names; fall back to the first column when there is no
    # 'scientificName' header.
    df_species = pd.read_csv(csv_path)
    if 'scientificName' in df_species.columns:
        name_column = df_species['scientificName']
    else:
        name_column = df_species.iloc[:, 0]

    # Blank CSV fields come back from read_csv as float NaN, which would crash
    # the name[:30] slice below and abort the whole run; drop them, along with
    # names that are empty after stripping whitespace.
    scientific_names = [str(raw_name).strip() for raw_name in name_column.dropna()]
    scientific_names = [name for name in scientific_names if name]
    skipped_names = len(name_column) - len(scientific_names)
    if skipped_names:
        print(f"⚠️ Skipped {skipped_names} blank or missing species names")

    total_species = len(scientific_names)
    print(f"\n📊 Processing ALL {total_species} fish species...")
    print("Will classify: cleaning difficulty, commonality, seasonality, peak season, edibility, and source")
    print("-" * 90)

    # Process ALL species
    for i, name in enumerate(scientific_names, 1):
        print(f"{i:4d}/{total_species}. {name[:30]:<30}", end=" ")

        difficulty, commonality, seasonality, peak_season, is_edible, source = classify_fish(name)
        output_rows.append({
            'scientificName': name,
            'cleaning_difficulty': difficulty,
            'commonality': commonality,
            'seasonality': seasonality,
            'peak_season': peak_season,  # Format: "June-August" or "Year-Round"
            'is_edible': is_edible,  # Boolean: True or False
            'data_source': source,  # Source of information
            'classification_date': datetime.now().strftime('%Y-%m-%d')  # When classified
        })

        # Shorter output with emoji indicators
        edible_emoji = "✅" if is_edible else "❌"
        print(f"→ {peak_season[:12]:<12} {edible_emoji} [{source[:15]}]")

        # Rate limiting to avoid hitting API rate limits
        if i % 3 == 0:
            time.sleep(0.5)  # Small delay every 3 requests

        if i % 50 == 0:
            # Save intermediate progress every 50 species
            _save_rows(output_rows, 'fish_classification_temp.csv')
            print(f"  💾 Saved progress: {i}/{total_species} species processed")
            time.sleep(2)  # Longer pause every 50 to avoid rate limits

    # Save final results
    output_path = os.path.join(data_dir, 'fish_classification.csv')
    df_output = _save_rows(output_rows, 'fish_classification.csv')

    print("-" * 90)
    print(f"✅ Successfully classified ALL {len(output_rows)} fish species!")
    print(f"📄 Results saved to: fish_classification.csv")

    if df_output.empty:
        # With no rows there are no columns to summarise and the percentages
        # would divide by zero.
        print("\n⚠️ No species were found in the input file; nothing to summarise")
    else:
        total_rows = len(df_output)
        edible_mask = df_output['is_edible'].astype(bool)
        # classify_fish marks its fallback rows with these two source labels.
        # Their is_edible=False is only a placeholder, so report them separately
        # instead of counting them as non-edible.
        failed_mask = df_output['data_source'].isin(['Error', 'GPT-4 Estimation'])
        edible_count = int(edible_mask.sum())
        non_edible_count = int((~edible_mask & ~failed_mask).sum())
        failed_count = int(failed_mask.sum())

        # Summary statistics
        print("\n📊 Summary Statistics:")
        print(f"  Edible fish: {edible_count} ({edible_count / total_rows * 100:.1f}%)")
        print(f"  Non-edible fish: {non_edible_count} ({non_edible_count / total_rows * 100:.1f}%)")
        print(f"  Unclassified (API error or unexpected reply): {failed_count} ({failed_count / total_rows * 100:.1f}%)")
        print(f"  Year-round availability: {(df_output['seasonality'] == 'year-round').sum()}")
        print(f"  Seasonal fish: {(df_output['seasonality'] == 'seasonal').sum()}")

        # Data source breakdown
        print("\n📚 Data Sources Used:")
        source_counts = df_output['data_source'].value_counts()
        for source_name, count in source_counts.head(10).items():
            print(f"  {source_name}: {count} species ({count / total_rows * 100:.1f}%)")

        print("\n🐟 Sample Results:")
        sample_cols = ['scientificName', 'peak_season', 'is_edible', 'data_source']
        print(df_output[sample_cols].head(10))

        # Show up to five edible seasonal fish (fewer when fewer exist).
        seasonal_edible = df_output[edible_mask & (df_output['seasonality'] == 'seasonal')]
        if not seasonal_edible.empty:
            print("\n🎣 Sample of Edible Seasonal Fish:")
            print(seasonal_edible[['scientificName', 'peak_season', 'commonality', 'data_source']].sample(min(5, len(seasonal_edible))))

except FileNotFoundError:
    print(f"❌ File not found: {csv_path}")
    print("Run the previous cells to create output.csv first")
except KeyboardInterrupt:
    # Manual interruption: keep whatever has been classified so far.
    print(f"\n\n⚠️ Process interrupted! Processed {len(output_rows)} species so far")
    if output_rows:
        _save_rows(output_rows, 'fish_classification_partial.csv')
        print(f"💾 Partial results saved to fish_classification_partial.csv")
except Exception as e:
    # Any other failure: report it and keep the rows collected before it happened.
    print(f"❌ Error: {e}")
    if output_rows:
        _save_rows(output_rows, 'fish_classification_error.csv')
        print(f"💾 Results before error saved to fish_classification_error.csv")