In [None]:
# Download images from Discover Life (DL) and save a CSV file with Filename, Identifier, scientificName, CommonName, and Comments columns

## Note: you might have to download in segments, because Discover Life can blacklist IP addresses
import os
import requests
from bs4 import BeautifulSoup
import csv

# Function to download an image
def download_image(url, folder, filename, timeout=30):
    """Stream the image at `url` into `folder/filename`.

    Args:
        url: Address of the image to fetch.
        folder: Existing directory the file is written into.
        filename: Name of the file to create inside `folder`.
        timeout: Seconds passed to `requests.get` as its timeout, so a
            stalled connection cannot hang the whole run.

    Returns:
        The path of the written file, or None when the server answers with
        a status code other than 200.

    Raises:
        Whatever `requests.get`, the streamed read, or the file write raise;
        a partially written file is removed before the error propagates.
    """
    # The context manager closes the streamed response (and releases its
    # connection) even when we return early or an error occurs.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            return None

        filepath = os.path.join(folder, filename)
        try:
            with open(filepath, 'wb') as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
        except BaseException:
            # Do not leave a truncated image behind (this also covers a
            # manual interrupt of a long download run); re-raise unchanged.
            if os.path.exists(filepath):
                os.remove(filepath)
            raise
        return filepath

# Resolves each <img src> against the site root below.
from urllib.parse import urljoin

# Configurables
output_folder = "/Users/markfisher/Desktop/cr_moth_classification/cr_images"
csv_filename = "fisher_image_data.csv"
photoAuthorCode = "I_MFS"
baseDownloadUrl = f"https://www.discoverlife.org/mp/20p?res=640&see={photoAuthorCode}/"
site_root = "https://www.discoverlife.org"
request_timeout = 30  # seconds; keeps a stalled page request from hanging the run


# Define the folder for saving images
os.makedirs(output_folder, exist_ok=True)

# Prepare CSV file
csv_headers = ["Filename", "Identifier", "scientificName", "CommonName", "Comments"]

# Create (or truncate) the CSV and write the header row once. The file is then
# reopened in append mode for every page, so rows from completed pages are
# already on disk if a later page fails.
with open(csv_filename, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(csv_headers)  # Write headers only once

# Loop through the URLs
# for pageNum in range(0, 359):
for pageNum in range(118, 164):
    url = f"{baseDownloadUrl}{pageNum:04}&flags=col9:"
    try:
        response = requests.get(url, timeout=request_timeout)
        # A non-2xx answer would otherwise be parsed as an empty page and
        # reported as "Processed".
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f"Stopping at page {pageNum:04}: request failed ({exc}). "
              f"Resume the range from this page.")
        break
    soup = BeautifulSoup(response.text, 'html.parser')

    # Append rows to the CSV file
    with open(csv_filename, "a", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)

        # Parse the HTML to find image entries
        for td in soup.find_all("td", valign="top"):
            img_tag = td.find("img")
            if img_tag and "src" in img_tag.attrs:
                img_src = img_tag["src"]
                # urljoin handles root-relative, relative, and absolute src
                # values; plain concatenation only handled the first.
                img_url = urljoin(site_root, img_src)
                filename = os.path.basename(img_src)

                # Download the image; None means the server did not return 200
                if download_image(img_url, output_folder, filename) is None:
                    print(f"Warning: could not download {img_url} (page {pageNum:04}).")

                # Extract text elements (identifier, scientific name, common name, comments)
                text_elements = td.find_all(string=True)
                text_values = [text.strip() for text in text_elements if text.strip()]

                # Ensure exactly 4 columns (fill with None if missing)
                text_values += [None] * (4 - len(text_values))

                # Write the data row to CSV
                writer.writerow([filename] + text_values[:4])

    print(f"Processed page {pageNum:04}: Images downloaded and data appended to '{csv_filename}'.")
else:
    # Only reached when the loop was not stopped early by a failed page request.
    print(f"All pages processed. Images saved to '{output_folder}' and metadata saved to '{csv_filename}'.")


In [None]:
# Combine all of the target csvs generated from the above download runs

import os
import pandas as pd

# Specify the directory containing the CSV files
input_directory = "/Users/markfisher/Desktop/cr_moth_classification/target_csvs"  # Change this to the path where your CSV files are stored
output_file = "combined_output.csv"  # Name of the output file

# Get a list of all CSV files in the directory.
# Sorted because os.listdir returns entries in arbitrary order, and the
# combined row order should be the same on every run.
csv_files = sorted(file for file in os.listdir(input_directory) if file.endswith('.csv'))

# Read every file first, then concatenate once. Concatenating inside the loop
# re-copies all previous rows on each iteration, and starting from an empty
# DataFrame triggers a pandas FutureWarning about empty entries in concat.
frames = [pd.read_csv(os.path.join(input_directory, csv_file)) for csv_file in csv_files]

# pd.concat raises on an empty list, so fall back to an empty frame when the
# directory holds no CSV files.
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Save the combined data to a new CSV file
combined_df.to_csv(output_file, index=False)

print(f"Combined {len(csv_files)} files into {output_file}")


In [None]:
# Get all unique sciNames from the combined csv file above

## This is only needed when creating the initial classification training data. Creates and de-dupes the scientificName column

import pandas as pd

# Specify the input CSV file
input_file = "combined_output.csv"  # Change this to your input CSV file
output_file = "unique_values_output.csv"  # Name of the output file

# Specify the column name to extract unique values from
target_column = "scientificName"  # Replace with the actual column name you want to target

try:
    # Read the input CSV file
    df = pd.read_csv(input_file)

    # Check if the target column exists
    if target_column not in df.columns:
        raise ValueError(f"Column '{target_column}' not found in the input file.")

    # Rows without a value are read back as NaN; drop them and strip stray
    # whitespace so the output has no blank entry and no near-duplicates that
    # differ only in surrounding spaces.
    names = df[target_column].dropna().astype(str).str.strip()
    names = names[names != ""]

    # Extract unique values from the target column (first occurrence order kept)
    unique_values = names.drop_duplicates().reset_index(drop=True)

    # Save the unique values to a new CSV file
    unique_values.to_csv(output_file, index=False, header=[target_column])

    print(f"Unique values from column '{target_column}' have been written to {output_file}")

except Exception as e:
    # Reported rather than raised, so the cell always finishes; check the
    # message before using the output file.
    print(f"An error occurred: {e}")


In [None]:
# Run the GBIF tool (TODO: add the link from the Slack conversation with Michael) to generate the scientificName values as recognized by GBIF

## E.g.

## @TODO render this into a pretty table
occurrenceId,verbatimScientificName,scientificName,key,matchType,confidence,status,rank,kingdom,phylum,class,order,family,genus,species,canonicalName,authorship,usageKey,acceptedUsageKey
,"Lobocleta tenellata","Lobocleta tenellata (Möschler, 1886)","1990261","EXACT","99","ACCEPTED","SPECIES","Animalia","Arthropoda","Insecta","Lepidoptera","Geometridae","Lobocleta","Lobocleta tenellata","Lobocleta tenellata","(Möschler, 1886) ","1990261",
,"Notodontidae","Notodontidae","7016","EXACT","94","ACCEPTED","FAMILY","Animalia","Arthropoda","Insecta","Lepidoptera","Notodontidae",,,"Notodontidae",,"7016",
,"Clemensia leopardina","Clemensia leopardina Schaus, 1911","5114902","EXACT","99","ACCEPTED","SPECIES","Animalia","Arthropoda","Insecta","Lepidoptera","Erebidae","Clemensia","Clemensia leopardina","Clemensia leopardina","Schaus, 1911","5114902",
,"Rhabdatomis laudamia","Rhabdatomis laudamia Druce, 1885","1811258","EXACT","99","ACCEPTED","SPECIES","Animalia","Arthropoda","Insecta","Lepidoptera","Erebidae","Rhabdatomis","Rhabdatomis laudamia","Rhabdatomis laudamia","Druce, 1885","1811258",
,"Eupithecia sp_group_san_luis","Eupithecia Curtis, 1825","1982367","HIGHERRANK","92","ACCEPTED","GENUS","Animalia","Arthropoda","Insecta","Lepidoptera","Geometridae","Eupithecia",,"Eupithecia","Curtis, 1825","1982367",
,"Crambidia myrlosea","Crambidia myrlosea Dyar, 1917","1805542","EXACT","99","ACCEPTED","SPECIES","Animalia","Arthropoda","Insecta","Lepidoptera","Erebidae","Crambidia","Crambidia myrlosea","Crambidia myrlosea","Dyar, 1917","1805542",
,"Leptostales crossii","Leptostales crossii (Hulst, 1900)","9619088","EXACT","99","ACCEPTED","SPECIES","Animalia","Arthropoda","Insecta","Lepidoptera","Geometridae","Leptostales","Leptostales crossii","Leptostales crossii","(Hulst, 1900) ","9619088",
,"Meganola sp_san_luis_b","Meganola Dyar, 1898","4405493","HIGHERRANK","94","ACCEPTED","GENUS","Animalia","Arthropoda","Insecta","Lepidoptera","Nolidae","Meganola",,"Meganola","Dyar, 1898","4405493",
,"Crambidae","Crambidae","8841","EXACT","94","ACCEPTED","FAMILY","Animalia","Arthropoda","Insecta","Lepidoptera","Crambidae",,,"Crambidae",,"8841",