In [10]:
import pandas as pd
import os
import pickle

In [11]:
# Mount Google Drive at /content/drive; the data paths used in later cells
# all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


DATA VISUALIZATION


In [17]:
# Restore the previously pickled image DataFrame from Drive and preview it.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that you produced yourself.
dataframe_pickle = "/content/drive/MyDrive/Plankton/image_dataframe.pkl"
with open(dataframe_pickle, "rb") as pickle_handle:
    df = pickle.load(pickle_handle)

# First rows, then the column index.
preview_rows = df.head()
print(preview_rows)
column_index = df.columns
print(column_index)


  Image_Class                                       Image_Matrix
0         mix  [[204, 204, 204, 203, 203, 203, 203, 200, 202,...
1         mix  [[200, 201, 203, 201, 199, 200, 202, 201, 199,...
2         mix  [[221, 222, 223, 224, 223, 222, 221, 222, 222,...
3         mix  [[192, 191, 188, 183, 182, 189, 185, 180, 185,...
4         mix  [[212, 212, 212, 212, 213, 213, 213, 211, 212,...
Index(['Image_Class', 'Image_Matrix'], dtype='object')


In [15]:
# Tally how many rows carry each class label.
class_labels = df["Image_Class"]
counts = class_labels.value_counts()
print(counts)



Image_Class
mix                      500
Ciliate_mix              500
Dactyliosolen            500
Cerataulina              500
mix_elongated            500
Cylindrotheca            500
Mesodinium_sp            500
DactFragCerataul         500
Guinardia_striata        500
Ditylum                  500
Asterionellopsis         500
Chaetoceros              500
Leptocylindrus           500
G_delicatula_parasite    500
bad                      500
Corethron                500
Dictyocha                500
Guinardia_delicatula     500
flagellate_sp3           500
Dinobryon                500
detritus                 500
dino30                   500
Name: count, dtype: int64


In [5]:
import zipfile
import os

# Path to your zip file
zip_path = '/content/drive/MyDrive/Plankton/WHOI_raw_data/2013.zip'

# Extract to the same directory as the zip file
extract_path = '/content/drive/MyDrive/Plankton/WHOI_raw_data/'

# Create the extraction directory if it doesn't exist
os.makedirs(extract_path, exist_ok=True)

# Unzip member by member instead of calling extractall() so that an
# interrupted run can be resumed: a file that is already on disk with the
# size recorded in the archive is skipped, everything else is (re-)extracted.
extracted_count = 0
skipped_count = 0
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    for member in zip_ref.infolist():
        target_path = os.path.join(extract_path, member.filename)
        already_extracted = (
            not member.is_dir()
            and os.path.isfile(target_path)
            # A size mismatch means a partially written file from an
            # interrupted run, which must be extracted again.
            and os.path.getsize(target_path) == member.file_size
        )
        if already_extracted:
            skipped_count += 1
            continue
        zip_ref.extract(member, extract_path)
        extracted_count += 1

print(f"Extracted {extracted_count} entries, skipped {skipped_count} already present")
print(f"Successfully extracted to {extract_path}")

KeyboardInterrupt: 

## LOAD DATASET

In [None]:
# Root folders to scan: one year-named directory per entry under the shared
# unzipped-data root.
unzipped_root = '/content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data'
folder_paths = [f"{unzipped_root}/{year}" for year in ('2007', '2013', '2014')]

In [None]:
# Walk every root folder and record one row per file. The name of the
# directory that directly contains a file is used as that file's class label.
all_images_data = []

for folder_path in folder_paths:
    print(f"--> Processing main folder: {folder_path}")

    # os.walk yields nothing (and raises nothing) for a path that does not
    # exist, e.g. when Drive is not mounted, so report it explicitly.
    if not os.path.isdir(folder_path):
        print(f"    Warning: folder not found, skipping: {folder_path}")
        continue

    for root, _subdirs, files in os.walk(folder_path):
        image_class = os.path.basename(root)

        for file_name in files:
            file_path = os.path.join(root, file_name)

            all_images_data.append({
                'Image Class': image_class,
                'Image Path': file_path
            })

print("\n...Processing complete!")

# Create a Pandas DataFrame from the collected data. The columns are named
# explicitly so the frame still has both columns when no files were found;
# otherwise the summary below would fail with a KeyError on 'Image Class'.
df = pd.DataFrame(all_images_data, columns=['Image Class', 'Image Path'])

# Display information about the final DataFrame
print("\n--- Combined DataFrame ---")
print(f"Total number of images found: {len(df)}")
print(f"Number of unique classes found: {df['Image Class'].nunique()}")

print("\nFirst 5 rows of the DataFrame:")
print(df.head())

print("\nLast 5 rows of the DataFrame:")
print(df.tail())

Starting to process folders...
--> Processing main folder: /content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2007
--> Processing main folder: /content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2013
--> Processing main folder: /content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2014

...Processing complete!

--- Combined DataFrame ---
Total number of images found: 581892
Number of unique classes found: 98

First 5 rows of the DataFrame:
        Image Class                                         Image Path
0         Bidulphia  /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
1         Bidulphia  /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
2         Bidulphia  /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
3  Asterionellopsis  /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
4  Asterionellopsis  /content/drive/MyDrive/Data/Plankton/WHOI_unzi...

Last 5 rows of the DataFrame:
       Image Class                                         Image Path
581887  Licmophora

In [None]:
# Display the frame's dimensions together with both columns; pandas shows
# long Series in abbreviated form.
(
    df.shape,
    df['Image Class'],
    df['Image Path'],
)

((581892, 2),
 0                Bidulphia
 1                Bidulphia
 2                Bidulphia
 3         Asterionellopsis
 4         Asterionellopsis
                 ...       
 581887          Licmophora
 581888          Licmophora
 581889          Licmophora
 581890          Licmophora
 581891          Licmophora
 Name: Image Class, Length: 581892, dtype: object,
 0         /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
 1         /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
 2         /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
 3         /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
 4         /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
                                 ...                        
 581887    /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
 581888    /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
 581889    /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
 581890    /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
 581891    /conte

In [None]:
import cv2
import numpy as np

## LOAD DATASET INTO IMAGE MATRICES

In [None]:
# Root folders to scan and the fixed size every image is resized to.
folder_paths = ['/content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2007', '/content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2013', '/content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2014']
IMAGE_WIDTH = 64
IMAGE_HEIGHT = 64


def _load_resized_gray(image_path, width, height):
    """Read one image as grayscale and resize it to ``width`` x ``height``.

    Args:
        image_path: Path of the image file to read.
        width: Target width in pixels.
        height: Target height in pixels.

    Returns:
        The resized image array produced by ``cv2.resize``.

    Raises:
        ValueError: If ``cv2.imread`` could not read the file.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread reports failure by returning None rather than raising, so
    # check it here to give a clear message instead of an opaque resize error.
    if img is None:
        raise ValueError("file could not be read as an image")
    # cv2.resize takes the target size as (width, height).
    return cv2.resize(img, (width, height))


# --- 1. Scan the folders and collect (class, path) for every file ---
all_images_data = []

print("--- Starting to Scan Image Folders ---")
for folder_path in folder_paths:
    print(f"--> Processing main folder: {folder_path}")

    # os.walk yields nothing for a missing path, so say so explicitly.
    if not os.path.isdir(folder_path):
        print(f"    Warning: folder not found, skipping: {folder_path}")
        continue

    for root, _subdirs, files in os.walk(folder_path):
        # The directory that directly contains a file is its class label.
        image_class = os.path.basename(root)

        for file_name in files:
            all_images_data.append({
                'Image Class': image_class,
                'Image Path': os.path.join(root, file_name)
            })

print("\n...Initial file scan complete!")

# Create a Pandas DataFrame from the collected path data
path_df = pd.DataFrame(all_images_data, columns=['Image Class', 'Image Path'])

if path_df.empty:
    print("\nError: No images were found. Please check your 'folder_paths'.")
else:
    print("\n--- Initial DataFrame with Image Paths ---")
    print(f"Total number of images found: {len(path_df)}")
    print(f"Number of unique classes found: {path_df['Image Class'].nunique()}")
    print("\nFirst 5 rows:")
    print(path_df.head())

    # --- 2. Create New DataFrame with Image Matrices ---
    # Each readable image is turned into a fixed-size grayscale matrix.

    processed_images_data = []
    total_images = len(path_df)
    failed_count = 0

    print(f"\n--- Processing Images into Numerical Matrices ({IMAGE_WIDTH}x{IMAGE_HEIGHT}) ---")

    # Iterate over the two columns directly; iterrows() builds a Series per
    # row, which is needlessly slow for hundreds of thousands of rows.
    labelled_paths = zip(path_df['Image Class'], path_df['Image Path'])
    for position, (image_class, image_path) in enumerate(labelled_paths, start=1):
        try:
            resized_img = _load_resized_gray(image_path, IMAGE_WIDTH, IMAGE_HEIGHT)
        except Exception as e:
            # Best effort: report the bad file and keep going.
            failed_count += 1
            print(f"\nError processing image {image_path}: {e}")
        else:
            processed_images_data.append({
                'Image Class': image_class,
                'Image Matrix': resized_img
            })

        # Print progress whether or not this particular image succeeded.
        if position % 10000 == 0:
            print(f"Processing image {position}/{total_images}...")

    print("\n\n...Image processing complete!")
    print(f"Images skipped because of errors: {failed_count}")

    # 3. Create the final DataFrame with numerical data
    image_matrix_df = pd.DataFrame(processed_images_data)

    # 4. Display information about the final DataFrame
    if not image_matrix_df.empty:
        print("\n--- Final DataFrame with Image Matrices ---")
        print(f"Total number of images processed: {len(image_matrix_df)}")
        print("\nFirst 5 rows of the new DataFrame:")
        print(image_matrix_df.head())

        # You can access a specific image matrix like this:
        print("\nExample of one image matrix from the DataFrame:")
        # .iloc[0] gets the first row, ['Image Matrix'] gets the matrix itself
        example_matrix = image_matrix_df.iloc[0]['Image Matrix']
        print(example_matrix)
        print(f"Shape of the matrix: {example_matrix.shape}")
    else:
        print("\nCould not create the final DataFrame as no images were processed successfully.")


--- Starting to Scan Image Folders ---
--> Processing main folder: /content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2007
--> Processing main folder: /content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2013
--> Processing main folder: /content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2014

...Initial file scan complete!

--- Initial DataFrame with Image Paths ---
Total number of images found: 552006
Number of unique classes found: 98

First 5 rows:
        Image Class                                         Image Path
0         Bidulphia  /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
1         Bidulphia  /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
2         Bidulphia  /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
3  Asterionellopsis  /content/drive/MyDrive/Data/Plankton/WHOI_unzi...
4  Asterionellopsis  /content/drive/MyDrive/Data/Plankton/WHOI_unzi...

--- Processing Images into Numerical Matrices (64x64) ---

Error processing image /content/drive/MyDrive

In [None]:
# Inspect the second root folder: list its immediate subdirectories and report
# how many regular files each one holds.
folder_path = folder_paths[1]
items = os.listdir(folder_path)

# Keep only the entries that are directories.
subfolders = []
for item in items:
    if os.path.isdir(os.path.join(folder_path, item)):
        subfolders.append(item)

separator = "-" * 50
print(f"Found {len(subfolders)} subfolders in {folder_path}:")
print(separator)

# Report each subfolder in alphabetical order.
for subfolder in sorted(subfolders):
    subfolder_path = os.path.join(folder_path, subfolder)

    try:
        # Count only regular files; nested directories are ignored.
        contents = os.listdir(subfolder_path)
        files = [name for name in contents if os.path.isfile(os.path.join(subfolder_path, name))]
        file_count = len(files)
        print(f"{subfolder}: {file_count} files")
    except PermissionError:
        print(f"{subfolder}: Permission denied")
    except Exception as e:
        print(f"{subfolder}: Error - {e}")

print(separator)
print(f"Total subfolders processed: {len(subfolders)}")

Found 65 subfolders in /content/drive/MyDrive/Data/Plankton/WHOI_unzipped_data/2014:
--------------------------------------------------
Akashiwo: 2 files
Amphidinium_sp: 66 files
Asterionellopsis: 128 files
Bacillaria: 0 files
Bidulphia: 0 files
Cerataulina: 412 files
Cerataulina_flagellate: 5 files
Ceratium: 6 files
Chaetoceros: 1871 files
Chaetoceros_didymus: 11 files
Chaetoceros_didymus_flagellate: 1 files
Chaetoceros_flagellate: 4 files
Chaetoceros_other: 9 files
Chaetoceros_pennate: 6 files
Chrysochromulina: 48 files
Ciliate_mix: 1074 files
Cochlodinium: 4 files
Corethron: 447 files
Coscinodiscus: 17 files
Cylindrotheca: 2345 files
DactFragCerataul: 175 files
Dactyliosolen: 532 files
Delphineis: 55 files
Dictyocha: 61 files
Didinium_sp: 7 files
Dinobryon: 588 files
Dinophysis: 13 files
Ditylum: 217 files
Ditylum_parasite: 22 files
Emiliania_huxleyi: 8 files
Ephemera: 23 files
Eucampia: 44 files
Euglena: 19 files
Euplotes_sp: 3 files
G_delicatula_detritus: 1 files
G_delicatula_exte