# Machine Learning to Identify Forams species and and whether the shell is cracked or not

## Finnegan Lab
## Data preprocessing

In [1]:
# Imports. Feel free to add or remove as necessary 

!pip install -Uq tensorflow-datasets==4.5.2
import pandas as pd
import numpy as np
np.random.seed(2022)
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import os, re, math
import keras
import shutil
from pathlib import Path
from PIL import Image
import tensorflow_datasets as tfds

In [2]:
#@title mount cloud
from google.colab import auth
auth.authenticate_user()
project_id = 'lucid-loader-347021'
!gcloud config set project {project_id}
!gsutil ls

Updated property [core/project].
gs://paleo-ml/


In [3]:
from google.colab import drive
drive.mount('/content/drive')
drive_dir_path ='drive/MyDrive/MV1012_SBB_images'
assert os.path.isdir(drive_dir_path), 'unable to find the drive. Please check your drive path above.'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Processing data and storing

Take data from its original form and crop/resize images as necessary. Store images with its corresponding label.

### Global default variable definitions

In [4]:
# Default to temporary Colab folder. WILL NOT BE SAVED.
dev_data_dir = './training_data_debug'
if not os.path.exists(dev_data_dir):
    os.mkdir(dev_data_dir)
# Directory where training data goes. The real one.
out_dir = os.path.join(drive_dir_path, 'ML_projects', 'Processed_data', 'forams')
train_out_dir = os.path.join(out_dir, 'train') 
val_out_dir = os.path.join(out_dir, 'val')
test_out_dir = os.path.join(out_dir, 'test')

### Function definitions

In [88]:
def create_training_data(from_file_path, labels_df, data_dir, override, verbose=False):
    """
    Takes a single image file FROM_FILE_PATH and crops and resizes it
     to create a training data image. Write that to the corresponding directory.
    """
    
    ## Extract identifying information from the sample ##
    # from_file_path == drive/MyDrive/MV1012_SBB_images/Box_Core_images/MV1012-BC-8_identify/MV1012-BC-8_obj01142_plane000.jpg

    # MV1012-BC-8_obj01142.jpg
    image_file_name = os.path.basename(from_file_path).replace('_plane000', '')

    # MV1012-BC-8_obj01142
    image_name = Path(image_file_name).stem

    # MV1012-BC-8 01142
    sample_name, object_number = re.match(r'(.+)_obj(\d+)', image_name).groups()
    object_number = int(object_number)
    
    # TODO: exit if sample_name and object_number now found within labels_df
    if not (sample_name, object_number) in labels_df.index:
        if verbose: print(f'skipped {(sample_name, object_number)} becuase it is not in labels_df')
        return

    # can get label after confirming existence
    species_label = labels_df.loc[(sample_name, object_number), 'species']
    dataset_type = 'train'
    if labels_df.loc[(sample_name, object_number), 'test']:
        dataset_type = 'test'
    elif labels_df.loc[(sample_name, object_number), 'val']:
        dataset_type = 'val'


    ## Create empty dir or exit if it already exists and we don't want to override.
    img_location = os.path.join(data_dir, dataset_type, image_file_name)
    print(img_location)
    if os.path.exists(img_location):
        if not override: 
            if verbose: print(f'skipped {image_file_name}')
            return
        os.remove(img_location)

    img = Image.open(from_file_path)
    img_data = np.asarray(img)
    label_size = 160
    no_label_img_data = img_data[:-label_size]

    ## Find columns where a majority of the summed pixel intensities is 0
    zero_pixels = no_label_img_data.sum(axis=(2)) == 0
    col_filter = zero_pixels.sum(axis=0) < zero_pixels.shape[0]/5 #Cols where less than 1/5 pixels are zeros
    cropped_image_data = no_label_img_data[:, col_filter, :]
    cropped_image = Image.fromarray(cropped_image_data)
    ## Convert to image again and resize
    size = 416 # input size, width and height of image 
    resized_image = cropped_image.resize((size, size))

    ## Store image in output directory
    # print(os.path.join(data_point_dir, f'{image_name}.jpg'))
    resized_image.save(img_location)
    if verbose: print(f'created ${image_name} in {species_label} as {dataset_type}')


In [89]:
def process_sample_dir(sample_dir, sample_name, labels_df, data_dir=dev_data_dir, override=False, verbose=True):
    '''
    Takes a directory full of images and loops over them 
    to process the images.
    '''
    for file_name in os.listdir(sample_dir):
        file_path = os.path.join(sample_dir, file_name)
        file_ext = os.path.splitext(file_name)[-1].lower()
        if file_ext == '.csv':
            pass
        elif file_ext == '.jpg':
            #TODO: possible make the name the classification? 
            create_training_data(file_path, labels_df, data_dir=data_dir,override=override, verbose=verbose) #TODO set training data dir instead of using default debug

In [99]:
def augment_data(aug_dir, num_copies=1, num_images=None):
    """
    DEPRECATED. Data augmentation will be done by keras 
    Augment data according to a predefined set of params. 
    """
    for class_dir in os.listdir(aug_dir):
        p = Augmentor.Pipeline(os.path.join(aug_dir, class_dir), output_directory='')

        class_path = os.path.join(aug_dir, class_dir)
        num_existing_images = len(os.listdir(class_path))
        if (num_existing_images < 1):
            continue

        p.skew(0.5, 0.3)
        p.random_distortion(0.5, 8, 8, 8)
        p.rotate_random_90(0.75)
        p.shear(0.5, max_shear_left=7, max_shear_right=7)
        p.flip_random(0.75)

        if num_images is None:
            p.process() # 1 for each
        p.sample(num_existing_images * num_copies)

    



In [100]:
def purge(train_dir, aug_only=True):
    print(f'purging all {"augmentation" if aug_only else "images"} in {train_dir}. Type "YES" to continue.')
    if input() != "YES":
        return None

    
    for class_dir in os.listdir(train_dir):
        print(f'starting {class_dir}')
        total_deleted_in_class = 0
        class_path = os.path.join(train_dir, class_dir)
        for file in os.listdir(class_path):
            if not aug_only or 'original' in file:
                total_deleted_in_class += 1
                file_path = os.path.join(class_path, file)
                try:
                    os.remove(file_path)
                except:
                    shutil.rmtree(file_path)
        print(f'finished {class_dir} and removed {total_deleted_in_class}')


### Run processing

In [107]:
# Collect label/metadata information from master CSV

labels_csv_path = os.path.join(drive_dir_path, 'Final_Diversity_CSVs', 'ML_master2.csv')
assert os.path.isfile(labels_csv_path), 'Unable to find master CSV document!'
labels_df = pd.read_csv(labels_csv_path)
labels_df = labels_df.set_index(['sample_name', 'object_num'])
labels_df.sort_index()
# ACCESS via: labels_df.loc[('MV1012-BC-2', 1)]
labels_df = labels_df.dropna(how='any')

# Randomly decide if it will be in the train, val or test section.
# 0.8 training, 0.2 test
# 0.8 * 0.8 = 0.64 train, 0.16 val
sample_ind = np.random.random_sample(labels_df.shape[0])
labels_df['test'] = sample_ind > 0.8
labels_df['val'] = (sample_ind <= 0.8) & (sample_ind > 0.64)
labels_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Broken,species,age,test,val
sample_name,object_num,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MV1012-BC-12,1.0,unbroken,NOT FORAM,1994.0,False,False
MV1012-BC-12,2.0,unbroken,suggrunda eckisi,1994.0,True,False
MV1012-BC-12,3.0,broken,bulimina exilis,1994.0,False,True
MV1012-BC-12,4.0,unbroken,nonionella stella,1994.0,True,False
MV1012-BC-12,5.0,unbroken,NOT FORAM,1994.0,False,True


In [None]:
#Run data processing

assert os.path.isdir(out_dir), 'unable to find the output for processed training data'

imgs_dir = os.path.join(drive_dir_path, 'Box_Core_images')


for sample_dir in os.listdir(imgs_dir):
    sample_dir_path = os.path.join(imgs_dir, sample_dir)
    sample_name = re.match(r'(.+)_identify', sample_dir).groups()[0]
    # process_sample_dir(sample_dir_path, sample_name, labels_df, data_dir=dev_data_dir, override=False)
    process_sample_dir(sample_dir_path, sample_name, labels_df, data_dir=out_dir, override=False)


    
 

## Data augmentation

In [82]:
# Iterate over all training directories to create 
debug_aug = False


aug_dir = os.path.join(dev_data_dir, 'train') if debug_aug else train_out_dir
aug_dir

'drive/MyDrive/MV1012_SBB_images/ML_projects/Processed_data/forams/train'

In [None]:
#Clear previous aug. Deletes all non images too!
purge(aug_dir, aug_only=True)

In [None]:
augment_data(aug_dir, num_copies=5)

## Create tensorflow dataset and push to google cloud

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import image_dataset_from_directory


In [6]:
#@title constants
image_size = (416, 416)

input_shape = (*image_size, 3)
drive_prefix = '/content/drive/MyDrive/MV1012_SBB_images/ML_projects/'
# train_data_dir = f'{drive_prefix}/Processed_data/train'
# val_data_dir = f'{drive_prefix}/Processed_data/val'
# test_data_dir = f'{drive_prefix}/Processed_data/test'
tfds_dir = f'{drive_prefix}/Processed_data/forams'

logdir_base = f'{drive_prefix}/Training_logs/'
trained_model_dir_base = f'{drive_prefix}/Trained_models/'

batch_size = 128 * 2
google_cloud_storage = 'gs://paleo-ml'

In [None]:
#@title import datasets
training_set = image_dataset_from_directory(
    train_data_dir,
    labels="inferred",
    label_mode="categorical",
    image_size=image_size,
    shuffle=True,
    seed=0,
    batch_size=batch_size,
)
validation_set = image_dataset_from_directory(
    val_data_dir,
    labels="inferred",
    label_mode="categorical",
    image_size=image_size,
    shuffle=True,
    seed=0,
    batch_size=batch_size
)
testing_set = image_dataset_from_directory(
    test_data_dir,
    labels="inferred",
    label_mode="categorical",
    image_size=image_size,
    shuffle=True,
    seed=0,
    batch_size=batch_size
)

Found 43015 files belonging to 65 classes.
Found 1553 files belonging to 65 classes.
Found 1939 files belonging to 65 classes.


In [12]:

# TODO create the respective sets by IMPORTING the image dataset!
os.chdir(tfds_dir)
# print(os.getcwd())
import forams
ds = tfds.load('forams', split='train')  # `my_dataset` registered

In [22]:

tf.data.experimental.save(training_set, f'{google_cloud_storage}/training{batch_size}')
tf.data.experimental.save(validation_set, f'{google_cloud_storage}/validation{batch_size}')
tf.data.experimental.save(testing_set, f'{google_cloud_storage}/testing{batch_size}')

NameError: ignored