## Carvana Image Masking: Image Segmentation with LoRA

* This file just creates the metadata files that are used for loading the data (the loading itself is done in the main project).

Dataset Source: https://www.kaggle.com/datasets/ipythonx/carvana-image-masking-png

#### Import Necessary Libraries

In [None]:
import os, sys, glob, shutil
os.environ['TOKENIZERS_PARALLELISM']='false'

import numpy as np
import pandas as pd

import sklearn
from sklearn.model_selection import train_test_split

from PIL import Image

Git LFS initialized.


#### Display Library Versions

In [None]:
# Report the interpreter and library versions this notebook ran with.
# Labels are right-justified to 18 characters so the values line up.
#
# `sys.version` starts with the version number followed by build details;
# taking the first whitespace-separated token keeps the complete number
# whatever its length (a fixed 6-character slice cuts "3.10.12" short and
# leaves a stray space after "3.9.7").
print("Python :".rjust(18), sys.version.split()[0])
print("NumPy :".rjust(18), np.__version__)
print("Pandas :".rjust(18), pd.__version__)
print("Scikit-Learn :".rjust(18), sklearn.__version__)

          Python : 3.9.12
           NumPy : 1.25.1
          Pandas : 2.0.3
    Scikit-Learn : 1.3.0
           Torch : 2.0.1
    Torch Vision : 0.15.2
    Transformers : 4.26.1
        Evaluate : 0.4.0
            PEFT : 0.3.0


#### Create & Save Metadata File

In [None]:
# Root folder that holds the raw `images/` and `masks/` folders.
parent_dir = "/Users/briandunn/Desktop/Image Segmentation Projects/Carvana Image Masking/data/"

# Gather Collection of images and masks.
# glob returns files in arbitrary order, so sort to make the run repeatable.
image_file_names = sorted(glob.glob(os.path.join(parent_dir, "images", "*.jpg")))
mask_file_names = sorted(glob.glob(os.path.join(parent_dir, "masks", "*.png")))

# Convert Collections to Pandas DataFrames.
# Each row stores the path relative to `parent_dir` plus `idx`, the file name
# without its extension, which is the key shared by an image and its mask.
# splitext only strips the final extension, so stems containing dots survive.
image_df = pd.DataFrame({
    'images': [f"images/{os.path.basename(path)}" for path in image_file_names],
    'idx': [os.path.splitext(os.path.basename(path))[0] for path in image_file_names],
})
mask_df = pd.DataFrame({
    'annotation': [f"masks/{os.path.basename(path)}" for path in mask_file_names],
    'idx': [os.path.splitext(os.path.basename(path))[0] for path in mask_file_names],
})

# Pair every image with its mask. The inner join keeps only complete pairs:
# a file without a partner would otherwise become a NaN cell and break the
# string handling in the following cells. `validate` rejects duplicate stems.
meta_df = image_df.merge(mask_df, on='idx', how='inner', validate='one_to_one')

unmatched_files = len(image_df) + len(mask_df) - 2 * len(meta_df)
if unmatched_files:
    print(f"Skipping {unmatched_files} file(s) without a matching image/mask pair.")

meta_df = meta_df.drop(columns=['idx'])

# Split Dataset into Training & Testing Datasets.
# A fixed seed makes the 80/20 split reproducible between runs.
train_meta_df, test_meta_df = train_test_split(meta_df,
                                               test_size=0.20,
                                               random_state=42)

train_meta_df = train_meta_df.reset_index(drop=True)
test_meta_df = test_meta_df.reset_index(drop=True)

# Create Training & Testing Directories.
# makedirs(exist_ok=True) lets the cell be re-run after the folders exist.
train_directory_location = os.path.join(parent_dir, "training")
test_directory_location = os.path.join(parent_dir, "testing")

for split_directory in (train_directory_location, test_directory_location):
    for sub_folder in ("images", "annotations"):
        os.makedirs(os.path.join(split_directory, sub_folder), exist_ok=True)

#### Move Files in Training Dataset

In [None]:
# Move every training image and its mask out of the flat `images/` and
# `masks/` folders into `training/images/` and `training/annotations/`.

# Bare file names (folder prefix removed) of the files in the training split.
training_images_to_relocate = train_meta_df['images'].apply(os.path.basename)
training_masks_to_relocate = train_meta_df['annotation'].apply(os.path.basename)

# (file names, source folder, destination folder inside `training/`)
training_relocations = [
    (training_images_to_relocate, "images", "images"),
    (training_masks_to_relocate, "masks", "annotations"),
]

for file_names, source_folder, target_folder in training_relocations:
    # Join path components properly instead of concatenating strings, so the
    # result does not depend on `parent_dir` ending with a separator.
    current_training_parent_dir = os.path.join(parent_dir, source_folder)
    new_training_parent_dir = os.path.join(parent_dir, "training", target_folder)

    for file_name in file_names:
        shutil.move(os.path.join(current_training_parent_dir, file_name),
                    os.path.join(new_training_parent_dir, file_name))

#### Move Files in Testing Dataset

In [None]:
# Move every testing image and its mask out of the flat `images/` and
# `masks/` folders into `testing/images/` and `testing/annotations/`.

# Bare file names (folder prefix removed) of the files in the testing split.
testing_images_to_relocate = test_meta_df['images'].apply(os.path.basename)
testing_masks_to_relocate = test_meta_df['annotation'].apply(os.path.basename)

# (file names, source folder, destination folder inside `testing/`)
testing_relocations = [
    (testing_images_to_relocate, "images", "images"),
    (testing_masks_to_relocate, "masks", "annotations"),
]

for file_names, source_folder, target_folder in testing_relocations:
    # Join path components properly instead of concatenating strings, so the
    # result does not depend on `parent_dir` ending with a separator.
    current_testing_parent_dir = os.path.join(parent_dir, source_folder)
    new_testing_parent_dir = os.path.join(parent_dir, "testing", target_folder)

    for file_name in file_names:
        shutil.move(os.path.join(current_testing_parent_dir, file_name),
                    os.path.join(new_testing_parent_dir, file_name))

#### Update Metadata Files With Correct Image Locations

In [None]:
# Root data folder; the paths written to the CSV files are relative to it.
new_parent_dir = "/Users/briandunn/Desktop/Image Segmentation Projects/Carvana Image Masking/data/"

# Rewrite both metadata tables so they point at the relocated files, then
# save each table inside its own split folder as `<split>/<split>.csv`.
for split_name, split_meta_df in (("training", train_meta_df),
                                  ("testing", test_meta_df)):
    # Remove current folder/prefix. basename drops any leading folders, so
    # running this cell a second time does not stack prefixes.
    image_names = split_meta_df['images'].apply(os.path.basename)
    mask_names = split_meta_df['annotation'].apply(os.path.basename)

    # The files were moved into `<split>/images/` and `<split>/annotations/`,
    # so the recorded paths must include those sub-folders to exist on disk.
    split_meta_df['images'] = f"{split_name}/images/" + image_names
    split_meta_df['annotation'] = f"{split_name}/annotations/" + mask_names

    # Save Metadata Files respectively. The row index is written as the
    # first (unnamed) column, which is the default for `to_csv`.
    split_meta_df.to_csv(os.path.join(new_parent_dir, split_name, f"{split_name}.csv"))