In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil
import random

def split_data(data_dir, train_dir, val_dir, split_ratio=0.8, seed=None):
    """
    Copy a class-per-folder dataset into training and validation folders.

    Every immediate sub-directory of ``data_dir`` is treated as one class.
    Its regular files are shuffled, the first ``split_ratio`` share is copied
    to ``train_dir/<class>`` and the remainder to ``val_dir/<class>``.

    Files already present in the destination folders are left in place, so
    re-running with a different shuffle can leave the same file in both sets;
    pass ``seed`` (or start from empty destinations) to avoid that.

    Parameters:
    - data_dir: Directory containing the dataset, structured with subdirectories for each class.
    - train_dir: Directory where the training data will be saved.
    - val_dir: Directory where the validation data will be saved.
    - split_ratio: Proportion of data to be used for training, between 0 and 1
      (default is 0.8 for an 80-20 split).
    - seed: Optional seed for the shuffle. With the default ``None`` the
      module-level ``random`` generator is used, exactly as before.

    Raises:
    - ValueError: if ``split_ratio`` is outside the range [0, 1].
    """
    if not 0.0 <= split_ratio <= 1.0:
        raise ValueError(f"split_ratio must be between 0 and 1, got {split_ratio!r}")

    # A private generator keeps a seeded split independent of other users of
    # the global random state; without a seed fall back to the global one.
    rng = random if seed is None else random.Random(seed)

    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(val_dir, exist_ok=True)

    # Sorted so the order in which classes consume random numbers is stable.
    for class_dir in sorted(os.listdir(data_dir)):
        class_path = os.path.join(data_dir, class_dir)
        if not os.path.isdir(class_path):
            continue

        # Only regular files can be copied with shutil.copy; nested folders
        # are skipped. Sorting first makes the shuffle depend only on the RNG,
        # not on the order the filesystem happens to list entries in.
        files = sorted(
            name for name in os.listdir(class_path)
            if os.path.isfile(os.path.join(class_path, name))
        )
        rng.shuffle(files)

        split_point = int(len(files) * split_ratio)
        train_files = files[:split_point]
        val_files = files[split_point:]

        train_class_dir = os.path.join(train_dir, class_dir)
        val_class_dir = os.path.join(val_dir, class_dir)
        os.makedirs(train_class_dir, exist_ok=True)
        os.makedirs(val_class_dir, exist_ok=True)

        for file in train_files:
            shutil.copy(os.path.join(class_path, file), os.path.join(train_class_dir, file))

        for file in val_files:
            shutil.copy(os.path.join(class_path, file), os.path.join(val_class_dir, file))

# Example usage
data_dir = '/content/drive/MyDrive/Datasets'
train_dir = '/content/drive/MyDrive/Capstone Dataset 80:20/training'
val_dir = '/content/drive/MyDrive/Capstone Dataset 80:20/validation'

# Seed the global random generator before splitting so that re-running this
# cell repeats the same shuffle (for an unchanged directory listing order)
# instead of copying a different selection on top of the previous one.
random.seed(42)
split_data(data_dir, train_dir, val_dir, split_ratio=0.8)

In [None]:
# import os
# import shutil
# import random

# def split_data(data_dir, train_dir, val_dir, split_ratio=0.8):
#     """
#     Splits data into training and validation sets, avoiding duplicate data.

#     Parameters:
#     - data_dir: Directory containing the dataset, structured with subdirectories for each class.
#     - train_dir: Directory where the training data will be saved.
#     - val_dir: Directory where the validation data will be saved.
#     - split_ratio: Proportion of data to be used for training (default is 0.8 for an 80-20 split).
#     """
#     if not os.path.exists(train_dir):
#         os.makedirs(train_dir)
#     if not os.path.exists(val_dir):
#         os.makedirs(val_dir)

#     # Dictionary to store file paths assigned to each set
#     train_files_assigned = {}
#     val_files_assigned = {}

#     for class_dir in os.listdir(data_dir):
#         class_path = os.path.join(data_dir, class_dir)

#         if os.path.isdir(class_path):
#             files = os.listdir(class_path)
#             random.shuffle(files)

#             split_point = int(len(files) * split_ratio)
#             train_files = files[:split_point]
#             val_files = files[split_point:]

#             train_class_dir = os.path.join(train_dir, class_dir)
#             val_class_dir = os.path.join(val_dir, class_dir)

#             if not os.path.exists(train_class_dir):
#                 os.makedirs(train_class_dir)
#             if not os.path.exists(val_class_dir):
#                 os.makedirs(val_class_dir)

#             for file in train_files:
#                 if file not in train_files_assigned:
#                     train_files_assigned[file] = True
#                     shutil.copy(os.path.join(class_path, file), os.path.join(train_class_dir, file))

#             for file in val_files:
#                 if file not in val_files_assigned:
#                     val_files_assigned[file] = True
#                     shutil.copy(os.path.join(class_path, file), os.path.join(val_class_dir, file))

# # Example usage
# data_dir = '/content/drive/MyDrive/Datasets'
# train_dir = '/content/drive/MyDrive/Capstone Dataset 80:20/training'
# val_dir = '/content/drive/MyDrive/Capstone Dataset 80:20/validation'
# split_data(data_dir, train_dir, val_dir, split_ratio=0.8)

In [None]:
shared_drive_path = '/content/drive/MyDrive/Capstone Dataset 80:20/training'
if not os.path.exists(shared_drive_path):
    print(f"The directory {shared_drive_path} does not exist.")
else:
    # Collect the entries of the training folder that are themselves folders.
    directories = []
    for entry in os.listdir(shared_drive_path):
        if os.path.isdir(os.path.join(shared_drive_path, entry)):
            directories.append(entry)

    # Report how many entries each of those folders contains.
    for directory in directories:
        num_items = len(os.listdir(os.path.join(shared_drive_path, directory)))
        print(f"Training Directory: {directory}, Number of items: {num_items}")

Training Directory: Cabai Merah besar, Number of items: 164
Training Directory: Cabai Hijau Besar, Number of items: 176
Training Directory: Cabai Keriting, Number of items: 159
Training Directory: Cabai Rawit, Number of items: 167
Training Directory: Cabai gendot (habanero), Number of items: 161
Training Directory: Cabai Jalapeno, Number of items: 161
Training Directory: Cabai Paprika, Number of items: 169


In [None]:
shared_drive_path = '/content/drive/MyDrive/Capstone Dataset 80:20/validation'
if not os.path.exists(shared_drive_path):
    print(f"The directory {shared_drive_path} does not exist.")
else:
    # Collect the entries of the validation folder that are themselves folders.
    directories = []
    for entry in os.listdir(shared_drive_path):
        if os.path.isdir(os.path.join(shared_drive_path, entry)):
            directories.append(entry)

    # Report how many entries each of those folders contains.
    for directory in directories:
        num_items = len(os.listdir(os.path.join(shared_drive_path, directory)))
        print(f"Validation Directory: {directory}, Number of items: {num_items}")

Validation Directory: Cabai Merah besar, Number of items: 42
Validation Directory: Cabai Hijau Besar, Number of items: 44
Validation Directory: Cabai Keriting, Number of items: 40
Validation Directory: Cabai Rawit, Number of items: 42
Validation Directory: Cabai gendot (habanero), Number of items: 41
Validation Directory: Cabai Jalapeno, Number of items: 41
Validation Directory: Cabai Paprika, Number of items: 43


In [None]:
import os
import hashlib

def find_duplicates(directory):
    """
    Report files under a directory whose contents repeat an earlier file.

    The tree is walked with os.walk and every file is hashed with SHA-256.
    The first file seen with a given digest is remembered; each later file
    with the same digest is reported.

    Parameters:
    - directory: Directory containing the dataset.

    Returns:
    - A list with the paths of the repeated files (the first file seen for
      each digest is not included).
    """
    first_seen = {}
    repeated = []

    for folder, _, names in os.walk(directory):
        for name in names:
            path = os.path.join(folder, name)

            # Digest of the complete file contents.
            with open(path, 'rb') as handle:
                digest = hashlib.sha256(handle.read()).hexdigest()

            # New content: remember where it was first found and move on.
            if digest not in first_seen:
                first_seen[digest] = path
                continue

            repeated.append(path)

    return repeated

# Example usage
train_dir = '/content/drive/MyDrive/Capstone Dataset 80:20/training'
val_dir = '/content/drive/MyDrive/Capstone Dataset 80:20/validation'


def index_files_by_hash(directory):
    """
    Group the files under a directory by the SHA-256 digest of their contents.

    Parameters:
    - directory: Directory to walk recursively.

    Returns:
    - A dict mapping each digest to the list of file paths with that content.
    """
    index = {}
    for root, _, files in os.walk(directory):
        for filename in files:
            file_path = os.path.join(root, filename)
            with open(file_path, 'rb') as f:
                file_hash = hashlib.sha256(f.read()).hexdigest()
            index.setdefault(file_hash, []).append(file_path)
    return index


# Find duplicates between the training and validation sets.
# The comparison has to be on file *content*: paths from the two directory
# trees can never be equal, so intersecting path sets always comes back empty.
print("Duplicates between training and validation sets:")
train_files = index_files_by_hash(train_dir)
val_files = index_files_by_hash(val_dir)

common_duplicates = set(train_files).intersection(val_files)
if common_duplicates:
    for file_hash in sorted(common_duplicates):
        for train_file in train_files[file_hash]:
            for val_file in val_files[file_hash]:
                print(f"Training file: {train_file}, Validation file: {val_file}")
else:
    print("No duplicates found between training and validation sets.")

Duplicates between training and validation sets:
No duplicates found between training and validation sets.


In [None]:
import hashlib
import os

def calculate_file_hash(file_path, chunk_size=1024 * 1024):
    """
    Calculate the SHA-256 hash of a file.

    The file is read in fixed-size chunks and fed to the hash incrementally,
    so memory use does not grow with the file size. The resulting digest is
    the same as hashing the whole file contents at once.

    Parameters:
    - file_path: Path to the file.
    - chunk_size: Number of bytes to read per step (default 1 MiB).

    Returns:
    - The SHA-256 hash of the file as a hexadecimal string.

    Raises:
    - ValueError: if ``chunk_size`` is not a positive number.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")

    digest = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # read() returns b'' at end of file, which ends the iteration.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

def find_duplicates(train_dir, val_dir):
    """
    Find duplicate files between training and validation sets.

    Files are compared by content hash (via calculate_file_hash), not by
    name. Every path is kept per hash, so when one set holds several
    identical copies, each (training, validation) combination is reported
    rather than only the last copy that was walked.

    Parameters:
    - train_dir: Directory containing the training data.
    - val_dir: Directory containing the validation data.

    Returns:
    - A list of (training_file_path, validation_file_path) tuples, one per
      pair of files with identical content.
    """
    def _index_by_hash(directory):
        # Map each content hash to every path under `directory` that has it.
        index = {}
        for root, _, files in os.walk(directory):
            for filename in files:
                file_path = os.path.join(root, filename)
                index.setdefault(calculate_file_hash(file_path), []).append(file_path)
        return index

    train_hashes = _index_by_hash(train_dir)
    val_hashes = _index_by_hash(val_dir)

    # Pair up every training file with every validation file sharing its hash.
    duplicates = []
    for file_hash, train_paths in train_hashes.items():
        val_paths = val_hashes.get(file_hash, ())
        for train_path in train_paths:
            for val_path in val_paths:
                duplicates.append((train_path, val_path))

    return duplicates

# Example usage: compare the two split folders and list any file whose
# content appears in both.
train_dir = '/content/drive/MyDrive/Capstone Dataset 80:20/training'
val_dir = '/content/drive/MyDrive/Capstone Dataset 80:20/validation'

duplicates = find_duplicates(train_dir, val_dir)
if not duplicates:
    print("No duplicates found between training and validation sets.")
else:
    print("Duplicates found between training and validation sets:")
    for pair in duplicates:
        train_file, val_file = pair
        print(f"Training file: {train_file}, Validation file: {val_file}")

Duplicates found between training and validation sets:
Training file: /content/drive/MyDrive/Capstone Dataset 80:20/training/Cabai Keriting/cabe-keriting-hijau-1-SESA_1-removebg-preview (1).jpg, Validation file: /content/drive/MyDrive/Capstone Dataset 80:20/validation/Cabai Keriting/cabe-keriting-hijau-1-SESA_1-removebg-preview.jpg
