# Image Recognition Project - Structural Defect Recognition
---------------------------------------------------------------
## Data Collection

### Section Objectives
 - Find a relevant dataset on Kaggle
 - Collect the data
 - Preprocess the data, checking for outlier images or irrelevant files
 - Divide the dataset into three subsets (train, validation and test) at a ratio of 0.7, 0.1 and 0.2 respectively
 

---------------------------------------------------------------

### Importing Packages

In [None]:
%pip install -r /workspaces/ML_Project_Image_Recognition/requirements.txt --silent

In [None]:
import numpy
import os
import random
import shutil

### Setting Working Directory

In [None]:
# Capture the current working directory; the bare name on the last line makes
# the notebook display it as the cell's output.
current_dir = os.getcwd()
current_dir

In [None]:
# Make sure the project folder exists and report which case applied.
# NOTE(review): this path has no leading slash, so it is resolved against the
# current working directory; other cells in this notebook use the absolute
# '/workspaces/ML_Project_Image_Recognition' - confirm which one is intended.
directory = 'workspaces/ML_Project_Image_Recognition'

if os.path.exists(directory):
    print(f"Directory '{directory}' already exists.")
else:
    os.makedirs(directory)
    print(f"Directory '{directory}' created.")


In [None]:
# Change the process working directory to the project folder. The path is
# relative, so it is resolved against whatever the working directory is when
# this cell runs.
os.chdir('workspaces/ML_Project_Image_Recognition')
print("This is your set Working Directory")

### Installing Kaggle


In [None]:
%pip install kaggle==1.5.12 --silent

In [None]:
# Point KAGGLE_CONFIG_DIR at the project folder, which is where the kaggle.json
# referenced on the next line lives.
os.environ['KAGGLE_CONFIG_DIR'] = '/workspaces/ML_Project_Image_Recognition'
# Restrict kaggle.json to owner read/write (mode 600).
!chmod 600 /workspaces/ML_Project_Image_Recognition/kaggle.json

In [None]:
# Create the dataset folder, including any missing parent folders.
# exist_ok=True makes this a no-op when the folder is already present, so the
# message below is printed in either case.
new_destination_folder = "/workspaces/ML_Project_Image_Recognition/inputs/cracks_dataset_new"
os.makedirs(new_destination_folder, exist_ok=True)
print(f"Created new folder: {new_destination_folder}")


Pulling the dataset from Kaggle: `aniruddhsharma/structural-defects-network-concrete-crack-images`

In [None]:
# Kaggle dataset identifier in "<owner>/<dataset>" form.
KaggleDatasetPath = "aniruddhsharma/structural-defects-network-concrete-crack-images"
# Folder the archive is downloaded into; created here if it does not exist yet.
DestinationFolder = "/workspaces/ML_Project_Image_Recognition/inputs/cracks_dataset_new"   
os.makedirs(DestinationFolder, exist_ok=True)
# Shell out to the Kaggle CLI; the {…} placeholders are filled in from the
# Python variables above before the command runs.
! kaggle datasets download -d {KaggleDatasetPath} -p {DestinationFolder}

In [None]:
# Extract the archive if it was saved as "download.zip", then delete it;
# otherwise list what the destination folder actually contains.
# zipfile is imported here because this cell runs before the notebook's other
# `import zipfile`; without it the extraction branch raises NameError.
import zipfile

zip_file_path = os.path.join(DestinationFolder, 'download.zip')
if os.path.exists(zip_file_path):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(DestinationFolder)
    # The archive is no longer needed once its contents are on disk.
    os.remove(zip_file_path)
else:
    print(f"File not found: {zip_file_path}")
    print("Listing files in the destination folder:")
    print(os.listdir(DestinationFolder))


In [None]:
import zipfile

# Extract the archive that the Kaggle download cell is expected to have saved
# under the dataset's name, then delete it.
# The existence check keeps the cell re-runnable: the archive is removed after
# a successful extraction, so a second run would otherwise raise
# FileNotFoundError.
dataset_zip_path = os.path.join(
    DestinationFolder,
    'structural-defects-network-concrete-crack-images.zip',
)

if os.path.exists(dataset_zip_path):
    with zipfile.ZipFile(dataset_zip_path, 'r') as zip_ref:
        zip_ref.extractall(DestinationFolder)
    os.remove(dataset_zip_path)
else:
    print(f"File not found: {dataset_zip_path}")

---------------------------------------------------------------

## Preparing Data

### Data Cleaning
Checking for and removing any non-images from the downloaded dataset. 

In [None]:
def remove_non_image_file(my_data_dir, image_extensions=('.jpg', '.jpeg', '.png')):
    """Delete every non-image file from a two-level dataset tree.

    The tree is walked as ``my_data_dir/<category>/<class>/<file>``. Inside
    each class folder, regular files whose name does not end with one of
    ``image_extensions`` (case-insensitive) are deleted. Entries that are not
    regular files, such as sub-directories, are left untouched, because
    ``os.remove`` cannot delete a directory and would abort the clean-up.
    Files sitting directly in ``my_data_dir`` or in a category folder are
    ignored.

    Args:
        my_data_dir: Root folder of the dataset.
        image_extensions: A filename suffix, or an iterable of suffixes, that
            identify images. Defaults to ``('.jpg', '.jpeg', '.png')``.

    Returns:
        None. One summary line per class folder is printed.
    """
    # str.endswith needs a str or a tuple; normalise so lists and single
    # strings work too, and lower-case to match the lower-cased file name.
    if isinstance(image_extensions, str):
        image_extensions = (image_extensions,)
    suffixes = tuple(ext.lower() for ext in image_extensions)

    for category in os.listdir(my_data_dir):
        category_path = os.path.join(my_data_dir, category)
        if not os.path.isdir(category_path):
            continue

        for class_name in os.listdir(category_path):
            class_path = os.path.join(category_path, class_name)
            if not os.path.isdir(class_path):
                continue

            total_images = 0
            removed_files = 0

            for file_name in os.listdir(class_path):
                file_path = os.path.join(class_path, file_name)
                if not os.path.isfile(file_path):
                    # Not a regular file (e.g. a nested folder): skip it.
                    continue
                if file_name.lower().endswith(suffixes):
                    total_images += 1
                else:
                    os.remove(file_path)
                    removed_files += 1

            print(f"{category}/{class_name} - {total_images} images, {removed_files} non-image files removed")


In [None]:
# Clean the downloaded dataset in place: deletes non-image files from every
# <category>/<class> folder under the dataset root.
remove_non_image_file(my_data_dir='/workspaces/ML_Project_Image_Recognition/inputs/cracks_dataset_new')

fix

## Dividing Dataset
As mentioned previously, the dataset must be split into three partitions (a training set, a validation set and a testing set) in the ratio 0.7, 0.1 and 0.2 respectively.

In [None]:
def split_train_validation_test_images(my_data_dir, train_set_ratio, validation_set_ratio, test_set_ratio, seed=None):
    """Copy the images of each class folder into train/val/test folders.

    For every class sub-folder of ``my_data_dir`` the ``.jpg``/``.jpeg``/
    ``.png`` files are shuffled and copied (the originals stay in place) to
    ``<parent of my_data_dir>/<split>/<name of my_data_dir>/<class>/`` where
    ``<split>`` is ``train``, ``val`` or ``test``. The train and validation
    counts are rounded down; every remaining image goes to the test split.

    Args:
        my_data_dir: Folder whose immediate sub-folders are the classes.
        train_set_ratio: Fraction of each class copied to ``train``.
        validation_set_ratio: Fraction of each class copied to ``val``.
        test_set_ratio: Fraction for ``test``. Only used to validate the sum;
            the test split actually receives whatever is left over.
        seed: Optional seed for a reproducible shuffle. With the default
            ``None`` the module-level ``random`` state is used.

    Returns:
        None. If the ratios do not sum to 1.0 a message is printed and nothing
        is copied; otherwise one summary line per class is printed.
    """
    # Function-scope import: math is not among the notebook's imports.
    import math

    # Compare with a tolerance: an exact float check rejects valid inputs,
    # e.g. 0.7 + 0.2 + 0.1 evaluates to 0.9999999999999999.
    ratio_sum = train_set_ratio + validation_set_ratio + test_set_ratio
    if not math.isclose(ratio_sum, 1.0, abs_tol=1e-9):
        print("Ratios must sum to 1.0")
        return

    # Strip any trailing separator so basename/dirname see the folder itself.
    my_data_dir = os.path.normpath(my_data_dir)
    dataset_name = os.path.basename(my_data_dir)
    base_output_dir = os.path.dirname(my_data_dir)

    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    for class_name in os.listdir(my_data_dir):
        class_path = os.path.join(my_data_dir, class_name)
        if not os.path.isdir(class_path):
            continue

        # Sort first so a given seed yields the same split no matter what
        # order os.listdir returns the files in.
        images = sorted(
            img for img in os.listdir(class_path)
            if img.lower().endswith(('.jpg', '.jpeg', '.png'))
        )
        shuffle(images)

        n_total = len(images)
        n_train = int(n_total * train_set_ratio)
        n_val = int(n_total * validation_set_ratio)

        splits = {
            'train': images[:n_train],
            'val': images[n_train:n_train + n_val],
            'test': images[n_train + n_val:],
        }

        for split_name, split_images in splits.items():
            split_dir = os.path.join(base_output_dir, split_name, dataset_name, class_name)
            os.makedirs(split_dir, exist_ok=True)
            for img_name in split_images:
                shutil.copy2(
                    os.path.join(class_path, img_name),
                    os.path.join(split_dir, img_name),
                )

        print(f"{dataset_name} / {class_name}: {n_total} → {len(splits['train'])} train, {len(splits['val'])} val, {len(splits['test'])} test.")


In [None]:
# Split each surface-type folder of the dataset into train/val/test sets.
data_types = ['Walls', 'Pavements', 'Decks']
# Absolute path, identical to the folder the dataset was downloaded into and
# cleaned in. The former '../inputs/...' form was resolved against the current
# working directory, which an earlier cell changes, so it did not reliably
# point at the dataset.
base_data_dir = '/workspaces/ML_Project_Image_Recognition/inputs/cracks_dataset_new'

for dtype in data_types:
    full_path = os.path.join(base_data_dir, dtype)
    print(f"Checking: {full_path}")
    if os.path.exists(full_path):
        split_train_validation_test_images(
            my_data_dir=full_path,
            train_set_ratio=0.7,
            validation_set_ratio=0.1,
            test_set_ratio=0.2
        )
    else:
        print(f"Directory not found: {full_path}")
