# Data Preparation

In [2]:
import random
import shutil
import os
import cv2

from tqdm.notebook import tqdm 

In [4]:
!pip install -U --no-cache-dir gdown --pre

Collecting gdown
  Downloading gdown-4.6.4-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.5.1
    Uninstalling gdown-4.5.1:
      Successfully uninstalled gdown-4.5.1
Successfully installed gdown-4.6.4
[0m

### Download the project data
> i.e. task2_data
>
> I manually extracted the zip folder into my working directory. 

In [5]:
!gdown --id '1iU0qE7sM_AvdQnh0emkbcwwh2d3pE5TG'

Downloading...
From: https://drive.google.com/uc?id=1iU0qE7sM_AvdQnh0emkbcwwh2d3pE5TG
To: /notebooks/task2_data.zip
100%|█████████████████████████████████████████| 201M/201M [00:01<00:00, 109MB/s]


### Code to create the training data which consists of images and their annotations


> The `training` folder is said within my current working directory. 

In [6]:
## prepare data for yolo model
def prep_data(csv_file, images):
    
    names = { 1:'longitudinal_high',2:'longitudinal_low',3:'longitudinal_medium', 
           4:'grass', 5:'patch_high', 6:'manhole_high', 7:'transverse_high',
           8:'transverse_low', 9:'transverse_medium', 10:'diag_high', 
           11:'diag_low', 12:'diag_medium', 13:'alligator_high', 
           14:'alligator_low', 15:'alligator_medium',  16:'block_low', 
           17:'block_high', 18:'block_medium'}


    dst_imgs_path = 'training/images'
    dst_lbls_path = 'training/labels'
    
    if not os.path.isdir('training/images'):
        os.makedirs(dst_imgs_path)
        os.makedirs(dst_lbls_path)
        
    
    df = pd.read_csv(csv_file)
    img_ids = df['img_name'].unique()
    
    for img in img_ids:
        df_cur = df[df['img_name'] == img]
        full_img_path = os.path.join(images,img)
        frame = cv2.imread(full_img_path)
        height,width,chan = frame.shape
        for data in df_cur.values.tolist():
            cimg,x1,y1,x2,y2 = data[0:-2]
            ccolor = data[-2] + '_' + data[-1]
            value = [i for i in names if names[i]==ccolor]
            cls = value[0]
            dw = 1. / width
            dh = 1. / height
            x = (x1 + x2) / 2.0
            y = (y1 + y2) / 2.0
            w = x2 - x1
            h = y2 - y1
            x = x * dw
            w = w * dw
            y = y * dh
            h = h * dh

            file_path_img = os.path.join(dst_lbls_path, img)
            filename, file_extension = os.path.splitext(file_path_img)
        # print (filename,file_extension)
            with open(file_path_img.replace(file_extension,'.txt'), 'a+') as f:
                f.write(' '.join([str(int(cls)), str(float(x)), str(float(y)), str(float(w)), str(float(h))]))
                f.write('\n')

        shutil.copy(full_img_path,os.path.join(dst_imgs_path, img))

    return df

In [None]:
## converts annotations into yolo format
## creates new folder - training data
csv_file = 'task2_data/annotations.csv'
images = 'task2_data/images'

# data = prep_data(csv_file, images)
# data.head()

## Split the Dataset into Training and Validation 

In [9]:
def split_dataset(path_to_dataset, split_ratio=0.8):
    
    # Set the path to the training folder and the names of the train and val folders
    training_folder = path_to_dataset
    train_folder = 'train_data'
    val_folder = 'val_data'

    # Set the train/val split ratio
    split_ratio = split_ratio

    # Create the train and val folders if they don't exist
    if not os.path.exists(train_folder):
        os.makedirs(os.path.join(train_folder, 'images'))
        os.makedirs(os.path.join(train_folder, 'labels'))

    if not os.path.exists(val_folder):
        os.makedirs(os.path.join(val_folder, 'images'))
        os.makedirs(os.path.join(val_folder, 'labels'))

    # Get the list of image and label files in the training folder
    image_files = os.listdir(os.path.join(training_folder, 'images'))
    label_files = os.listdir(os.path.join(training_folder, 'labels'))

    # Get the number of files to put in the train and val sets
    num_files = len(image_files)
    num_train_files = int(num_files * split_ratio)
    num_val_files = num_files - num_train_files

    random.shuffle(image_files)

    for i in range(num_train_files):
        image_file = image_files[i]
        label_file = image_file[:-4] + '.txt'
        shutil.copy(os.path.join(training_folder, 'images', image_file),
                    os.path.join(train_folder, 'images', image_file))
        shutil.copy(os.path.join(training_folder, 'labels', label_file),
                    os.path.join(train_folder, 'labels', label_file))

    # Copy the remaining images and labels to the val set
    for i in range(num_train_files, num_files):
        image_file = image_files[i]
        label_file = image_file[:-4] + '.txt'
        shutil.copy(os.path.join(training_folder, 'images', image_file),
                    os.path.join(val_folder, 'images', image_file))
        shutil.copy(os.path.join(training_folder, 'labels', label_file),
                    os.path.join(val_folder, 'labels', label_file))

In [None]:
# Path to the main data to be splitted into train and validation
path_to_dataset = 'training'
split_dataset(path_to_dataset=path_to_dataset)

### Test whether our split dataset function is working 

In [8]:
train_data = os.listdir(os.path.join('train_data', 'images'))
train_lbl = os.listdir(os.path.join('train_data', 'labels'))

print('Number of images in the train_data: ', len(train_data))
print('Number of labels in the train_data: ', len(train_lbl))

val_data = os.listdir(os.path.join('val_data', 'images'))
val_lbl = os.listdir(os.path.join('val_data', 'labels'))

print('Number of images in the val_data: ', len(val_data))
print('Number of labels in the val_data: ', len(val_lbl))

Number of images in the train_data:  160
Number of labels in the train_data:  160
Number of images in the val_data:  41
Number of labels in the val_data:  41


In [10]:
## Download test data
!gdown --id '11iKT_JlR5OPhLLdMtP1wkxNQ-fz6_who'
!unzip 'test.zip' -d './'

Downloading...
From: https://drive.google.com/uc?id=11iKT_JlR5OPhLLdMtP1wkxNQ-fz6_who
To: /notebooks/test.zip
100%|██████████████████████████████████████| 99.8M/99.8M [00:01<00:00, 77.6MB/s]
Archive:  test.zip
  inflating: ./images/4 (30).JPG     
  inflating: ./images/2 (15).JPG     
  inflating: ./images/3 (3).JPG      
  inflating: ./images/4 (1).JPG      
  inflating: ./images/3 (16).JPG     
  inflating: ./images/2 (2).JPG      
  inflating: ./images/4 (32).JPG     
  inflating: ./images/2 (11).JPG     
  inflating: ./images/3 (6).JPG      
  inflating: ./images/4 (16).JPG     
  inflating: ./images/2 (9).JPG      
  inflating: ./images/4 (12).JPG     
  inflating: ./images/3 (2).JPG      
  inflating: ./images/4 (35).JPG     
  inflating: ./images/4 (21).JPG     
  inflating: ./images/4 (25).JPG     
  inflating: ./images/4 (20).JPG     
  inflating: ./images/4 (19).JPG     
  inflating: ./images/2 (6).JPG      
  inflating: ./images/4 (31).JPG     
  inflating: ./images/4 (2).JP

In [11]:
len(os.listdir('/notebooks/test/images'))

104

In [None]:
#!git clone 'https://github.com/UM-Titan/DSPS23.git'

In [None]:
#!pip install -r '/notebooks/DSPS23/requirements.txt'   