In [2]:
import os
import cv2
import pandas as pd
import numpy as np
import shutil  

In [5]:
# Root folders of the three dataset splits; their sub-folder names are used
# as the class labels further down.
train_folder = '/kaggle/input/datascience-dataset/original/train'
val_folder = '/kaggle/input/datascience-dataset/original/val'
test_folder = '/kaggle/input/datascience-dataset/original/test'

# Alphabetically ordered entries of each split folder.
train_dir_list = sorted(os.listdir(train_folder))
val_dir_list = sorted(os.listdir(val_folder))
test_dir_list = sorted(os.listdir(test_folder))

# Show the three listings one per line so they can be compared by eye.
print(train_dir_list, val_dir_list, test_dir_list, sep='\n')

['leaf_waste', 'metal', 'paper', 'plastic', 'wood_waste']
['leaf_waste', 'metal', 'paper', 'plastic', 'wood_waste']
['leaf_waste', 'metal', 'paper', 'plastic', 'wood_waste']


In [6]:
# Report how many entries each training class folder contains.
for class_name in train_dir_list:
    class_dir = train_folder + '/' + class_name
    file_count = len(os.listdir(class_dir))
    print('class:', class_name, 'No. of files:', file_count)

class: leaf_waste No. of files: 1000
class: metal No. of files: 1000
class: paper No. of files: 835
class: plastic No. of files: 692
class: wood_waste No. of files: 568


In [7]:
class_list = train_dir_list

# Derive the one-hot rows from the number of classes instead of a hand-written
# 5x5 table, so the mapping stays valid if a class is added or removed.
num_classes = len(class_list)
one_hot_list = [[int(row == col) for col in range(num_classes)]
                for row in range(num_classes)]

# Map each class name to its one-hot vector; the position of the 1 is the
# class's index in class_list.
class_dict = dict(zip(class_list, one_hot_list))

# Print the resulting dictionary
print(class_dict)

{'leaf_waste': [1, 0, 0, 0, 0], 'metal': [0, 1, 0, 0, 0], 'paper': [0, 0, 1, 0, 0], 'plastic': [0, 0, 0, 1, 0], 'wood_waste': [0, 0, 0, 0, 1]}


In [8]:
def augment(img, angle=45):
    """Return three augmented variants of ``img``.

    Args:
        img: Image array to augment; it is passed unchanged to each helper.
        angle: Rotation in degrees used for the first variant. Defaults to
            45, the value that used to be hard-coded.

    Returns:
        Tuple ``(rotated, horizontally_flipped, vertically_flipped)``.
    """
    return rotate_image(img, angle), horizontal_flip(img), vertical_flip(img)

def rotate_image(image, angle):
    """Rotate ``image`` by ``angle`` degrees about its centre.

    The output has the same width and height as the input and no scaling
    is applied (scale factor 1.0).
    """
    rows, cols = image.shape[:2]
    centre = (cols / 2, rows / 2)
    transform = cv2.getRotationMatrix2D(centre, angle, 1.0)
    return cv2.warpAffine(image, transform, (cols, rows))

def horizontal_flip(image):
    """Return ``image`` flipped with OpenCV flip code 1."""
    return cv2.flip(image, 1)

def vertical_flip(image):
    """Return ``image`` flipped with OpenCV flip code 0."""
    return cv2.flip(image, 0)

In [9]:
# Making output directories: mirror the class sub-folder layout of every split
# under /kaggle/working. exist_ok=True keeps this cell re-runnable; without it
# a second execution stops with FileExistsError.
for split_name, source_folder in (('train', train_folder),
                                  ('val', val_folder),
                                  ('test', test_folder)):
    output_root = '/kaggle/working/' + split_name
    os.makedirs(output_root, exist_ok=True)
    for PATH in os.listdir(source_folder):
        os.makedirs(os.path.join(output_root, PATH), exist_ok=True)

In [10]:
# Target size (in pixels) that every image is resized to.
IMG_WIDTH = IMG_HEIGHT = 224

# Module-level accumulators. create_dataset assigns its own local lists of the
# same names, so these two stay empty unless other code fills them.
paths, labels = [], []

# Training-only oversampling: for these classes every N-th image (by position
# in the sorted folder listing) also gets three augmented copies.
_AUGMENT_EVERY_NTH = {'paper': 15, 'plastic': 6, 'wood_waste': 4}


def _save_image(out_path, image):
    """Write ``image`` to ``out_path``; raise instead of failing silently."""
    # cv2.imwrite reports failure through its boolean return value only.
    if not cv2.imwrite(out_path, image):
        raise OSError('Failed to write image: ' + out_path)


def create_dataset(folder,typee):
    '''
    Resize every image of a split, save it as JPEG and list what was saved.

    TAKES image folder and typee (train, test or val).

    Each sub-folder of ``folder`` is treated as one class. Images are resized
    to IMG_WIDTH x IMG_HEIGHT and written to
    ``/kaggle/working/<typee>/<class>/<typee>_<class>_<index>.jpg``. When
    ``folder`` is ``train_folder``, the classes in ``_AUGMENT_EVERY_NTH`` also
    get ``_1`` (rotated), ``_2`` (horizontal flip) and ``_3`` (vertical flip)
    copies for every N-th 3-channel image.

    Entries of ``folder`` that are not directories and files OpenCV cannot
    decode are skipped (the latter with a printed notice). Listings are sorted
    so file numbering is the same on every run.

    RETURNS a DataFrame with columns ``paths`` and ``labels``. The recorded
    paths point below ``/kaggle/input/datascience-dataset/processed/`` rather
    than to the files written here -- presumably the location the output has
    after being re-uploaded as a dataset; confirm before relying on it.

    RAISES OSError if an image cannot be written.
    '''
    output_root = '/kaggle/working/' + typee + '/'
    listed_root = '/kaggle/input/datascience-dataset/processed/' + typee + '/'
    paths = []
    labels = []

    # Loop through subfolders in the specified folder
    for class_name in sorted(os.listdir(folder)):
        class_dir = os.path.join(folder, class_name)
        if not os.path.isdir(class_dir):
            # Stray files next to the class folders are not classes.
            continue
        os.makedirs(output_root + class_name, exist_ok=True)

        # Loop through files in the subfolder
        for index, file in enumerate(sorted(os.listdir(class_dir))):
            file_name_no_ext = typee + '_' + class_name + '_' + str(index)
            image_path = os.path.join(class_dir, file)

            # IMREAD_ANYCOLOR is the flag the previous value (cv2.COLOR_BGR2RGB,
            # a cvtColor code equal to 4) actually selected. imread yields BGR
            # data and imwrite expects BGR, so no channel swap is needed.
            image = cv2.imread(image_path, cv2.IMREAD_ANYCOLOR)
            if image is None:
                print('Skipping unreadable file:', image_path)
                continue

            # cv2.resize takes the target size as (width, height).
            image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT), interpolation=cv2.INTER_AREA)

            # The resized image is always saved; augmented copies follow it.
            variants = [('', image)]
            every_nth = _AUGMENT_EVERY_NTH.get(class_name)
            # Augment only 3-channel training images of the selected classes.
            if (len(image.shape) == 3 and folder == train_folder
                    and every_nth is not None and index % every_nth == 0):
                variants.extend(
                    ('_' + str(number), augmented)
                    for number, augmented in enumerate(augment(image), start=1)
                )

            for suffix, variant in variants:
                relative_name = class_name + '/' + file_name_no_ext + suffix + '.jpg'
                _save_image(output_root + relative_name, variant)
                paths.append(listed_root + relative_name)
                labels.append(class_name)

    return pd.DataFrame({'paths':paths, 'labels':labels})

In [11]:
# Preprocess the test split and keep its path/label table.
df_test = create_dataset(folder=test_folder, typee='test')

In [12]:

from IPython.display import FileLink

# Folder holding the processed test images; the archive is written next to it
# under the same name with a .zip extension.
directory_path = '/kaggle/working/test'  # Replace with the actual directory path

# Zip the folder's contents.
shutil.make_archive(base_name=directory_path, format='zip', root_dir=directory_path)

# Show a link for downloading the zip file.
zip_link = FileLink(directory_path + '.zip')
display(zip_link)

In [13]:
# Preprocess the validation split and keep its path/label table.
df_val = create_dataset(folder=val_folder, typee='val')



In [14]:
from IPython.display import FileLink

# Folder holding the processed validation images; the archive is written next
# to it under the same name with a .zip extension.
directory_path = '/kaggle/working/val'  # Replace with the actual directory path

# Zip the folder's contents.
shutil.make_archive(base_name=directory_path, format='zip', root_dir=directory_path)

# Show a link for downloading the zip file.
zip_link = FileLink(directory_path + '.zip')
display(zip_link)

In [15]:
from IPython.display import FileLink

# Preprocess the training split and keep its path/label table.
df_train = create_dataset(folder=train_folder, typee='train')

# Folder holding the processed training images; the archive is written next
# to it under the same name with a .zip extension.
directory_path = '/kaggle/working/train'  # Replace with the actual directory path

# Zip the folder's contents.
shutil.make_archive(base_name=directory_path, format='zip', root_dir=directory_path)

# Show a link for downloading the zip file.
zip_link = FileLink(directory_path + '.zip')
display(zip_link)



In [16]:
# Expand the 'labels' column into one indicator column per class.
df1 = pd.get_dummies(df_train, columns=['labels'])
# Cast every column after the first (assumed to hold the path strings) to
# integers so the indicators are stored as 0/1.
df1 = df1.astype({name: int for name in df1.columns[1:]})
df1.to_csv('train.csv', index=False)

In [17]:
# Expand the 'labels' column into one indicator column per class.
df2 = pd.get_dummies(df_val, columns=['labels'])
# Cast every column after the first (assumed to hold the path strings) to
# integers so the indicators are stored as 0/1.
df2 = df2.astype({name: int for name in df2.columns[1:]})
df2.to_csv('val.csv', index=False)

In [18]:
# Expand the 'labels' column into one indicator column per class.
df3 = pd.get_dummies(df_test, columns=['labels'])
# Cast every column after the first (assumed to hold the path strings) to
# integers so the indicators are stored as 0/1.
df3 = df3.astype({name: int for name in df3.columns[1:]})
df3.to_csv('test.csv', index=False)