### **1. Import the necessary packages**

In [1]:
import os
import pandas as pd
from tqdm import tqdm
import shutil
from shutil import copyfile as copy

# Load the image metadata table (one row per image: link, tags, likes,
# comments, id and local file path) and preview the first rows.
df = pd.read_csv('imgs.csv')
df.head()

Unnamed: 0,img_link,tags,likes,comments,img_id,img_path
0,https://cdn.pixabay.com/photo/2022/03/06/05/30...,"Clouds, Sky, Atmosphere, Blue Sky",196,55,A00001,/content/Imgs/A00001.jpg
1,https://cdn.pixabay.com/photo/2022/04/07/11/45...,"Bird, Ornithology, Hummingbird",76,20,A00002,/content/Imgs/A00002.jpg
2,https://cdn.pixabay.com/photo/2022/02/28/15/28...,"Sea, Rainbow, Rainfall, Subtropical",282,106,A00003,/content/Imgs/A00003.jpg
3,https://cdn.pixabay.com/photo/2022/04/04/02/52...,"Cherry Blossoms, Road, Japan, Sakura",42,11,A00004,/content/Imgs/A00004.jpg
4,https://cdn.pixabay.com/photo/2022/04/09/18/06...,"Cape Marguerite, Flower, Plant",39,15,A00005,/content/Imgs/A00005.jpg


### **2.Remove Unnecessary Columns**

In [2]:
# Keep only the columns needed to sort images into per-tag folders.
# Selecting the wanted columns (rather than `del`-ing the unwanted ones in
# place) makes this cell safe to re-run: a second `del df['img_link']` would
# raise KeyError. The resulting column order (tags, img_path) is unchanged.
df = df[['tags', 'img_path']]

df.head()

Unnamed: 0,tags,img_path
0,"Clouds, Sky, Atmosphere, Blue Sky",/content/Imgs/A00001.jpg
1,"Bird, Ornithology, Hummingbird",/content/Imgs/A00002.jpg
2,"Sea, Rainbow, Rainfall, Subtropical",/content/Imgs/A00003.jpg
3,"Cherry Blossoms, Road, Japan, Sakura",/content/Imgs/A00004.jpg
4,"Cape Marguerite, Flower, Plant",/content/Imgs/A00005.jpg


### **3. Finding all the tags**

In [3]:
# Every row stores its tags as one comma-separated string; split each string,
# trim surrounding whitespace and keep one copy of every distinct tag.
unique_tags = {
    name.strip()
    for tag_string in df['tags']
    for name in tag_string.split(',')
}

tags = list(unique_tags)

In [7]:
# Number of distinct tags found across all images.
len(tags)

8015

### **4. Creating Folders for each Tag**

In [8]:
# Make sure the dataset root exists first; without it every os.mkdir below
# fails, and the original bare `except: pass` hid that completely.
os.makedirs('Dataset', exist_ok=True)

# Tags whose folder could not be created (for example names that are not
# valid as a single path component).
mkdir_failed = []

for tag in tqdm(tags):
    try:
        os.mkdir('Dataset/' + tag)
    except FileExistsError:
        # Folder is already there (e.g. this cell was re-run) - nothing to do.
        pass
    except OSError:
        # Record instead of silently ignoring, so missing folders can be explained.
        mkdir_failed.append(tag)

# Show how many tag folders could not be created.
len(mkdir_failed)

100%|██████████| 8015/8015 [00:00<00:00, 22296.35it/s]


### **5. Saving Images in Specific Folders**

In [9]:
# Copy every image into the folder of each of its tags.
# `error` counts the copies that failed with an OS-level error (missing source
# file, missing destination folder, ...). Only OSError is caught so that
# programming mistakes and Ctrl-C are no longer swallowed.
error = 0

# Iterate over the named columns instead of positional `df.values`, so the
# loop does not depend on column order, and use fresh names so the global
# `tags` list is not overwritten.
for tag_string, src in tqdm(zip(df['tags'], df['img_path']), total=len(df)):

    # The file name is the same for every tag of this image; compute it once.
    filename = os.path.basename(src)

    for tag in tag_string.split(','):

        dst = 'Dataset/' + tag.strip() + '/' + filename

        try:
            copy(src, dst)
        except OSError:
            error += 1

100%|██████████| 9101/9101 [00:17<00:00, 535.28it/s]


In [10]:
# Number of copy attempts in the previous cell that raised an exception.
error

20

### **6. Checking Number of Folders**

In [11]:
# List every entry directly under Dataset/ and report how many there are.
folders = os.listdir('Dataset')

print(len(folders))

8003


### **7. Checking number of Images in Each Folder**

In [13]:
# Parallel lists: folder_[k] is a folder name, freq[k] the number of entries in it.
folder_ = []
freq    = []

for folder in tqdm(folders):

    try:
        # Count first, so nothing is appended to either list if listing fails.
        n_images = len(os.listdir(os.path.join('Dataset', folder)))
    except OSError:
        # Entry is not a listable directory (e.g. a stray file under Dataset/);
        # skip it. Only OSError is caught so other mistakes are not hidden.
        continue

    folder_.append(folder)
    freq.append(n_images)

100%|██████████| 8003/8003 [00:00<00:00, 31820.06it/s]


### **8. Top 10 Folders with most number of Images**

In [14]:
# One row per folder with its image count; show the ten most populated folders.
df_ = pd.DataFrame({'folder': folder_, 'freq': freq})

df_.sort_values(by='freq', ascending=False).head(10)

Unnamed: 0,folder,freq
5896,Nature,839
895,Animal,609
3926,Flower,549
7486,Bird,519
6759,Flowers,505
2485,Sea,306
1833,Plant,284
4085,Sunset,275
2974,Forest,261
3981,Trees,242


### **9. Top 10 Folders with least number of Images**

In [15]:
# Ascending sort is the default, so the first ten rows are the smallest folders.
df_.sort_values('freq').head(10)

Unnamed: 0,folder,freq
3999,Indian Elephants,1
3429,Parts,1
3428,Kungfu,1
3427,Backpack,1
3426,Logistics,1
3425,Marmot,1
3424,Pirate Ship,1
3422,Businessmen,1
3421,Nests,1
3420,Werewolf,1


### **10. Folders are having images more than 10**

In [16]:
# Folders that hold 10 or more images - these are the ones that will be kept.
df_.loc[df_['freq'] >= 10]

Unnamed: 0,folder,freq
12,Full Moon,12
22,Perched,46
33,Pollination,47
50,Male,25
62,Alcohol,13
...,...,...
7929,Bear,19
7943,Waves,43
7962,Wedding,24
7981,Notebook,15


### **11. Removing the Folders with less than 10 Images**

In [17]:
# Delete every tag folder that holds fewer than 10 images.
# `error` counts folders that were expected but not found on disk.
error = 0

# Root of the dataset; the same for every iteration, so set it once.
Main_folder = '/content/Dataset'

# Names of the folders below the 10-image threshold.
sparse_folders = df_.loc[df_['freq'] < 10, 'folder']

for name in tqdm(sparse_folders):

  # Full path of the sub-folder to remove
  folder_path = os.path.join(Main_folder, name)

  # Nothing to delete if the sub-folder is missing - just count it
  if not os.path.exists(folder_path):
    error += 1
    continue

  # Remove the sub-folder together with everything inside it
  shutil.rmtree(folder_path)

100%|██████████| 7393/7393 [00:02<00:00, 3386.25it/s]


In [18]:
# Number of folders scheduled for deletion that did not exist on disk.
error

0

### **12. Downloading the folder from colab**

In [20]:
import shutil

# Where the archive should end up, and which folder goes into it.
zip_file_path = '/content/Dataset.zip'
folder_path = '/content/Dataset'

# make_archive appends the '.zip' suffix itself, so pass the path without it.
shutil.make_archive(
    base_name=os.path.splitext(zip_file_path)[0],
    format='zip',
    root_dir=folder_path,
)

'/content/Dataset.zip'

In [21]:
# Trigger a browser download of the zipped dataset; google.colab is only
# importable inside a Google Colab runtime.
from google.colab import files
files.download('/content/Dataset.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>