In [18]:
import os
import shutil
from shutil import copyfile as copy

import pandas as pd
from tqdm import tqdm

# Load the image metadata table from final.csv (path is relative to the
# notebook's working directory) and preview its first rows.
df = pd.read_csv('final.csv')
df.head()

Unnamed: 0,img_link,tags,likes,comments,path,img_id
0,https://cdn.pixabay.com/photo/2022/03/06/05/30...,"Clouds, Sky, Atmosphere, Blue Sky",196,55,Imgs/A00000000.jpg,A00000000
1,https://cdn.pixabay.com/photo/2022/04/07/11/45...,"Bird, Ornithology, Hummingbird",76,20,Imgs/A00000001.jpg,A00000001
2,https://cdn.pixabay.com/photo/2022/02/28/15/28...,"Sea, Rainbow, Rainfall, Subtropical",282,106,Imgs/A00000002.jpg,A00000002
3,https://cdn.pixabay.com/photo/2022/04/04/02/52...,"Cherry Blossoms, Road, Japan, Sakura",42,11,Imgs/A00000003.jpg,A00000003
4,https://cdn.pixabay.com/photo/2022/04/09/18/06...,"Cape Marguerite, Flower, Plant",39,15,Imgs/A00000004.jpg,A00000004


### 1. Remove Unnecessary Columns

In [20]:
# Drop the columns that the folder-building steps below do not use.
# `errors='ignore'` keeps this cell re-runnable: the last cell of this notebook
# overwrites final.csv without these columns, after which a plain `del`
# would raise KeyError on the next run.
df = df.drop(columns=['img_link', 'img_id', 'likes', 'comments'], errors='ignore')

df.head()

Unnamed: 0,tags,path
0,"Clouds, Sky, Atmosphere, Blue Sky",Imgs/A00000000.jpg
1,"Bird, Ornithology, Hummingbird",Imgs/A00000001.jpg
2,"Sea, Rainbow, Rainfall, Subtropical",Imgs/A00000002.jpg
3,"Cherry Blossoms, Road, Japan, Sakura",Imgs/A00000003.jpg
4,"Cape Marguerite, Flower, Plant",Imgs/A00000004.jpg


### 2. Finding all the tags

In [42]:
# Flat list of every tag occurrence (duplicates kept), whitespace-trimmed.
# Rows without a tag string (NaN) are skipped instead of crashing on `.split`.
t = [
    tag.strip()
    for tag_string in df['tags'].dropna()
    for tag in tag_string.split(',')
]

# Distinct, non-empty tag names. Sorting gives a stable order between runs
# (plain set iteration order is not guaranteed), and dropping '' avoids a
# bogus 'Dataset/' folder target in the next step.
tags = sorted({tag for tag in t if tag})

### 3. Creating Folders for each Tag

In [68]:
# Make sure the root exists; without it every os.mkdir below would fail.
os.makedirs('Dataset', exist_ok=True)

mkdir_failures = []  # (tag, exception) for folders that could not be created

for tag in tqdm(tags):

    try:
        os.mkdir('Dataset/' + tag)
    except FileExistsError:
        # Folder is already there (e.g. from an earlier run) - nothing to do.
        continue
    except OSError as exc:
        # Record instead of silently swallowing, so invalid tag names are visible.
        mkdir_failures.append((tag, exc))

print(len(mkdir_failures), 'tag folders could not be created')

100%|██████████████████████████████████| 64096/64096 [00:02<00:00, 26828.99it/s]


### 4. Saving Images in Specific Folders

In [105]:
# Copy each image into the folder of every tag it carries.
copy_failures = []  # (src, dst, exception) for each copy that did not succeed

# Columns are read by name so the loop does not depend on column order.
for tag_string, src in tqdm(zip(df['tags'], df['path']), total=len(df)):

    file_name = src.split('/')[-1]

    # Local names only: the global `tags` list built in step 2 must survive
    # this cell so that step 3 can still be re-run afterwards.
    for tag in (part.strip() for part in tag_string.split(',')):

        if not tag:
            # An empty tag would resolve to 'Dataset//<file>', i.e. the
            # Dataset root itself rather than a tag folder.
            continue

        dst = 'Dataset/' + tag + '/' + file_name

        try:
            copy(src, dst)
        except OSError as exc:
            # Missing source image, missing tag folder, etc.
            copy_failures.append((src, dst, exc))

# `error` keeps its original meaning: the number of failed copies.
error = len(copy_failures)
print(error, 'copies failed')

100%|██████████████████████████████████| 205866/205866 [06:58<00:00, 492.38it/s]


### 5. Checking Number of Folders

In [110]:
# List only the directories directly under Dataset/. A plain os.listdir would
# also count stray files, which would inflate the "number of folders".
with os.scandir('Dataset') as entries:
    folders = [entry.name for entry in entries if entry.is_dir()]

print(len(folders))

64010


### 6. Checking number of Images in Each Folder

In [125]:
# Parallel lists: folder_[k] is a folder name, freq[k] the number of entries in it.
folder_ = []
freq    = []

for folder in tqdm(folders):

    try:
        n_entries = len(os.listdir('Dataset/' + folder))
    except OSError:
        # Entry is not a readable directory (plain file, removed meanwhile, ...).
        # Only filesystem errors are skipped; anything else should surface.
        continue

    folder_.append(folder)
    freq.append(n_entries)

100%|██████████████████████████████████| 64010/64010 [00:01<00:00, 33199.12it/s]


### 7. Top 10 Folders with the Most Images

In [132]:
# One row per folder: its name and how many entries it holds.
df_ = pd.DataFrame({'folder': folder_, 'freq': freq})

# Ten fullest folders first.
df_.sort_values('freq', ascending=False).head(10)

Unnamed: 0,folder,freq
29522,Nature,29598
18068,Flower,19423
46010,Animal,10040
14111,Plant,8514
38982,Bird,7894
50386,Landscape,7539
13600,Water,7305
22399,Flowers,6868
55534,Sea,6431
29188,Bloom,6194


### 8. The 10 Folders with the Fewest Images

In [134]:
# Ascending is the default sort order, so the emptiest folders come first.
df_.sort_values('freq').head(10)

Unnamed: 0,folder,freq
31999,Karroo,1
37018,Snow Woman,1
37019,Don'T Like,1
37021,Arboreal,1
37023,Nandine,1
37024,Girl With Mobile,1
37025,Topkapä± Palace,1
37027,Ekh,1
37028,Fischer-Art In Sebnitz,1
37029,Five Grain Rice,1


### 9. Folders with at least 50 images

In [140]:
# Keep the folders holding 50 or more entries.
df_.loc[df_['freq'].ge(50)]

Unnamed: 0,folder,freq
36,Usa,444
55,Dried,95
61,Volkswagen,53
88,Hunter,54
109,Porsche,52
...,...,...
63909,Frogs,69
63923,Flow,551
63944,Poultry,332
63967,Police,61


### 10. Moving the Folders with fewer than 50 Images out of Dataset (into Temp)

In [156]:
# Destination root must exist before anything can be moved into it.
os.makedirs('Temp', exist_ok=True)

skipped = 0  # folders left untouched because a move would be unsafe or impossible

# Move every folder with fewer than 50 entries from Dataset/ to Temp/.
for folder_name in tqdm(df_[df_['freq'] < 50]['folder']):

    src = 'Dataset/' + folder_name
    dst = 'Temp/' + folder_name

    # `df_` is a snapshot: on a re-run the folder may already have been moved.
    # If dst already exists, shutil.move would nest src inside it, so skip too.
    if not os.path.isdir(src) or os.path.exists(dst):
        skipped += 1
        continue

    shutil.move(src, dst)

print(skipped, 'folders skipped')

100%|██████████████████████████████████| 62028/62028 [00:05<00:00, 10766.93it/s]


In [163]:
# Re-read the metadata and persist it with only the columns still needed.
# NOTE: this overwrites final.csv in place, so the dropped columns are gone
# from disk afterwards.
df = pd.read_csv('final.csv')

# `errors='ignore'` makes the cell idempotent: once the file has been rewritten
# the columns no longer exist, and a plain `del` would raise KeyError.
df = df.drop(columns=['img_link', 'img_id', 'likes', 'comments'], errors='ignore')

df.to_csv('final.csv', index = False)