In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import shutil
import pathlib
import PIL
from PIL import Image

Matplotlib created a temporary cache directory at /tmp/matplotlib-bvwqvhcv because the default path (/.config/matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


### Image Dataset Processing

In [2]:
# List the entries of the 'instrument-images' directory (one per instrument
# class in the raw dataset) to see what there is to work with.
# os.listdir() returns entries in arbitrary order.
for entry in os.listdir(os.path.join(os.getcwd(), 'instrument-images')):
    print(entry)

harp
kemenche
baglama
electric_guitar
oud
acoustic_guitar
violin
mandolin
yayli_tambur
kanun


In [3]:
# Build the class-name -> integer-label dictionary and its inverse.
img_dir = os.path.join(os.getcwd(), 'instrument-images')

# Collect class names from whichever layout the directory is currently in, so
# this cell gives the same mapping before and after the images are flattened:
#   * raw layout:       one subdirectory per instrument class
#   * flattened layout: image files named "<label>_<number>.<ext>"
# Anything else (exported CSV, OS metadata files, ...) is ignored rather than
# being turned into a bogus label.
class_names = set()
for entry in os.listdir(img_dir):
    if os.path.isdir(os.path.join(img_dir, entry)):
        class_names.add(entry)
    elif '_' in entry and entry.lower().endswith(('.jpg', '.jpeg', '.png', '.gif')):
        # Class names may themselves contain underscores, so split on the last one.
        class_names.add(entry.rpartition('_')[0])

# Labels are assigned in alphabetical order of class name.
labels = {name: idx for idx, name in enumerate(sorted(class_names))}
inverted_labels = {idx: name for name, idx in labels.items()}

print(labels)
print(inverted_labels)

{'acoustic_guitar': 0, 'baglama': 1, 'electric_guitar': 2, 'harp': 3, 'kanun': 4, 'kemenche': 5, 'mandolin': 6, 'oud': 7, 'violin': 8, 'yayli_tambur': 9}
{0: 'acoustic_guitar', 1: 'baglama', 2: 'electric_guitar', 3: 'harp', 4: 'kanun', 5: 'kemenche', 6: 'mandolin', 7: 'oud', 8: 'violin', 9: 'yayli_tambur'}


In [4]:
# Flatten the dataset: move every file out of its per-class subdirectory into
# 'instrument-images' itself, renamed to "<label>_<number>.jpg" (numbering
# starts at 1 and follows the sorted order of the original filenames).
# NOTE: the '.jpg' suffix is applied regardless of the file's original extension.
for class_name in os.listdir(img_dir):
    class_dir = os.path.join(img_dir, class_name)

    # Skip anything that is not a class subdirectory (already-flattened images,
    # an exported CSV, OS metadata files); os.listdir() on a regular file would
    # raise NotADirectoryError and abort the cell halfway through.
    if not os.path.isdir(class_dir):
        continue

    for number, old_name in enumerate(sorted(os.listdir(class_dir)), start=1):
        old_file = os.path.join(class_dir, old_name)
        new_file = os.path.join(img_dir, f'{class_name}_{number}.jpg')
        # shutil.move may silently replace an existing destination file; refuse
        # to clobber an image left behind by an earlier, partial run.
        if os.path.exists(new_file):
            raise FileExistsError(f'refusing to overwrite existing file: {new_file}')
        shutil.move(old_file, new_file)

In [5]:
# Clean up the now-empty per-class subdirectories of 'instrument-images'.
# os.rmdir() only removes empty directories and raises OSError otherwise.
entry_paths = [os.path.join(img_dir, name) for name in os.listdir(img_dir)]
for entry_path in entry_paths:
    if not os.path.isdir(entry_path):
        continue
    os.rmdir(entry_path)

In [6]:
# Image cleanup: store every image in RGB mode.
def convert_to_rgb(img_path):
    """Re-save the image at ``img_path`` in place, converting it to RGB if needed.

    The file is opened with PIL, converted to mode 'RGB' when it is in any
    other mode, and then written back to the same path. The file is re-saved
    even when it was already RGB.
    """
    with Image.open(img_path) as source:
        rgb_image = source if source.mode == 'RGB' else source.convert('RGB')
        rgb_image.save(img_path)

# Run the RGB conversion over every image file directly inside the image
# directory; entries with other extensions are left alone.
image_suffixes = (".jpg", ".jpeg", ".png", ".gif")
image_names = (name for name in os.listdir(img_dir) if name.lower().endswith(image_suffixes))
for image_name in image_names:
    convert_to_rgb(os.path.join(img_dir, image_name))



In [7]:
# Build a DataFrame of image filenames and integer labels to export to CSV.
# Only image files are listed: the export cell writes 'image_labels.csv' into
# this same directory, and without the filter a re-run would try to look up a
# label for that file and fail with a KeyError.
image_list = [
    name for name in sorted(os.listdir(img_dir))
    if name.lower().endswith(('.jpg', '.jpeg', '.png', '.gif'))
]

# Filenames look like "<label>_<number>.jpg". The label part can itself contain
# underscores (e.g. 'acoustic_guitar'), so split on the LAST underscore.
# An image whose prefix is not a known class still raises KeyError on purpose.
label_list = [labels[name.rpartition('_')[0]] for name in image_list]

# Rows are in plain string-sorted filename order (so '..._10' precedes '..._2').
image_labels = pd.DataFrame({'image': image_list, 'label': label_list})

In [8]:
# Sanity check: every label is expected to appear 100 times.
label_counts = image_labels['label'].value_counts()
print(label_counts)

# Look at rows 95-104, a stretch where the label changes from one class to the next.
image_labels.iloc[95:105]

label
0    100
1    100
2    100
3    100
4    100
5    100
6    100
7    100
8    100
9    100
Name: count, dtype: int64


Unnamed: 0,image,label
95,acoustic_guitar_95.jpg,0
96,acoustic_guitar_96.jpg,0
97,acoustic_guitar_97.jpg,0
98,acoustic_guitar_98.jpg,0
99,acoustic_guitar_99.jpg,0
100,baglama_1.jpg,1
101,baglama_10.jpg,1
102,baglama_100.jpg,1
103,baglama_11.jpg,1
104,baglama_12.jpg,1


In [9]:
# Write the image/label table to 'image_labels.csv' inside the image directory,
# leaving out the DataFrame index.
csv_path = os.path.join(img_dir, 'image_labels.csv')
image_labels.to_csv(csv_path, index=False)