# Metadata Tools
* Use mutagen to view and tag files
* Parse song filenames and store the extracted metadata in a dataframe
* Save metadata as CSV
* Convert metadata and save it as Parquet

In [1]:
from mutagen.id3 import ID3, TIT2, TPE1, TALB, APIC
import os
import numpy
import re
import pandas as pd
import pyarrow

In [2]:
# Work from the repository's data directory so that the relative paths used in
# other cells ('./...mp3', 'metadata/...') resolve against it.
path = '/Users/cam/Code/repos/music_tools/data'
os.chdir(path)
# Echo the directory actually in effect after the chdir.
print('current directory: ', os.getcwd())

current directory:  /Users/cam/Code/repos/music_tools/data


In [None]:
# Open the ID3 tag of one sample track; the path is relative to the current
# working directory.
tags = ID3("./King Gizzard & The Lizard Wizard - Live at Field of Vision '25 - 01 Gamma Knife (Live at Field of Vision '25).mp3")

# For each frame take its first text value, or a placeholder when the frame
# lookup returns nothing.
title = tags.get('TIT2').text[0] if tags.get('TIT2') else "Unknown Title"
artist = tags.get('TPE1').text[0] if tags.get('TPE1') else "Unknown Artist"
album = tags.get('TALB').text[0] if tags.get('TALB') else "Unknown Album"

print(f"Title: {title}")
print(f"Artist: {artist}")
print(f"Album: {album}")

In [None]:
# TIT2 = Song title
# TPE1 = Artist
# TALB = Album

# Add one text frame per field to the `tags` object; this cell expects `tags`
# to be an ID3 object already created in the kernel.
# NOTE(review): encoding=3 is presumably mutagen's UTF-8 text encoding — confirm.
tags.add(TIT2(encoding=3, text='Gamma Knife'))
tags.add(TPE1(encoding=3, text='King Gizzard & The Lizard Wizard'))
tags.add(TALB(encoding=3, text="Live at Field of Vision '25"))

# Persist the modified tag.
tags.save()

In [None]:
# Scratch check: add the same three frame types again with short placeholder
# values, then save.
# NOTE(review): presumably this replaces the title/artist/album already stored
# in `tags` — confirm before running it against a file you care about.
tags.add(TIT2(encoding=3, text='hello'))
tags.add(TPE1(encoding=3, text='KGLW'))
tags.add(TALB(encoding=3, text="FoV '25"))

tags.save()

## Functions

In [5]:
# Functions
def get_tags(path, verbose = False):
    """
    Read the title, artist and album text from a file's ID3 tag.
    :param path: Path of the audio file whose ID3 tag is opened.
    :param verbose: When true, print the three values followed by a dashed separator line.
    :return: Dict with the keys 'title', 'artist' and 'album'. A frame that is
             not found is reported with an "Unknown ..." placeholder.
    """
    id3 = ID3(path)

    def _first_text(frame_id, placeholder):
        # First text value of the frame, or the placeholder when the lookup is falsy.
        frame = id3.get(frame_id)
        return frame.text[0] if frame else placeholder

    title = _first_text('TIT2', "Unknown Title")
    artist = _first_text('TPE1', "Unknown Artist")
    album = _first_text('TALB', "Unknown Album")

    if verbose:
        report = (
            f"Title: {title}",
            f"Artist: {artist}",
            f"Album: {album}",
            '-' * 20,
        )
        for line in report:
            print(line)

    return dict(title=title, artist=artist, album=album)

def add_tags(tag_dict, path, verbose = False):
    """
    Write title, artist and album text frames to a file's ID3 tag and save it.
    :param tag_dict: Dict that may hold 'title', 'artist' and 'album'. A key that
                     is missing or set to None leaves that frame untouched.
    :param path: Path of the audio file to tag.
    :param verbose: When true, print a confirmation after the tag has been saved.
    """
    # Local import: the notebook's import cell does not bring this name in.
    from mutagen.id3 import ID3NoHeaderError

    try:
        tags = ID3(path)
    except ID3NoHeaderError:
        # The file carries no ID3 tag yet: start from an empty tag instead of failing.
        tags = ID3()

    frame_types = {'title': TIT2, 'artist': TPE1, 'album': TALB}
    for key, frame_type in frame_types.items():
        value = tag_dict.get(key)
        if value is None:
            continue
        tags.add(frame_type(encoding=3, text= value ))

    # Save to the path explicitly: a tag created with ID3() has no file attached.
    tags.save(path)

    if verbose:
        print(f'Tags saved to {path}')

# Expected layout:
#   "<artist> - <album> - <track number> <title> (<parenthetical>).mp3"
_FILENAME_PATTERN = re.compile(
    r"""
    (?P<artist>.*?)           # artist: shortest text before the first " - "
    \s-\s
    (?P<album>.*?)            # album: shortest text before " - <digits> "
    \s-\s
    (?P<track_number>\d+)     # track number, kept as the digit string
    \s
    (?P<title>.*?)            # title: text before the space that precedes "("
    \s
    (?P<full_title>\(.*\))    # trailing parenthetical, parentheses included
    \.mp3\Z                   # extension, which must end the name
    """,
    # IGNORECASE only affects the literal "mp3", so ".MP3" files parse too.
    re.VERBOSE | re.IGNORECASE,
)


def parse_filename(filename):
    """
    Split a track filename into its metadata fields.
    :param filename: File name (no directory part) in the layout
                     "<artist> - <album> - <nn> <title> (<suffix>).mp3".
    :return: Dict with 'artist', 'album', 'track_number', 'title' and
             'full_title' (all strings). When the name does not fit the layout
             a message is printed and every value is None.
    """
    match = _FILENAME_PATTERN.search(filename)

    if match is None:
        print(f'Parsing Failed: {filename}')
        # fallback: same keys as a successful parse, so callers always get
        # a uniform set of columns
        return dict.fromkeys(_FILENAME_PATTERN.groupindex)

    return match.groupdict()

def find_files(directory, extension='.mp3'):
    """
    Recursively finds all files of a specific type.
    :param directory: The root folder to start the search.
    :param extension: The file extension to look for (e.g., '.mp3', '.wav', '.flac').
                      A missing leading '.' is added; matching is case-insensitive.
    :return: List of matching files as paths relative to ``directory``. A file
             directly inside ``directory`` is therefore just its name, while a
             nested file keeps its sub-directory (e.g. 'disc1/song.mp3').
    """
    # Don't forget the '.'
    if not extension.startswith('.'):
        extension = '.' + extension

    found_files = []
    _collect_files(directory, '', extension.lower(), found_files)
    return found_files


def _collect_files(directory, prefix, suffix, found_files):
    """Append to found_files every file under directory whose name ends with suffix."""
    try:
        with os.scandir(directory) as entries:
            for entry in entries:
                # Path relative to the search root; join('', name) is just name.
                relative = os.path.join(prefix, entry.name)
                if entry.is_file():
                    if entry.name.lower().endswith(suffix):
                        found_files.append(relative)
                elif entry.is_dir():
                    # Recurse into subdirectories
                    _collect_files(entry.path, relative, suffix, found_files)
    except PermissionError:
        # Best effort: an unreadable directory is skipped rather than
        # aborting the whole search.
        pass

def convert_metadata(paths):
    """
    Build a dataframe with one row of parsed filename metadata per path.
    :param paths: List of file paths; only the basename of each one is parsed.
    :return: DataFrame with the fields produced by parse_filename plus
             'path' (the value as passed in) and 'filename' (its basename).
    """
    def _record(path):
        # One row: the parsed fields first, then the two bookkeeping columns.
        name = os.path.basename(path)
        return {**parse_filename(name), 'path': path, 'filename': name}

    return pd.DataFrame([_record(path) for path in paths])

def update_song(df, metadata, filename):
    """
    Overwrite metadata fields, in place, for the rows of one song.
    :param df: DataFrame with a 'filename' column; it is modified in place.
    :param metadata: Dict mapping column name -> new value.
    :param filename: Value of the 'filename' column identifying the row(s) to update.
                     If no row matches, nothing is changed.
    """
    # Select the rows once, before any write. If 'filename' is itself among the
    # columns being updated, re-evaluating the comparison afterwards would no
    # longer find the rows and the remaining fields would be silently skipped.
    rows = df['filename'] == filename

    for column, value in metadata.items():
        df.loc[rows, column] = value

In [None]:
# Read the sample track's tags through the helper. `tags` is bound to the dict
# that get_tags returns (other cells use the same name for an ID3 object).
tags = get_tags("./King Gizzard & The Lizard Wizard - Live at Field of Vision '25 - 01 Gamma Knife (Live at Field of Vision '25).mp3")

In [None]:
# Tag the sample track through the helper.
# NOTE(review): 'title' is an empty string, and add_tags writes tag_dict['title']
# as the TIT2 text — confirm that storing an empty title is intended.
tags = {'title':'', 'artist':'King Gizzard & The Lizard Wizard', 'album':"Live at Field of Vision '25"}

add_tags(tags, "./King Gizzard & The Lizard Wizard - Live at Field of Vision '25 - 01 Gamma Knife (Live at Field of Vision '25).mp3")

In [None]:
# tag the songs with artist and album
# NOTE(review): `mp3s` is not assigned in any cell shown here (another cell
# builds `mp3_files`) — it must already exist in the kernel for this to run.
# NOTE(review): 'title' is '' for every song, and add_tags writes it as the
# TIT2 text — confirm the titles are really meant to be blanked.
# NOTE(review): the artist is spelled with a lowercase "the" here but with
# "The" in the other cells and in the filenames — confirm which one is wanted.
tag_dict = {'title':'', 'artist':'King Gizzard & the Lizard Wizard', 'album':"Live at Field of Vision '25"}
for song in mp3s:
    # Paths are relative to the current working directory.
    add_tags(tag_dict, f'kglw2025-08-15/{song}')
    # The returned dict is discarded and verbose is left off, so this call
    # only re-reads the tag without showing anything.
    get_tags(f'kglw2025-08-15/{song}')

## Find Songs and Generate Metadata

In [6]:
# Folder holding the recordings to catalogue.
data_path = '/Users/cam/Code/repos/music_tools/data/kglw2025-08-15'

# Find MP3s
mp3_files = find_files(data_path, extension='.mp3')
# make dataframe
# Filenames that do not fit the expected layout are reported with a
# "Parsing Failed" message and get None for every parsed field.
df_mp3 = convert_metadata(mp3_files)

Parsing Failed: King Gizzard & The Lizard Wizard - Live at Field of Vision '25 - 23 Police Truck (feat. Jello Biafra) -Live at Field of Vision '25-.mp3


In [7]:
# manually annotate songs that fail
# The keys are the same fields parse_filename produces, so update_song fills
# the columns that were left as None for this file.
metadata = {
        'artist': 'King Gizzard & The Lizard Wizard', 
        'album': "Live at Field of Vision '25", 
        'track_number': '23', 
        'title': 'Police Truck', 
        'full_title': "(feat. Jello Biafra) (Live at Field of Vision '25)"
}

# The last argument must equal the value in the 'filename' column exactly,
# otherwise no row is selected and nothing is updated.
update_song(df_mp3, metadata, "King Gizzard & The Lizard Wizard - Live at Field of Vision '25 - 23 Police Truck (feat. Jello Biafra) -Live at Field of Vision '25-.mp3")



## Save metadata as CSV and Parquet

In [8]:
# save metadata
# The path is relative to the current working directory; the row index is not
# written to the file.
df_mp3.to_csv('metadata/library.csv', index=False)


In [None]:
# Bounce to dictionary then back to dataframe to remove any complicated data
# (orient='list' gives one plain Python list per column).
df_plain = pd.DataFrame(df_mp3.to_dict(orient='list'))

# save to parquet
# The path is relative to the current working directory; pyarrow is requested
# explicitly as the writer engine.
df_plain.to_parquet('metadata/library.parquet', engine='pyarrow')