In [None]:
import os
import pandas as pd
import zipfile

def init_comparison_dataframe():
    """Create the starting comparison table: zero rows and a single 'Name' column."""
    empty_table = pd.DataFrame(columns=['Name'])
    return empty_table

def _path_prefixes(parts, max_components):
    """Yield every leading sub-path of ``parts`` (joined with ``os.sep``), at most ``max_components`` long."""
    for count in range(1, min(len(parts), max_components) + 1):
        yield os.sep.join(parts[:count])


def _collect_zip_items(zip_path, max_components):
    """Return the entry paths of a zip archive that have at most ``max_components`` components."""
    found = set()
    with zipfile.ZipFile(zip_path, 'r') as archive:
        for entry_name in archive.namelist():
            # Directory entries end with '/', which leaves an empty trailing part after
            # splitting. Dropping empty parts keeps 'a/b/' and 'a/b' from turning into two
            # different names (the former could never match a name found by os.walk).
            parts = [part for part in entry_name.split('/') if part]
            found.update(_path_prefixes(parts, max_components))
    return found


def _collect_folder_items(folder_path, max_components):
    """Return the relative paths under ``folder_path`` that have at most ``max_components`` components."""
    found = set()
    if max_components < 1:
        return found
    for root, dirs, files in os.walk(folder_path):
        rel_root = os.path.relpath(root, folder_path)
        # Number of path components of `root` relative to the base folder (0 for the base itself).
        root_components = 0 if rel_root == os.curdir else rel_root.count(os.sep) + 1
        for entry in dirs + files:
            found.add(os.path.relpath(os.path.join(root, entry), folder_path))
        if root_components + 1 >= max_components:
            # Anything below these sub-directories would exceed the limit, so prune the
            # walk in place instead of traversing the rest of the tree for nothing.
            dirs[:] = []
    return found


def add_comparison_column(df, new_folder_or_zip, column_name='folder', stop_at_first_no=False, comparison_level=1):
    """
    Return a copy of ``df`` with a 'yes'/'no' column telling which entries exist
    in the given folder or zip file.

    Every file or directory found in ``new_folder_or_zip`` whose relative path has at
    most ``comparison_level`` components is looked up in the 'Name' column:
    rows that match get 'yes' in ``column_name``; entries not listed yet are appended
    as new rows with 'yes' in ``column_name`` and 'no' in every other column.
    Rows that are not found keep their current value ('no' when the column is new).

    Parameters
    ----------
    df : pandas.DataFrame
        Comparison table with a 'Name' column. It is not modified; the updated
        table is returned.
    new_folder_or_zip : str
        Path to a zip archive or to a folder. Anything ``zipfile.is_zipfile`` rejects
        is walked as a folder with ``os.walk``; a path that cannot be walked
        contributes no entries.
    column_name : str
        Name of the column to add or update.
    stop_at_first_no : bool
        Accepted for backward compatibility; currently has no effect.
    comparison_level : int
        Maximum number of path components considered (1 = top-level entries only).
        Zip archives and folders use the same rule, and values below 1 select nothing.

    Returns
    -------
    pandas.DataFrame
        The updated table. Newly appended rows are sorted by name so repeated runs
        produce the same row order.
    """
    if zipfile.is_zipfile(new_folder_or_zip):
        items = _collect_zip_items(new_folder_or_zip, comparison_level)
    else:
        items = _collect_folder_items(new_folder_or_zip, comparison_level)

    # Work on a copy so the caller's frame is never left half-updated.
    result = df.copy()
    if column_name not in result.columns:
        result[column_name] = 'no'

    # One vectorised membership test instead of scanning the whole column per item.
    already_listed = result['Name'].isin(items)
    result.loc[already_listed, column_name] = 'yes'

    new_names = sorted(items.difference(result['Name']))
    if new_names:
        new_rows = pd.DataFrame({'Name': new_names})
        for col in result.columns:
            if col != 'Name':
                new_rows[col] = 'no'
        new_rows[column_name] = 'yes'
        result = pd.concat([result, new_rows], ignore_index=True)

    return result

In [None]:
# Sources to compare: two zip archives and one folder.
zip_path = 'C:\\Users\\Patrick\\Desktop\\vox2_aac_2.zip'
folder_path = 'C:\\Users\\Patrick\\Desktop\\vox2_aac_2'
zip2_path = 'C:\\Users\\Patrick\\Desktop\\vox2_aac_1.zip'

# Level limit handed to add_comparison_column for every source.
comparison_level = 25

# Start from the empty table and add one 'yes'/'no' column per source, in this order.
df = init_comparison_dataframe()
for source_path, column_label in (
    (zip_path, 'zip'),
    (folder_path, 'folder'),
    (zip2_path, 'zip2'),
):
    df = add_comparison_column(df, source_path, column_label, comparison_level=comparison_level)

# Keep only the rows marked 'no' in the 'folder' column and display them.
filtered_df = df.loc[df['folder'].eq('no')]
filtered_df


In [None]:
# Sources to compare: one zip archive and one folder.
zip_path = 'C:\\Users\\Patrick\\Desktop\\vox1_dev_wav.zip'
folder_path = 'C:\\Users\\Patrick\\Desktop\\vox1_dev_wav'

# Level limit handed to add_comparison_column for every source.
comparison_level = 25

# Start from the empty table and add one 'yes'/'no' column per source, in this order.
df = init_comparison_dataframe()
for source_path, column_label in ((zip_path, 'zip'), (folder_path, 'folder')):
    df = add_comparison_column(df, source_path, column_label, comparison_level=comparison_level)

# Keep the rows marked 'no' in at least one of the two columns and display them.
missing_somewhere = df[['zip', 'folder']].eq('no').any(axis=1)
filtered_df = df[missing_somewhere]
filtered_df
