In [None]:
import os
import pandas as pd
import zipfile
from tqdm import tqdm

def init_comparison_dataframe():
    """Return an empty comparison table that has a single 'Name' column and no rows."""
    column_names = ['Name']
    empty_table = pd.DataFrame(columns=column_names)
    return empty_table

def _collect_relative_paths(folder_or_zip):
    """
    Return the set of file paths found in a zip archive or under a folder.

    Zip member names have '/' replaced by ``os.sep`` so they can be compared with
    the relative paths produced by walking a folder. Directory entries are skipped
    in both cases; only files are listed.
    """
    if zipfile.is_zipfile(folder_or_zip):
        with zipfile.ZipFile(folder_or_zip, 'r') as zip_ref:
            return {
                info.filename.replace('/', os.sep)
                for info in tqdm(zip_ref.infolist(), desc="Processing items in zip")
                if not info.is_dir()
            }

    paths = set()
    for root, _, files in tqdm(os.walk(folder_or_zip), desc="Processing items in folder"):
        for file_name in files:
            # Store paths relative to the walked root so they are comparable across sources.
            paths.add(os.path.relpath(os.path.join(root, file_name), folder_or_zip))
    return paths


def add_comparison_column(df, new_folder_or_zip, column_name='folder'):
    """
    Return a copy of ``df`` with a 'yes'/'no' column telling which files exist in a source.

    Every file found in ``new_folder_or_zip`` gets 'yes' in ``column_name``. Files that
    are not yet listed in ``df['Name']`` are appended as new rows, with 'no' in every
    other pre-existing column. Rows whose name is not found in the source get 'no' when
    the column is newly created, and keep their current value when it already existed.

    Parameters
    ----------
    df : pandas.DataFrame
        Comparison table with a 'Name' column. It is not modified; a new frame is returned.
    new_folder_or_zip : str
        Path to a zip archive or to a folder that is walked recursively.
    column_name : str, default 'folder'
        Name of the column that records presence in this source.

    Returns
    -------
    pandas.DataFrame
        The updated table with a fresh 0..n-1 index. Newly added rows are appended
        in sorted name order so the result is reproducible between runs.

    Notes
    -----
    * Zip member names are used in full (including any top-level folder stored in the
      archive), while folder paths are relative to ``new_folder_or_zip``. The two only
      match when the archive was created relative to the same root.
    * NOTE(review): a path that is neither a zip archive nor an existing folder yields
      no items (``os.walk`` reports nothing by default), so no row is marked 'yes'.
    """
    items = _collect_relative_paths(new_folder_or_zip)

    # Work on a copy so the caller's DataFrame is never changed as a side effect.
    result = df.copy()

    if column_name not in result.columns:
        result[column_name] = 'no'

    # Names already listed in the table that also exist in this source.
    result.loc[result['Name'].isin(items), column_name] = 'yes'

    # Sorted so that the order of appended rows does not depend on set iteration order.
    names_to_add = sorted(items - set(result['Name']))
    if not names_to_add:
        return result.reset_index(drop=True)

    # New rows: present in this source ('yes'), absent from every earlier one ('no').
    row_values = {col: 'no' for col in result.columns}
    row_values[column_name] = 'yes'
    row_values['Name'] = names_to_add
    new_rows = pd.DataFrame(row_values)

    if len(result) == 0:
        # Nothing to concatenate with; avoids concatenating an empty frame.
        return new_rows

    return pd.concat([result, new_rows], ignore_index=True)

In [None]:
# Sources to compare: a zip archive, a folder, and an optional second archive.
zip_path = 'C:\\Users\\Patrick\\Desktop\\vox2_aac_2.zip'
folder_path = 'C:\\Users\\Patrick\\Desktop\\vox2_aac_2'
zip2_path = 'C:\\Users\\Patrick\\Desktop\\vox2_aac_1.zip'

# Start from an empty table and add one 'yes'/'no' column per source.
df = init_comparison_dataframe()
df = add_comparison_column(df, new_folder_or_zip=zip_path, column_name='zip')
df = add_comparison_column(df, new_folder_or_zip=folder_path, column_name='folder')
# Optional third source (currently disabled):
# df = add_comparison_column(df, new_folder_or_zip=zip2_path, column_name='zip2')


In [None]:

# Rows whose file was not found in the folder source.
filtered_df = df.loc[df['folder'].eq('no')]
filtered_df

In [None]:
# Sources for the second comparison.
zip_path = 'C:\\Users\\Patrick\\Desktop\\vox1_dev_wav.zip'  # Path to your zip file
folder_path = 'C:\\Users\\Patrick\\Desktop\\vox1_dev_wav'  # Path to your folder

# Rebuild the table from scratch with one 'yes'/'no' column per source.
df = init_comparison_dataframe()
df = add_comparison_column(df, new_folder_or_zip=zip_path, column_name='zip')
df = add_comparison_column(df, new_folder_or_zip=folder_path, column_name='folder')

# Keep the rows that are missing from at least one of the two sources.
filtered_df = df.loc[df[['zip', 'folder']].eq('no').any(axis=1)]
filtered_df
