# Extracting paths and group numbers

In [23]:
import os 
folder_path = "/home/kaibald231/ABC/26Apr" #26Apr #28Apr #28jan #27jan
if os.path.isdir(folder_path):
    print("Folder exists")
else:
    print("Folder does not exist")
    s


Folder exists


In [24]:
import pandas as pd
import os
def find_exports_folders(root_path):
    """
    Recursively searches for all folders named 'exports' starting from root_path.
    
    Args:
        root_path (str): The path to the root folder to search from
        
    Returns:
        list: A list of absolute paths to all folders named 'exports'
    """
    exports_paths = []
    
    # Check if the root path exists
    if not os.path.exists(root_path):
        print(f"Warning: Path '{root_path}' does not exist")
        return exports_paths
    
    # Walk through all directories and subdirectories
    for root, dirs, files in os.walk(root_path):
        # Check if 'exports' is in the list of directory names
        if 'exports' in dirs:
            # Get the full path to the exports folder
            exports_path = os.path.join(root, 'exports')
            exports_paths.append(exports_path)
    
    return exports_paths

def extract_part_number_and_date(paths):
    records = []

    for path in paths:
        try:
            parts = path.split(os.sep)
            # Look for the timestamp
            for i in range(len(parts)-1, 0, -1):
                part = parts[i]
                if part.isdigit() and len(part) >= 8:
                    date_str = part[:8]  # YYYYMMDD
                    mm_dd = f"{date_str[4:6]}_{date_str[6:8]}"
                    part_number = parts[i - 1]  # Folder before timestamp
                    records.append({'part_number': part_number, 'date': mm_dd})
                    break
            else:
                records.append({'part_number': None, 'date': None})
        except Exception:
            records.append({'part_number': None, 'date': None})

    return pd.DataFrame(records)

def restructure_phase_columns(df):
    # Step 1: Clean column names
    df.columns = [col.strip() for col in df.columns]

    # Step 2: Find the split index (where 'part number.1' appears)
    split_idx = df.columns.get_loc('part number.1')

    # Step 3: Split columns
    phase1_cols = df.columns[:split_idx]
    phase2_cols = df.columns[split_idx:]

    # Step 4: Create MultiIndex columns
    new_columns = []
    for col in phase1_cols:
        col_name = 'date' if col == 'phase1 date' else col
        new_columns.append(('phase1', col_name))

    for col in phase2_cols:
        col = col.replace('.1', '').strip()
        col_name = 'date' if col == 'phase2 date' else col
        new_columns.append(('phase2', col_name))

    # Step 5: Apply new MultiIndex columns
    df.columns = pd.MultiIndex.from_tuples(new_columns)

    return df

def normalize_date_column(df, col='date'):
    month_map = {
        'jan': '01', 'feb': '02', 'mar': '03', 'apr': '04',
        'may': '05', 'jun': '06', 'jul': '07', 'aug': '08',
        'sep': '09', 'oct': '10', 'nov': '11', 'dec': '12'
    }

    def parse_date(val):
        if pd.isna(val):
            return None
        val = str(val).lower().replace(" ", "")  # e.g., 'Apr 26' -> 'apr26'
        for mon, mm in month_map.items():
            if val.startswith(mon):
                day_part = val[len(mon):]
                if day_part.isdigit():
                    return f"{mm}_{day_part.zfill(2)}"  # zero-pad day
        return None  # fallback if no match

    df[col] = df[col].apply(parse_date)
    return df

def merge_group_tablo_order(small_df, big_df):
    merged_rows = []

    for phase in ['phase1', 'phase2']:
        # Extract and rename necessary columns
        df_phase = big_df[phase][['part number', 'date', 'group', 'tablo order']].copy()
        df_phase.columns = ['part_number', 'date', 'group', 'tablo_order']
        
        # Merge with small_df
        merged = pd.merge(
            small_df,
            df_phase,
            on=['part_number', 'date'],
            how='left'
        )
        merged['phase'] = phase  # optional: track source phase
        merged_rows.append(merged)

    # Combine phase1 + phase2 results (you can skip this if you only need matched rows)
    final = pd.concat(merged_rows, ignore_index=True)

    # Optionally drop rows with no match
    final = final.dropna(subset=['group', 'tablo_order'], how='all')

    return final

def process_paths(df, paths):
    result = []
    
    # Create a mapping from part_number to its corresponding row in df
    part_to_row = {}
    for idx, row in df.iterrows():
        part_to_row[row['part_number']] = row
    
    for path in paths:
        # Extract part_number from the path (assuming it's the 2nd last directory)
        part_number = path.split('/')[-3]
        
        # Get the corresponding row from df
        row = part_to_row.get(part_number)
        
        if row is None:
            # If part_number not found in df, skip or handle as needed
            continue
        
        # Determine the value for the 2nd element
        if row['phase'] == 'phase2':
            value = '2'
        elif row['phase'] == 'phase1':
            value = row['group']
        else:
            value = None  # Fallback (adjust as needed)
        
        # Determine if tablo_order is "bad" (assuming NaN is not "bad")
        is_valid = False if str(row['tablo_order']).lower() == "bad" else True
        
        # Append to result
        result.append([path, value, is_valid])
    
    return result

result = find_exports_folders(folder_path)
current_folders_df = extract_part_number_and_date(result)

# https://docs.google.com/spreadsheets/d/14dm8AJ1-oXXnaGY_-iNnv1_S62QoHA3jy3gcePCKnF4/edit?usp=sharing
sheet_id = "14dm8AJ1-oXXnaGY_-iNnv1_S62QoHA3jy3gcePCKnF4"
sheet_name = "Sheet1"  # or whatever your tab is named
csv_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={sheet_name}"

# Step 1: Read the raw two-level header
df = pd.read_csv(csv_url)
df = restructure_phase_columns(df)
df.loc[:, ('phase1', 'part number')] = df['phase1']['part number'].astype(str).str.zfill(3)
df.loc[:, ('phase2', 'part number')] = df['phase2']['part number'].astype(str).str.zfill(3)

# Apply to all phase columns (e.g., 'phase1', 'phase2', etc.)
for phase_col in ['phase1', 'phase2']:  # Add more if needed
    if phase_col in df.columns:
        # Ensure we're working with a copy to avoid SettingWithCopyWarning
        df[phase_col] = df[phase_col].copy()
        if 'date' in df[phase_col].columns:
            df[phase_col] = normalize_date_column(df[phase_col], col='date')

result_df = merge_group_tablo_order(current_folders_df, df)
process_paths(result_df ,result)

 '013' '014' '015' '016' '017' '018' '019' '020' '021' '022' '023' '024'
 '025' '026' '027' '028' '029' '030' '031' '032' '033' '034' '035' '036'
 '037' '038' '039' '040' '041' '042' '043' '044' '045' '046' '047' '048'
 '049' '050' '051' '052' '053' '054' '055' '056' '057' '058' '059' '060'
 '061' '062' '063' '064' '065' '066' '067' '068' '069' '070' '071' '072'
 '073' '074' '075' '076' '077' '078' '079' '080' '081' '082' '083' '084'
 '085' '086' '087' '088' '089' '090' '091' '092' '093' '094' '095' '096'
 '097' '098' '099' '100' '101' '102' '103' '104' '105' '106' '107' '108'
 '109' '110' '111' '112' '113' '114' '115' '116' '117' '118' '119' '120'
 '121' '122' '123' '124' '125' '126' '127' '128' '129' '130' '131' '132'
 '133' '134' '135' '136' '137' '138' '139' '140' '141' '142' '143' '144'
 '145' '146' '147' '148' '149' '150' '151' '152' '153' '154' '155' '156'
 '157' '158' '159' '160' '161' '162' '163' '164' '165' '166' '167' '168'
 '169' '170' '171' '172' '173' '174' '175' '176' '1

[['/home/kaibald231/ABC/26Apr/BIN9:15/092/20250426101651949/exports',
  '2',
  True],
 ['/home/kaibald231/ABC/26Apr/BIN9:15/093/20250426101622479/exports',
  '2',
  True],
 ['/home/kaibald231/ABC/26Apr/BIN10:15/063/20250426112258634/exports',
  '2',
  True],
 ['/home/kaibald231/ABC/26Apr/BIN10:15/031/20250426112215382/exports',
  '2',
  True],
 ['/home/kaibald231/ABC/26Apr/BIN13:15/037/20250426141354347/exports',
  '2',
  True],
 ['/home/kaibald231/ABC/26Apr/BIN13:15/014/20250426142058230/exports',
  '2',
  True],
 ['/home/kaibald231/ABC/26Apr/BIN14:15/129/20250426151921955/exports',
  '2',
  True],
 ['/home/kaibald231/ABC/26Apr/BIN14:15/158/20250426151952420/exports',
  '2',
  True]]

# Checking if there are no missing files after processing

In [None]:
import os
from pathlib import Path

def find_processed_folder(root_folder: str) -> Path | None:
    """
    Recursively search for the first folder starting with 'processed' under the given root folder.

    Args:
        root_folder (str): The path to start searching from.

    Returns:
        Path or None: Path to the first folder starting with 'processed', or None if not found.
    """
    root_path = Path(root_folder)

    for dirpath, dirnames, _ in os.walk(root_path):
        for dirname in dirnames:
            if dirname.lower().startswith("processed_"):
                return Path(dirpath) / dirname
    return None

def check_files_in_subfolders(root_folder):
    """
    Check if specific files are present in subfolders of the given root folder.
    Reports which files are missing from which subfolders.
    
    Args:
        root_folder (str): Path to the root folder to search in
        
    Returns:
        list: List of tuples (folder_path, missing_files) for folders with missing files
    """
    required_files = [
        "gazeData_mapped.tsv",
        # "fixations_mapped.tsv", 
        # "pupil_trimmed_audio.mp3",
        # "pupil_segment_transcriptions.json"
    ]
    
    root_path = Path(root_folder)
    
    if not root_path.exists():
        print(f"Error: Root folder '{root_folder}' does not exist.")
        return []
    
    if not root_path.is_dir():
        print(f"Error: '{root_folder}' is not a directory.")
        return []
    
    # Get all subdirectories
    subfolders = [d for d in root_path.iterdir() if d.is_dir()]
    
    if not subfolders:
        print(f"No subfolders found in '{root_folder}'.")
        return []
    
    # print(f"Checking {len(subfolders)} subfolders in '{root_folder}'...")
    # print("-" * 60)
    
    folders_with_missing_files = []
    
    for subfolder in subfolders:
        missing_files = []
        
        # Check each required file
        for required_file in required_files:
            file_path = subfolder / required_file
            if not file_path.exists():
                missing_files.append(required_file)
        
        # If any files are missing, record this subfolder
        if missing_files:
            folders_with_missing_files.append((str(subfolder), missing_files))
    
    # Report results
    if folders_with_missing_files:
        print(f"Found {len(folders_with_missing_files)} subfolders with missing files:\n")
        
        for subfolder, missing_files in folders_with_missing_files:
            print(f"📁 {subfolder}")
            for missing_file in missing_files:
                print(f"   ❌ Missing: {missing_file}")
            print()
    return folders_with_missing_files

def check_files_in_subfolders(root_folder):

    """
    Check if specific files are present in subfolders of the given root folder.
    Reports which subfolders contain all required files.
    
    Args:
        root_folder (str): Path to the root folder to search in
        
    Returns:
        list: List of folder paths that have all required files
    """
    required_files = [
        "painting_in_world.mp4",
        "ref_gaze.mp4", 
        "ref2world_mapping.mp4",
        # "pupil_segment_transcriptions.json"
    ]
    
    root_path = Path(root_folder)
    
    if not root_path.exists():
        print(f"Error: Root folder '{root_folder}' does not exist.")
        return []
    
    if not root_path.is_dir():
        print(f"Error: '{root_folder}' is not a directory.")
        return []
    
    # Get all subdirectories
    subfolders = [d for d in root_path.iterdir() if d.is_dir()]
    
    if not subfolders:
        print(f"No subfolders found in '{root_folder}'.")
        return []
    
    folders_with_all_files = []
    
    for subfolder in subfolders:
        has_all_files = True
        
        # Check each required file
        for required_file in required_files:
            file_path = subfolder / required_file
            if not file_path.exists():
                has_all_files = False
                break
        
        # If all files are present, record this subfolder
        if has_all_files:
            folders_with_all_files.append(str(subfolder))
    
    # Report results
    if folders_with_all_files:
        print(f"Found {len(folders_with_all_files)} subfolders with all required files:\n")
        for subfolder in folders_with_all_files:
            print(f"✅ {subfolder}")
    else:
        print("No subfolders contain all required files.")
    
    return folders_with_all_files


# Collect all folder paths with missing files
all_folders_with_missing_files = []

result = find_exports_folders(folder_path)
for sub_folder in result:
    processed_folder = find_processed_folder(sub_folder)
    missing_folders = check_files_in_subfolders(processed_folder)
    
    # Extract just the folder paths (not the missing files list)
    folder_paths_only = [folder_path for folder_path, _ in missing_folders]
    all_folders_with_missing_files.extend(folder_paths_only)

print(f"\nAll folders with missing files: {len(all_folders_with_missing_files)} total")
print(all_folders_with_missing_files)


Found 8 subfolders with all required files:

✅ /home/kaibald231/ABC/26Apr/BIN9:15/092/20250426101651949/exports/processed_2abc092/000
✅ /home/kaibald231/ABC/26Apr/BIN9:15/092/20250426101651949/exports/processed_2abc092/001
✅ /home/kaibald231/ABC/26Apr/BIN9:15/092/20250426101651949/exports/processed_2abc092/002
✅ /home/kaibald231/ABC/26Apr/BIN9:15/092/20250426101651949/exports/processed_2abc092/003
✅ /home/kaibald231/ABC/26Apr/BIN9:15/092/20250426101651949/exports/processed_2abc092/004
✅ /home/kaibald231/ABC/26Apr/BIN9:15/092/20250426101651949/exports/processed_2abc092/005
✅ /home/kaibald231/ABC/26Apr/BIN9:15/092/20250426101651949/exports/processed_2abc092/006
✅ /home/kaibald231/ABC/26Apr/BIN9:15/092/20250426101651949/exports/processed_2abc092/007


ValueError: too many values to unpack (expected 2)

# If there are missing data, run this

In [5]:
import os
from pathlib import Path
import glob

def create_jobs_from_missing_folders(missing_folders):
    """
    Create a jobs list from missing folder paths.
    
    Args:
        missing_folders (list): List of folder paths that are missing required files
        
    Returns:
        list: List of tuples in the format (gaze_csv, world_mp4, ref_jpg, output_path, "False")
    """
    jobs = []
    
    for missing_folder in missing_folders:
        try:
            missing_path = Path(missing_folder)
            
            # Extract the folder name (e.g., "006" from the path)
            folder_name = missing_path.name
            
            # Navigate to the exports folder
            # From: /home/kaibald231/ABC/26Apr/BIN13:15/014/20250426142058230/exports/processed_2abc014/006
            # To:   /home/kaibald231/ABC/26Apr/BIN13:15/014/20250426142058230/exports/006
            
            # Go up to exports folder and then to the folder with same name
            exports_path = missing_path.parent.parent / folder_name
            
            # Path 1: gaze_positions.csv in exports folder
            gaze_csv = exports_path / "gaze_positions.csv"
            
            # Path 2: Find .mp4 file in exports folder (usually world.mp4)
            mp4_files = list(exports_path.glob("*.mp4"))
            if mp4_files:
                world_mp4 = mp4_files[0]  # Take the first .mp4 file found
            else:
                # If no .mp4 found, use expected path
                world_mp4 = exports_path / "world.mp4"
            
            # Path 3: Find .jpg file in the missing folder (original path)
            jpg_files = list(missing_path.glob("*.jpg"))
            if jpg_files:
                ref_jpg = jpg_files[0]  # Take the first .jpg file found
            else:
                # If no .jpg found, use a generic name
                ref_jpg = missing_path / "reference.jpg"
            
            # Path 4: Output path (the missing folder itself)
            output_path = missing_folder
            
            # Path 5: Always "False"
            flag = "True"
            
            # Create the job tuple
            job_tuple = (
                str(gaze_csv),
                str(world_mp4),
                str(ref_jpg),
                str(output_path),
                flag
            )
            
            jobs.append(job_tuple)
            
        except Exception as e:
            print(f"Error processing folder {missing_folder}: {e}")
            continue
    
    return jobs

def print_jobs_list(jobs):
    """
    Print the jobs list in a readable format.
    """
    print("jobs = [")
    for i, job in enumerate(jobs):
        print("    (")
        print(f'        "{job[0]}",')
        print(f'        "{job[1]}",')
        print(f'        "{job[2]}",')
        print(f'        "{job[3]}",')
        print(f'        "{job[4]}"')
        if i < len(jobs) - 1:
            print("    ),")
        else:
            print("    )")
    print("]")

# Create jobs list
jobs = create_jobs_from_missing_folders(all_folders_with_missing_files)

# Print in the required format
print_jobs_list(jobs)

# Copy and paste the output into your script run_missing_data.py and run it


jobs = [
    (
        "/home/kaibald231/ABC/27jan/BIN9:15/052/20250127100823065/exports/007/gaze_positions.csv",
        "/home/kaibald231/ABC/27jan/BIN9:15/052/20250127100823065/exports/007/world.mp4",
        "/home/kaibald231/ABC/27jan/BIN9:15/052/20250127100823065/exports/processed_2abc052/007/FLORIS Frans, Portrait de dame âgée, parfois dit la Femme du fauconnier Inv.47.jpg",
        "/home/kaibald231/ABC/27jan/BIN9:15/052/20250127100823065/exports/processed_2abc052/007",
        "True"
    ),
    (
        "/home/kaibald231/ABC/27jan/BIN14:15/134/20250127150153661/exports/007/gaze_positions.csv",
        "/home/kaibald231/ABC/27jan/BIN14:15/134/20250127150153661/exports/007/world.mp4",
        "/home/kaibald231/ABC/27jan/BIN14:15/134/20250127150153661/exports/processed_2abc134/007/FLORIS Frans, Portrait de dame âgée, parfois dit la Femme du fauconnier Inv.47.jpg",
        "/home/kaibald231/ABC/27jan/BIN14:15/134/20250127150153661/exports/processed_2abc134/007",
        "True"
  

# Copy pasting the processed folders into output folder

In [None]:
import os
import shutil
def copy_processed_folders_simple(source_path, destination_path):

    """
    Alternative version that copies all 'processed_' folders into a subfolder
    named after the source folder within the destination.
    
    Args:
        source_path (str): The path to search for 'processed_' folders
        destination_path (str): The path where the folders should be copied to
        
    Returns:
        list: A list of tuples (source_path, destination_path) for each copied folder
    """
    copied_folders = []
    
    # Check if source path exists
    if not os.path.exists(source_path):
        print(f"Error: Source path '{source_path}' does not exist")
        return copied_folders
    
    # Get the name of the source folder
    source_folder_name = os.path.basename(os.path.abspath(source_path))
    
    # Create the destination structure: destination_path/source_folder_name/
    final_destination = os.path.join(destination_path, source_folder_name)
    
    # Create destination directory if it doesn't exist
    if not os.path.exists(final_destination):
        os.makedirs(final_destination)
        print(f"Created destination directory: {final_destination}")
    
    # Walk through all directories and subdirectories
    for root, dirs, files in os.walk(source_path):
        # Check each directory name
        for dir_name in dirs:
            if dir_name.startswith('processed_'):
                source_folder = os.path.join(root, dir_name)
                dest_folder = os.path.join(final_destination, dir_name)
                
                # Handle name conflicts by adding a number suffix
                counter = 1
                original_dest = dest_folder
                while os.path.exists(dest_folder):
                    dest_folder = f"{original_dest}_{counter}"
                    counter += 1
                
                try:
                    # Copy the entire folder and its contents
                    shutil.copytree(source_folder, dest_folder)
                    copied_folders.append((source_folder, dest_folder))
                    print(f"Copied: {source_folder} -> {dest_folder}")
                    
                except Exception as e:
                    print(f"Error copying {source_folder}: {str(e)}")
    
    return copied_folders

source_folder = folder_path
dest_folder = "/home/kaibald231/ABC_Processed"  # Change this to your desired destination folder

# Use the main function (preserves directory structure)
result = copy_processed_folders_simple(source_folder, dest_folder)

print(f"\nCopied {len(result)} 'processed_' folders:")
for src, dst in result:
    print(f"  {src} -> {dst}")