In [15]:
import json
import os
from pathlib import Path
from typing import Dict, List, Any
import pandas as pd

# Define file paths
SOURCE_FILE = "../example_upload_data/uploaded_files-100.json"  # File with all metadata
TARGET_FILE = "../example_upload_data/uploaded_files-new.json"  # File with new IDs
OUTPUT_FILE = "../example_upload_data/uploaded_files-merged-100.json"  # Output file

print("Required libraries imported successfully!")


Required libraries imported successfully!


In [16]:
def load_json_file(file_path: str) -> List[Dict[str, Any]]:
    """Load JSON file and return its contents."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        print(f"✅ Successfully loaded {file_path}")
        print(f"   Contains {len(data)} entries")
        return data
    except FileNotFoundError:
        print(f"❌ File not found: {file_path}")
        return []
    except json.JSONDecodeError as e:
        print(f"❌ JSON decode error in {file_path}: {e}")
        return []
    except Exception as e:
        print(f"❌ Error loading {file_path}: {e}")
        return []

def create_filename_to_id_mapping(data: List[Dict[str, Any]]) -> Dict[str, int]:
    """Create a mapping from filename to ID."""
    mapping = {}
    for entry in data:
        if 'filename' in entry and 'id' in entry:
            mapping[entry['filename']] = entry['id']
    print(f"Created mapping for {len(mapping)} filenames")
    return mapping


In [17]:
def merge_json_files(source_data: List[Dict[str, Any]], 
                    id_mapping: Dict[str, int]) -> List[Dict[str, Any]]:
    """
    Merge source data with new IDs from the mapping, removing duplicates.
    
    Args:
        source_data: List of file entries from the source JSON
        id_mapping: Dictionary mapping filename to new ID
    
    Returns:
        List of merged entries with updated IDs (duplicates removed)
    """
    merged_data = []
    updated_count = 0
    not_found_count = 0
    duplicate_count = 0
    seen_combinations = set()  # Track (filename, file_path) combinations
    
    for entry in source_data:
        filename = entry.get('filename', '')
        file_path = entry.get('file_path', '')
        
        # Create a unique key for this file
        file_key = (filename, file_path)
        
        # Skip if we've already processed this exact file
        if file_key in seen_combinations:
            duplicate_count += 1
            print(f"🔄 Skipping duplicate: '{filename}' at '{file_path}'")
            continue
            
        seen_combinations.add(file_key)
        
        # Create a copy of the entry
        merged_entry = entry.copy()
        
        if filename in id_mapping:
            # Update the ID
            old_id = merged_entry.get('id')
            new_id = id_mapping[filename]
            merged_entry['id'] = new_id
            updated_count += 1
            print(f"Updated '{filename}': ID {old_id} → {new_id}")
        else:
            not_found_count += 1
            print(f"⚠️  No new ID found for '{filename}' - keeping original ID {entry.get('id')}")
        
        merged_data.append(merged_entry)
    
    print(f"\n📊 Merge Summary:")
    print(f"   Total entries processed: {len(source_data)}")
    print(f"   Duplicates removed: {duplicate_count}")
    print(f"   Unique entries kept: {len(merged_data)}")
    print(f"   IDs updated: {updated_count}")
    print(f"   IDs unchanged: {not_found_count}")
    
    return merged_data


In [4]:
def save_json_file(data: List[Dict[str, Any]], file_path: str) -> bool:
    """Save data to JSON file."""
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
        print(f"✅ Successfully saved {len(data)} entries to {file_path}")
        return True
    except Exception as e:
        print(f"❌ Error saving to {file_path}: {e}")
        return False

def display_sample_data(data: List[Dict[str, Any]], title: str, max_items: int = 3):
    """Display sample data for verification."""
    print(f"\n{title}")
    print("=" * len(title))
    
    if not data:
        print("No data to display")
        return
    
    for i, entry in enumerate(data[:max_items]):
        print(f"\nEntry {i+1}:")
        for key, value in entry.items():
            print(f"  {key}: {value}")
    
    if len(data) > max_items:
        print(f"\n... and {len(data) - max_items} more entries")


In [18]:
# Main execution
print("🚀 Starting JSON file merge process...\n")

# Step 1: Load source file (contains all file metadata)
print("Step 1: Loading source file...")
source_data = load_json_file(SOURCE_FILE)

if not source_data:
    print("❌ Cannot proceed without source data")
else:
    display_sample_data(source_data, "Sample from Source File")
    
    # Step 2: Load target file (contains new IDs)
    print(f"\nStep 2: Loading target file...")
    target_data = load_json_file(TARGET_FILE)
    
    if not target_data:
        print("❌ Cannot proceed without target data")
    else:
        display_sample_data(target_data, "Sample from Target File")
        
        # Step 3: Create filename to ID mapping from target file
        print(f"\nStep 3: Creating filename to ID mapping...")
        id_mapping = create_filename_to_id_mapping(target_data)
        
        if not id_mapping:
            print("❌ No ID mapping created")
        else:
            # Step 4: Merge the files
            print(f"\nStep 4: Merging files...")
            merged_data = merge_json_files(source_data, id_mapping)
            
            # Step 5: Save the result
            print(f"\nStep 5: Saving merged file...")
            if save_json_file(merged_data, OUTPUT_FILE):
                display_sample_data(merged_data, "Sample from Merged File")
                print(f"\n🎉 Process completed successfully!")
                print(f"   Merged file saved as: {OUTPUT_FILE}")
            else:
                print("❌ Failed to save merged file")


🚀 Starting JSON file merge process...

Step 1: Loading source file...
✅ Successfully loaded ../example_upload_data/uploaded_files-100.json
   Contains 98 entries

Sample from Source File

Entry 1:
  id: 12
  filename: Datenschutzerklärung Heier Grundschule.docx
  file_path: app/data/uploads/Datenschutzerklärung Heier Grundschule.docx
  created_at: 2025-05-15T11:12:25.702298
  document_id: None
  relative_path: Datenschutzerklärung Heier Grundschule.docx

Entry 2:
  id: 13
  filename: Drop shipping.docx
  file_path: app/data/uploads/Drop shipping.docx
  created_at: 2025-05-15T11:12:25.702298
  document_id: None
  relative_path: Drop shipping.docx

Entry 3:
  id: 16
  filename: Book.xlsx
  file_path: app/data/uploads/Book.xlsx
  created_at: 2025-05-15T11:12:25.702298
  document_id: None
  relative_path: Book.xlsx

... and 95 more entries

Step 2: Loading target file...
✅ Successfully loaded ../example_upload_data/uploaded_files-new.json
   Contains 117 entries

Sample from Target File

E

In [9]:
# Optional: Additional analysis and utilities

def analyze_files_difference(source_data: List[Dict[str, Any]], 
                           target_data: List[Dict[str, Any]]):
    """Analyze differences between source and target files."""
    print("🔍 File Analysis")
    print("=" * 50)
    
    source_filenames = {entry.get('filename') for entry in source_data}
    target_filenames = {entry.get('filename') for entry in target_data}
    
    print(f"Source file entries: {len(source_data)}")
    print(f"Target file entries: {len(target_data)}")
    print(f"Unique filenames in source: {len(source_filenames)}")
    print(f"Unique filenames in target: {len(target_filenames)}")
    
    # Files in source but not in target
    only_in_source = source_filenames - target_filenames
    if only_in_source:
        print(f"\n⚠️  Files in source but not in target ({len(only_in_source)}):")
        for filename in sorted([f for f in only_in_source if f is not None]):
            print(f"   - {filename}")
    
    # Files in target but not in source
    only_in_target = target_filenames - source_filenames
    if only_in_target:
        print(f"\n⚠️  Files in target but not in source ({len(only_in_target)}):")
        for filename in sorted([f for f in only_in_target if f is not None]):
            print(f"   - {filename}")
    
    # Common files
    common_files = source_filenames & target_filenames
    print(f"\n✅ Files found in both ({len(common_files)}):")
    for filename in sorted([f for f in list(common_files)[:5] if f is not None]):  # Show first 5
        print(f"   - {filename}")
    if len(common_files) > 5:
        print(f"   ... and {len(common_files) - 5} more")

# Uncomment the following lines to run the analysis
# if 'source_data' in locals() and 'target_data' in locals():
#     analyze_files_difference(source_data, target_data)
analyze_files_difference(source_data, target_data)

🔍 File Analysis
Source file entries: 98
Target file entries: 117
Unique filenames in source: 92
Unique filenames in target: 108

⚠️  Files in target but not in source (16):
   - .849C9593-D756-4E56-8D6E-42412F2A707B
   - .DS_Store
   - Becke Beratungsvertrag.pages
   - Becke DSVGO.pages
   - Becke Servicevertrag.pages
   - Book 1.xlsx
   - IMG_0053.jpeg
   - Icon%0D
   - Liste mit Produkten und Ranking.xlsx
   - Untitled Diagram.drawio
   - ideenpapier_gsnrw.pages
   - logo.png
   - v1_1_II.jpg
   - v1_2_II.jpg
   - v2_1_II.jpg
   - v2_2_II.jpg

✅ Files found in both (92):
   - Datenschutzerklärung Heier Grundschule.docx
   - KI_Projekt_Elterninfo.pdf.txt
   - KI_Projekt_Feedbackbogen_Lehrkraefte.txt
   - KI_Projekt_Kostenkalkulation.csv
   - Meilensteinplan-Vorlage-Excel.xlsx
   ... and 87 more


In [11]:
import requests
requested_files = [3, 4, 7, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, 43, 44, 45, 47, 48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 65, 70, 74, 75, 76, 77, 78, 79, 80, 81, 86, 87, 88, 89, 90, 91, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117]

existing_files = []

# request http://localhost:9877/files/ and get all the ids
response = requests.get("http://localhost:9877/files/")
ids = [file["id"] for file in response.json()]

print(ids)

# compare the ids with the requested_files
for id in requested_files:
    if id not in ids:
        print(f"File with id {id} not found")

# save the ids to a json file

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117]


In [13]:
output_file_json = load_json_file(OUTPUT_FILE)

# Find duplicate entries by id
id_counts = {}
for entry in output_file_json:
    file_id = entry['id']
    if file_id in id_counts:
        id_counts[file_id].append(entry)
    else:
        id_counts[file_id] = [entry]

# Print duplicates
duplicates = {id: entries for id, entries in id_counts.items() if len(entries) > 1}
if duplicates:
    print("\nFound duplicate entries:")
    for file_id, entries in duplicates.items():
        print(f"\nID {file_id} appears {len(entries)} times:")
        for entry in entries:
            print(f"  - {entry['filename']} ({entry['file_path']})")
else:
    print("\nNo duplicate IDs found")



✅ Successfully loaded ../example_upload_data/uploaded_files-merged-100.json
   Contains 98 entries

Found duplicate entries:

ID 53 appears 2 times:
  - Alles.xlsx (app/data/uploads/Alles.xlsx)
  - Alles.xlsx (app/data/uploads/Alles.xlsx)

ID 112 appears 2 times:
  - Businessplan 02.12.21 DRAW.docx (app/data/uploads/Businessplan 02.12.21 DRAW.docx)
  - Businessplan 02.12.21 DRAW.docx (app/data/uploads/Businessplan 02.12.21 DRAW.docx)

ID 75 appears 2 times:
  - Hauptformular.docx (app/data/uploads/Hauptformular.docx)
  - Hauptformular.docx (app/data/uploads/Hauptformular.docx)

ID 76 appears 2 times:
  - AVV.doc (app/data/uploads/AVV.doc)
  - AVV.doc (app/data/uploads/AVV.doc)

ID 78 appears 2 times:
  - AGB.docx (app/data/uploads/AGB.docx)
  - AGB.docx (app/data/uploads/AGB.docx)

ID 87 appears 2 times:
  - AVV_Vorlage.doc (app/data/uploads/AVV_Vorlage.doc)
  - AVV_Vorlage.doc (app/data/uploads/AVV_Vorlage.doc)


In [14]:
# Regenerate the merged file with duplicate handling
print("🔄 Regenerating merged file with duplicate removal...\n")

if 'source_data' in locals() and 'target_data' in locals():
    # Step 1: Create filename to ID mapping from target file
    print("Step 1: Creating filename to ID mapping...")
    id_mapping = create_filename_to_id_mapping(target_data)
    
    # Step 2: Merge files with duplicate removal
    print(f"\nStep 2: Merging files with duplicate removal...")
    merged_data_clean = merge_json_files(source_data, id_mapping)
    
    # Step 3: Save the clean result
    print(f"\nStep 3: Saving clean merged file...")
    if save_json_file(merged_data_clean, OUTPUT_FILE):
        display_sample_data(merged_data_clean, "Sample from Clean Merged File")
        
        # Verify no duplicates remain
        print(f"\n🔍 Verifying no duplicates remain...")
        id_counts_clean = {}
        for entry in merged_data_clean:
            file_id = entry['id']
            if file_id in id_counts_clean:
                id_counts_clean[file_id].append(entry)
            else:
                id_counts_clean[file_id] = [entry]
        
        duplicates_clean = {id: entries for id, entries in id_counts_clean.items() if len(entries) > 1}
        if duplicates_clean:
            print("❌ Still found duplicate entries:")
            for file_id, entries in duplicates_clean.items():
                print(f"   ID {file_id} appears {len(entries)} times")
        else:
            print("✅ No duplicate IDs found - file is clean!")
        
        print(f"\n🎉 Clean merged file saved as: {OUTPUT_FILE}")
        print(f"   Total unique entries: {len(merged_data_clean)}")
    else:
        print("❌ Failed to save clean merged file")
else:
    print("❌ Source and target data not available. Please run the cells above first.")


🔄 Regenerating merged file with duplicate removal...

Step 1: Creating filename to ID mapping...
Created mapping for 108 filenames

Step 2: Merging files with duplicate removal...
Updated 'Datenschutzerklärung Heier Grundschule.docx': ID 12 → 3
Updated 'Drop shipping.docx': ID 13 → 4
Updated 'Book.xlsx': ID 16 → 7
Updated 'Rechner 2k.docx': ID 19 → 10
Updated 'Erklärung.pdf': ID 21 → 39
Updated 'Empfehlungsschreiben Leon Bartz.pdf': ID 23 → 41
Updated 'anlage-1-zb-rmv.pdf': ID 24 → 42
Updated 'vn-2Bsb_ab_01.10.2020-1.docx': ID 25 → 43
Updated '2101gr002a_Bartz_ZB_27.01.2021.pdf': ID 26 → 44
Updated 'anlage-1.pdf': ID 27 → 45
Updated '20210302-1000552834-umsatz.CSV': ID 29 → 47
Updated '388802_Quittung.pdf': ID 30 → 48
Updated 'Alles.xlsx': ID 31 → 53
Updated 'Router und Telefon.pdf': ID 32 → 50
Updated 'Druckerpatrone.pdf': ID 33 → 51
Updated '88667.PDF': ID 34 → 52
Updated 'Alles.xlsx': ID 35 → 53
Updated '04-2021.pdf': ID 36 → 54
Updated '12-2021.pdf': ID 37 → 55
Updated '08-2021.pdf