diff --git a/Duplicate Finder/Readme.md b/Duplicate Finder/Readme.md
index 14055ff..ee74bef 100644
--- a/Duplicate Finder/Readme.md
+++ b/Duplicate Finder/Readme.md
@@ -2,6 +2,7 @@
 
 This script scans a given directory for duplicate files based on their MD5 hash. It provides options to delete or move the duplicate files to another directory.
 
+The metadata script scans a given directory and finds duplicate files based on MD5 hash, file size, and other metadata. It does not include the move and delete features.
 ## Features
 
 - Scan a directory recursively for duplicate files.
diff --git a/Duplicate Finder/metadata-duplicate-finder.py b/Duplicate Finder/metadata-duplicate-finder.py
new file mode 100644
index 0000000..5a8e181
--- /dev/null
+++ b/Duplicate Finder/metadata-duplicate-finder.py
@@ -0,0 +1,80 @@
+"""Find duplicate files in a folder by size, modification time and MD5 hash."""
+import os
+import hashlib
+from collections import defaultdict
+
+
+def find_duplicates(folder_path):
+    """Print and return groups of duplicate files found under folder_path.
+
+    Files are grouped by the tuple from get_file_metadata(), so two files
+    count as duplicates only when size, modification time and content
+    hash all match. Files whose metadata cannot be read are skipped.
+
+    Returns a list of groups; each group is a list of two or more paths.
+    """
+    file_metadata = defaultdict(list)
+
+    for root, _, file_names in os.walk(folder_path):
+        for file_name in file_names:
+            file_path = os.path.join(root, file_name)
+            metadata = get_file_metadata(file_path)
+            if metadata is not None:
+                file_metadata[metadata].append(file_path)
+
+    duplicates = [paths for paths in file_metadata.values() if len(paths) > 1]
+
+    if not duplicates:
+        print("No duplicates found.")
+    else:
+        for index, paths in enumerate(duplicates, start=1):
+            print(f"\nDuplicate Group {index}:")
+            for path in paths:
+                print(f" {path}")
+
+    return duplicates
+
+
+def get_file_metadata(file_path):
+    """Return (size, modification time, MD5 hex digest) for file_path.
+
+    Returns None, after printing a message, when the file cannot be
+    stat'ed or read, so callers never group files by a missing hash.
+    """
+    try:
+        file_size = os.path.getsize(file_path)
+        mod_time = os.path.getmtime(file_path)
+    except OSError as e:
+        print(f"Error accessing file {file_path}: {e}")
+        return None
+
+    file_hash = calculate_file_hash(file_path)
+    if file_hash is None:
+        # calculate_file_hash() already reported the read error.
+        return None
+    return (file_size, mod_time, file_hash)
+
+
+def calculate_file_hash(file_path, hash_algo=hashlib.md5):
+    """Return the hex digest of file_path, or None if it cannot be read.
+
+    The file is read in 8192-byte chunks so large files are not loaded
+    into memory at once. hash_algo is any hashlib-style constructor.
+    """
+    try:
+        hash_obj = hash_algo()
+        with open(file_path, 'rb') as file:
+            while chunk := file.read(8192):
+                hash_obj.update(chunk)
+        return hash_obj.hexdigest()
+    except OSError as e:
+        print(f"Error reading file {file_path}: {e}")
+        return None
+
+
+if __name__ == "__main__":
+    folder_path = input("Enter the path to the folder to scan for duplicates: ").strip()
+    if os.path.isdir(folder_path):
+        find_duplicates(folder_path)
+    else:
+        print("The specified path is not a valid directory.")