added a script to check duplicate files using metadata, also updated the Readme.md #266

Open · wants to merge 1 commit into master
1 change: 1 addition & 0 deletions Duplicate Finder/Readme.md
@@ -2,6 +2,7 @@

This script scans a given directory for duplicate files based on their MD5 hash. It provides options to delete or move the duplicate files to another directory.

The metadata script scans a given directory and finds duplicate files based on MD5 hash, file size, and modification time; unlike the main script, it does not include the move and delete features. A sample session is shown below.
## Features

- Scan a directory recursively for duplicate files.
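
For illustration, a run of the metadata script might look like the following. The folder and file paths are placeholders; the script prompts for the directory interactively and prints each group of matching files.

```
$ python "Duplicate Finder/metadata-duplicate-finder.py"
Enter the path to the folder to scan for duplicates: /path/to/photos

Duplicate Group 1:
 /path/to/photos/img_001.jpg
 /path/to/photos/backup/img_001.jpg
```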
54 changes: 54 additions & 0 deletions Duplicate Finder/metadata-duplicate-finder.py
@@ -0,0 +1,54 @@
import os
import hashlib
from collections import defaultdict

def find_duplicates(folder_path):
    """Walk folder_path and report groups of files sharing the same metadata."""
    file_metadata = defaultdict(list)

    for root, _, files in os.walk(folder_path):
        for file_name in files:
            file_path = os.path.join(root, file_name)
            metadata = get_file_metadata(file_path)
            if metadata:
                file_metadata[metadata].append(file_path)

    # Any metadata key shared by more than one path is treated as a duplicate group.
    duplicates = [files for files in file_metadata.values() if len(files) > 1]

    if not duplicates:
        print("No duplicates found.")
    else:
        for index, files in enumerate(duplicates, start=1):
            print(f"\nDuplicate Group {index}:")
            for file in files:
                print(f" {file}")

def get_file_metadata(file_path):
    """Return (size, modification time, MD5 hash) for file_path, or None on error."""
    try:
        file_size = os.path.getsize(file_path)
        mod_time = os.path.getmtime(file_path)
        file_hash = calculate_file_hash(file_path)
        return (file_size, mod_time, file_hash)
    except (OSError, IOError) as e:
        print(f"Error accessing file {file_path}: {e}")
        return None

def calculate_file_hash(file_path, hash_algo=hashlib.md5):
    """Hash the file in 8 KB chunks so large files are not read into memory at once."""
    try:
        hash_obj = hash_algo()
        with open(file_path, 'rb') as file:
            while chunk := file.read(8192):
                hash_obj.update(chunk)
        return hash_obj.hexdigest()
    except (OSError, IOError) as e:
        print(f"Error reading file {file_path}: {e}")
        return None



if __name__ == "__main__":
    folder_path = input("Enter the path to the folder to scan for duplicates: ").strip()
    if os.path.isdir(folder_path):
        find_duplicates(folder_path)
    else:
        print("The specified path is not a valid directory.")