In [1]:
import os
import librosa
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import soundfile as sf
from collections import defaultdict
import hashlib
from pydub.utils import mediainfo

In [2]:
def aggregate_audio_properties(directory):
    """
    Print a summary of sample rates and channel layouts for the audio files
    found under a directory.

    The directory is walked recursively. Every file whose name ends in
    .wav, .flac, .mp3 or .ogg (case-insensitive) is inspected, and the
    following is printed: the number of files inspected, the number of files
    per sample rate, and how many files are mono, stereo, or have any other
    channel count.

    Only each file's metadata is queried (via ``soundfile.info``); the audio
    samples themselves are not decoded, which keeps the scan cheap on large
    datasets.

    Files that cannot be opened are reported on stdout and left out of every
    count.

    Args:
        directory: Root folder to scan.

    Returns:
        None. The summary is printed rather than returned.
    """
    audio_extensions = ('.wav', '.flac', '.mp3', '.ogg')

    sample_rate_counts = defaultdict(int)
    mono_count = 0
    stereo_count = 0
    other_channels_count = 0
    file_count = 0

    # Iterate through all files in the directory (and subdirectories)
    for root, _, files in os.walk(directory):
        for file in files:
            # Process only common audio file types
            if not file.lower().endswith(audio_extensions):
                continue

            file_path = os.path.join(root, file)
            try:
                # Metadata only: avoids loading the whole signal into memory
                # just to learn its sample rate and channel count.
                info = sf.info(file_path)
                samplerate = info.samplerate
                channels = info.channels
            except Exception as e:
                # Best effort: one unreadable file must not abort the scan.
                print(f"Error reading {file_path}: {e}")
                continue

            file_count += 1
            sample_rate_counts[samplerate] += 1

            if channels == 1:
                mono_count += 1
            elif channels == 2:
                stereo_count += 1
            else:
                other_channels_count += 1

    # Print aggregated results (sample rates in ascending order so the
    # report does not depend on directory traversal order)
    print(f"Total files processed: {file_count}")
    print("Sample Rate Distribution:")
    for sr, count in sorted(sample_rate_counts.items()):
        print(f"  {sr} Hz: {count} file(s)")
    print(f"Mono files: {mono_count}")
    print(f"Stereo files: {stereo_count}")
    print(f"Other channels: {other_channels_count}")

In [4]:
# Dataset root to profile (hardcoded absolute local path).
# NOTE(review): a later cell reassigns `directory` to a nested
# ".../for-norm/for-norm" path, so its value afterwards depends on which
# cell was executed last.
directory = "C:/Users/sassi/Documents/DeepFakePhishing/Data/for-norm" 
# Walk the tree and print the sample-rate / channel summary.
aggregate_audio_properties(directory)

Total files processed: 69300
Sample Rate Distribution:
  16000 Hz: 69300 file(s)
Mono files: 69300
Stereo files: 0
Other channels: 0


In [None]:
def hash_file(filepath, block_size=65536):
    """
    Return the hexadecimal MD5 digest of a file's contents.

    The file is consumed in pieces of ``block_size`` bytes so large files
    are never held in memory all at once. If opening or reading fails, the
    error is printed and None is returned instead of a digest.
    """
    digest = hashlib.md5()
    try:
        with open(filepath, 'rb') as stream:
            # iter() with a sentinel keeps calling read() until it yields b''.
            for chunk in iter(lambda: stream.read(block_size), b''):
                digest.update(chunk)
    except Exception as e:
        print(f"Error reading {filepath}: {e}")
        return None
    return digest.hexdigest()

def find_duplicates(directory):
    """
    Group every file under ``directory`` (subdirectories included) by the
    digest that hash_file() reports for it.

    Returns a dict mapping each digest to the list of paths that produced
    it. Digests with a single path are included too; callers decide which
    groups count as duplicates. Files for which hash_file() returns None
    are left out.
    """
    # Flatten the walk into one lazy stream of full paths.
    all_paths = (
        os.path.join(dirpath, name)
        for dirpath, _, names in os.walk(directory)
        for name in names
    )

    groups = {}
    for path in all_paths:
        digest = hash_file(path)
        if digest is None:
            # Unreadable file: hash_file() has already reported it.
            continue
        if digest in groups:
            groups[digest].append(path)
        else:
            groups[digest] = [path]
    return groups


# Root folder scanned for byte-identical files (hardcoded; edit for your machine)
directory = "C:/Users/sassi/Documents/DeepFakePhishing/Data/for-norm/for-norm"

duplicates = find_duplicates(directory)

print("Duplicate Files Found:")
duplicate_count = 0
for file_hash, files in duplicates.items():
    if len(files) <= 1:
        continue  # a hash seen only once has no duplicate
    print(f"\nHash: {file_hash}")
    for f in files:
        print(f"  {f}")
    # The first path counts as the original; every further one is a duplicate
    duplicate_count += len(files) - 1

print(f"\nTotal number of duplicate files: {duplicate_count}")

Duplicate Files Found:

Total number of duplicate files: 0


In [10]:
def find_empty_files(directory):
    """
    Walk through the directory (including subdirectories),
    and find files with 0 bytes size.

    Entries whose size cannot be determined (for example a broken symlink,
    a permission error, or a file removed while the scan is running) are
    reported on stdout and skipped, so one bad entry does not abort the
    whole scan.

    Args:
        directory: Root folder to scan.

    Returns:
        A list with the full path of every zero-byte file found.
    """
    empty_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            filepath = os.path.join(root, file)
            try:
                size = os.path.getsize(filepath)
            except OSError as e:
                # Same reporting convention as the other scanning helpers.
                print(f"Error reading {filepath}: {e}")
                continue
            if size == 0:
                empty_files.append(filepath)
    return empty_files

# Scan for zero-byte files.
# NOTE(review): `directory` is not set in this cell; it holds whatever the
# most recently executed cell assigned (two different paths are assigned
# earlier in this notebook), so confirm which tree is being scanned.
empty_files = find_empty_files(directory)

# Report the count, then one path per line.
print(f"Found {len(empty_files)} empty files:")
for empty_file in empty_files:
    print(empty_file)

Found 0 empty files:
