In [None]:
# Install the `ktoolbox` package, which provides the `ktoolbox` command used below.
!pip install ktoolbox
# Create the download directory, change into it, and run `ktoolbox sync-creator`
# for the creator URL. The steps are chained with `&&`, so each one runs only
# if the previous one succeeded.
# NOTE(review): presumably the command writes its downloads into the current
# directory (the next cell walks /kaggle/working/ktoolbox) — confirm against
# the ktoolbox documentation.
!mkdir -p /kaggle/working/ktoolbox && \
cd /kaggle/working/ktoolbox && \
ktoolbox sync-creator https://kemono.su/fanbox/user/5850450


In [None]:
import os
import shutil

# Flatten every image found anywhere under `src_dir` into the single folder
# `dst_dir`, so later cells can work on one flat directory.
src_dir = '/kaggle/working/ktoolbox'
dst_dir = '/kaggle/working/work'

os.makedirs(dst_dir, exist_ok=True)

# File extensions (lower-case, including the dot) that count as images.
valid_exts = ['.jpg', '.jpeg', '.png', '.webp', '.bmp', '.tiff']

for root, _, files in os.walk(src_dir):
    for file in files:
        stem, raw_ext = os.path.splitext(file)
        ext = raw_ext.lower()
        if ext not in valid_exts:
            continue
        src_path = os.path.join(root, file)
        dst_path = os.path.join(dst_dir, file)
        # Files from different sub-folders can share a base name. Moving onto
        # an existing file may silently overwrite it, so pick the first free
        # "<stem>_<n><ext>" name instead of losing the earlier image.
        collision_index = 1
        while os.path.exists(dst_path):
            dst_path = os.path.join(dst_dir, f"{stem}_{collision_index}{raw_ext}")
            collision_index += 1
        shutil.move(src_path, dst_path)


In [None]:
import os
from PIL import Image, ImageChops

def is_color_image(img, threshold=10):
    """Decide whether an image's colour channels differ noticeably.

    The image is converted to RGB when it is in any other mode. For each
    channel pair (R/G, R/B, G/B) the mean of the per-pixel difference image
    is computed from its histogram.

    Args:
        img: Image object exposing ``mode``, ``convert`` and ``split``
            (the caller in this file passes the result of ``Image.open``).
        threshold: Mean channel difference that must be exceeded.

    Returns:
        True if at least one pair's mean difference is strictly greater
        than ``threshold``; False otherwise.
    """
    rgb = img if img.mode == 'RGB' else img.convert('RGB')
    red, green, blue = rgb.split()

    # Build all three difference images first, then reduce each to its mean.
    pair_diffs = (
        ImageChops.difference(red, green),
        ImageChops.difference(red, blue),
        ImageChops.difference(green, blue),
    )

    def average_level(diff_img):
        # histogram()[v] is the number of pixels whose difference equals v.
        counts = diff_img.histogram()
        pixel_count = sum(counts)
        if not pixel_count:
            return 0
        weighted_sum = sum(level * counts[level] for level in range(256))
        return weighted_sum / pixel_count

    averages = [average_level(diff) for diff in pair_diffs]
    return any(average > threshold for average in averages)

def clean_work_folder(work_dir='/kaggle/working/work'):
    """Delete unwanted files from ``work_dir``, keeping only usable images.

    A file is kept only if all of the following hold:
      * its name ends with .jpg, .jpeg or .png (case-insensitive),
      * it can be opened as an image,
      * it is at least 256 pixels wide and 256 pixels high,
      * ``is_color_image`` returns True for it.
    Everything else is removed. Sub-directories are left untouched.

    Args:
        work_dir: Directory to clean. Defaults to the notebook's working
            folder, so existing zero-argument calls behave as before.

    Raises:
        OSError: If ``work_dir`` cannot be listed or a file cannot be removed.
    """
    for file in os.listdir(work_dir):
        path = os.path.join(work_dir, file)
        if os.path.isdir(path):
            # os.remove cannot delete a directory; skipping avoids aborting
            # the whole cleanup on a stray sub-folder.
            continue
        if not file.lower().endswith(('.jpg', '.jpeg', '.png')):
            os.remove(path)
            continue
        try:
            # Decide while the file is open, delete only after it is closed.
            with Image.open(path) as img:
                keep = (
                    img.width >= 256
                    and img.height >= 256
                    and is_color_image(img)
                )
        except Exception as e:
            # Unreadable or corrupt images are reported and then discarded.
            print(f"处理失败: {path}，错误：{e}")
            keep = False
        if not keep and os.path.exists(path):
            os.remove(path)

# Entry point: run the cleanup only when this code executes as the main
# module, not when it is imported from elsewhere.
if __name__ == '__main__':
    clean_work_folder()


In [None]:
import os
from PIL import Image
import imagehash
from tqdm import tqdm

# Folder holding the images to de-duplicate (a fixed working directory).
image_folder = '/kaggle/working/work'
# Perceptual hash function; subtracting two of its hashes gives the distance
# compared against `threshold` below.
hash_func = imagehash.phash
# Largest hash distance at which two images are still treated as duplicates.
threshold = 8

hash_dict = {}        # filename -> hash
visited = set()       # filenames already assigned to a group
similar_groups = []   # groups of two or more mutually similar filenames

print("计算图片哈希中...")
# Sorted so that the file kept from each group does not depend on the
# arbitrary order returned by os.listdir.
for filename in tqdm(sorted(os.listdir(image_folder))):
    if not filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.webp')):
        continue
    filepath = os.path.join(image_folder, filename)
    try:
        with Image.open(filepath) as img:
            h = hash_func(img)
            hash_dict[filename] = h
    except Exception as e:
        # Still best-effort, but report the file instead of hiding the error.
        print(f"处理失败: {filepath}，错误：{e}")
        continue

print("查找相似图片中...")
filenames = list(hash_dict.keys())
# Greedy grouping: each not-yet-grouped file becomes an anchor and collects
# every later ungrouped file whose hash is within `threshold` of the anchor.
for i, anchor in enumerate(tqdm(filenames)):
    if anchor in visited:
        continue
    group = [anchor]
    visited.add(anchor)
    for candidate in filenames[i + 1:]:
        if candidate in visited:
            continue
        if hash_dict[anchor] - hash_dict[candidate] <= threshold:
            group.append(candidate)
            visited.add(candidate)
    if len(group) > 1:
        similar_groups.append(group)

print(f"发现 {len(similar_groups)} 组相似图片，开始删除...")
for group in similar_groups:
    # Keep the first file of every group and delete the rest.
    for filename in group[1:]:
        try:
            os.remove(os.path.join(image_folder, filename))
        except Exception as e:
            print(f"删除失败: {filename}，错误：{e}")
            continue

print("去重完成。")


In [None]:
import shutil
# Remove the raw download tree. ignore_errors=True keeps a re-run (or a run
# where the directory was never created) from raising before the archive
# below is built.
shutil.rmtree("/kaggle/working/ktoolbox", ignore_errors=True)
# Pack the cleaned image folder into /kaggle/working/result.zip.
shutil.make_archive('/kaggle/working/result', 'zip', '/kaggle/working/work')
