In [2]:
!pip install requests tqdm trimesh thingi10k numpy-stl numpy



In [2]:
import os
import requests
import zipfile
import shutil
import trimesh
import thingi10k
import numpy as np
from stl import mesh
from tqdm import tqdm
from urllib.parse import urlencode
import warnings
!pip install scipy



In [None]:
import os
import scipy.io
import requests
import zipfile
import shutil
import trimesh
import numpy as np
import thingi10k
from stl import mesh
from tqdm import tqdm
import warnings
from urllib.parse import urlencode
from huggingface_hub import login, snapshot_download

class DatasetsManager:
    """
    A manager class to download, process, and structure 3D datasets for ML tasks.
    Follows a consistent structure: data/[dataset]/[category]/[type]/[file]
    """
    def __init__(self, root_dir="data"):
        self.root_dir = root_dir
        self.thingi10k_dir = os.path.join(root_dir, "thingi10k")
        self.modelnet_dir = os.path.join(root_dir, "ModelNet40")
        self.abc_dir = os.path.join(root_dir, "abc_dataset")
        self.objectnet_dir = os.path.join(root_dir, "objectnet3d")
        self.shapenet_dir = os.path.join(root_dir, "shapenet")
        self.custom_data_dir = os.path.join(root_dir, "custom_dataset")
        
        os.makedirs(self.root_dir, exist_ok=True)
        print(f"DatasetsManager initialized. All data will be stored in '{self.root_dir}'")

    def prepare_all_datasets(self):
        print("\n--- Starting all dataset preparation processes ---")
        self.prepare_thingi10k()
        self.prepare_modelnet40()
        self.prepare_abc_dataset()
        self.prepare_objectnet3d()
        self.prepare_shapenet()
        self.prepare_custom_dataset()
        print("\n--- All dataset preparation processes are complete! ---")
    
    # ... [The remaining methods (prepare_thingi10k, prepare_modelnet40, etc.) are unchanged] ...
    def prepare_thingi10k(self):
        print("\n[1/6] Processing Thingi10k...")
        models_out_dir = os.path.join(self.thingi10k_dir, "models")
        os.makedirs(models_out_dir, exist_ok=True)

        warnings.filterwarnings("ignore", category=UserWarning, module="thingi10k")
        try:
            thingi10k.init()
        except Exception as e:
            print(f"Could not initialize Thingi10k dataset. Maybe servers are down? Error: {e}")
            return

        for entry in tqdm(thingi10k.dataset(), desc="Converting Thingi10k"):
            file_id = entry["file_id"]
            output_filepath = os.path.join(models_out_dir, f"{file_id}.stl")
            if os.path.exists(output_filepath):
                continue
            
            try:
                with np.load(entry["file_path"]) as data:
                    vertices = np.asarray(data["vertices"], dtype=np.float64)
                    facets = np.asarray(data["facets"], dtype=np.int32)
                
                if vertices.shape[0] < 3 or facets.shape[0] == 0: continue

                mesh_data = vertices[facets]
                stl_mesh = mesh.Mesh(np.zeros(mesh_data.shape[0], dtype=mesh.Mesh.dtype))
                stl_mesh.vectors = mesh_data
                stl_mesh.save(output_filepath)
            except Exception as e:
                print(f"Skipping Thingi10k file_id {file_id}. Error: {e}")
        print("Thingi10k preparation complete.")

    def prepare_modelnet40(self):
        print("\n[2/6] Processing ModelNet40...")
        url = "http://modelnet.cs.princeton.edu/ModelNet40.zip"
        zip_path = os.path.join(self.root_dir, "ModelNet40.zip")
        
        if os.path.exists(self.modelnet_dir) and any(os.scandir(self.modelnet_dir)):
            is_processed = True
            for item in os.scandir(self.modelnet_dir):
                if item.is_dir():
                    # Check if 'train' or 'test' folders are gone
                    if 'train' in os.listdir(item.path) or 'test' in os.listdir(item.path):
                        is_processed = False
                        break
            if is_processed:
                print("ModelNet40 appears to be processed already. Skipping.")
                return

        if not os.path.exists(os.path.join(self.root_dir, "ModelNet40")):
             self._download_file(url, zip_path)
             print(f"Extracting {zip_path}...")
             with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                 zip_ref.extractall(self.root_dir)
             os.remove(zip_path)

        for category_d in tqdm(os.scandir(self.modelnet_dir), desc="Processing ModelNet40 categories"):
            if not category_d.is_dir() or category_d.name.startswith('__'):
                continue
            
            for subfolder in ["train", "test"]:
                subfolder_path = os.path.join(category_d.path, subfolder)
                if not os.path.exists(subfolder_path):
                    continue
                
                for off_file in os.scandir(subfolder_path):
                    if off_file.name.endswith(".off"):
                        stl_filename = off_file.name.replace(".off", ".stl")
                        stl_filepath = os.path.join(category_d.path, stl_filename)
                        if os.path.exists(stl_filepath):
                            continue
                        try:
                            mesh = trimesh.load_mesh(off_file.path)
                            mesh.export(stl_filepath)
                        except Exception as e:
                            print(f"Failed to process {off_file.path}: {e}")
                shutil.rmtree(subfolder_path)
        print("ModelNet40 preparation complete.")
    
    def prepare_abc_dataset(self):
        """[3/6] Prepares ABC Dataset: downloads, extracts, filters for '512' .obj, and converts to .stl."""
        print("\n[3/6] Processing ABC Dataset...")
        url = "https://archive.nyu.edu/retrieve/120666/abc_full_100k_v00.zip"
        zip_path = os.path.join(self.root_dir, "abc_full_100k_v00.zip")
        
        # Correct path to the extracted folder, as seen in your screenshot.
        extracted_path = os.path.join(self.root_dir, "100k") 
        
        models_out_dir = os.path.join(self.abc_dir, "models")
        os.makedirs(models_out_dir, exist_ok=True)

        if os.path.exists(models_out_dir) and len(os.listdir(models_out_dir)) > 1000:
             print("ABC Dataset appears to be processed already. Skipping.")
             return

        if not os.path.exists(extracted_path):
            self._download_file(url, zip_path)
            print(f"Extracting {zip_path} (this may take a while)...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(self.root_dir)
            os.remove(zip_path)

        # Paths to the '512' folders are built from the correct base path '100k'.
        dirs_to_process = [
            os.path.join(extracted_path, "train", "512"),
            os.path.join(extracted_path, "test", "512")
        ]

        for source_dir in tqdm(dirs_to_process, desc="Processing ABC splits"):
            if not os.path.exists(source_dir):
                print(f"Warning: Source directory not found: {source_dir}")
                continue
            
            for obj_file in tqdm(os.scandir(source_dir), desc=f"Converting from {os.path.basename(source_dir)}", leave=False):
                if not obj_file.name.endswith(".obj"):
                    continue
                
                stl_filename = obj_file.name.replace(".obj", ".stl")
                stl_filepath = os.path.join(models_out_dir, stl_filename)
                
                if os.path.exists(stl_filepath):
                    continue
                    
                try:
                    mesh = trimesh.load_mesh(obj_file.path)
                    mesh.export(stl_filepath)
                except Exception as e:
                    print(f"Failed to process {obj_file.path}: {e}")

        # Cleanup will now target the correct folder.
        print(f"Cleaning up original extracted folder: {extracted_path}")
        shutil.rmtree(extracted_path)
        print("ABC Dataset preparation complete.")
        
    def prepare_objectnet3d(self):
        """[4/6] Prepares ObjectNet3D: downloads, links images to models via .mat files, and converts to .stl."""
        print("\n[4/6] Processing ObjectNet3D...")
        
        os.makedirs(self.objectnet_dir, exist_ok=True)
        if len(os.listdir(self.objectnet_dir)) > 0:
            print("ObjectNet3D appears to be processed already. Skipping.")
            return

        temp_dir = os.path.join(self.root_dir, "ObjectNet3D_temp_processing")
        os.makedirs(temp_dir, exist_ok=True)

        try:
            urls = {
                "annotations": "ftp://cs.stanford.edu/cs/cvgl/ObjectNet3D/ObjectNet3D_annotations.zip",
                "cads": "ftp://cs.stanford.edu/cs/cvgl/ObjectNet3D/ObjectNet3D_cads.zip",
                "images": "ftp://cs.stanford.edu/cs/cvgl/ObjectNet3D/ObjectNet3D_images.zip"
            }
            unpacked_base_path = os.path.join(temp_dir, "ObjectNet3D")

            for name, url in urls.items():
                if name == "annotations" and os.path.exists(os.path.join(unpacked_base_path, "Annotations")):
                     print(f"{name} data already exists. Skipping download and extraction.")
                     continue
                
                zip_path = os.path.join(temp_dir, os.path.basename(url))
                self._download_file(url, zip_path)
                print(f"Extracting {name}...")
                with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                    zip_ref.extractall(temp_dir)
                os.remove(zip_path)

            print("Processing CAD models (.off -> .stl)...")
            source_cad_dir = os.path.join(unpacked_base_path, "CAD", "off")
            if os.path.exists(source_cad_dir):
                for category_dir in tqdm(os.scandir(source_cad_dir), desc="Converting CAD categories"):
                    if not category_dir.is_dir(): continue
                    
                    category_name = category_dir.name
                    dest_model_dir = os.path.join(self.objectnet_dir, category_name, "models")
                    os.makedirs(dest_model_dir, exist_ok=True)
                    
                    for off_file in os.scandir(category_dir.path):
                        if not (off_file.name.endswith(".off") and len(off_file.name.split('.')[0]) == 6): continue
                        
                        stl_filename = off_file.name.replace(".off", ".stl")
                        stl_filepath = os.path.join(dest_model_dir, stl_filename)
                        
                        try:
                            mesh = trimesh.load_mesh(off_file.path)
                            mesh.export(stl_filepath)
                        except Exception as e:
                            print(f"Failed to process CAD {off_file.path}: {e}")

            print("Linking images to categories using .mat annotations...")
            source_ann_dir = os.path.join(unpacked_base_path, "Annotations")
            source_img_dir = os.path.join(unpacked_base_path, "Images")
            
            if os.path.exists(source_ann_dir) and os.path.exists(source_img_dir):
                for mat_file in tqdm(os.scandir(source_ann_dir), desc="Processing annotations"):
                    if not (mat_file.name.endswith(".mat") and len(mat_file.name.split('.')[0]) == 6): continue
                    
                    try:
                        mat = scipy.io.loadmat(mat_file.path)
                        record = mat['record'][0, 0]
                        img_filename = str(record['filename'][0])
                        
                        category_name = str(record['objects'][0, 0]['class'][0])
                        
                        source_img_path = os.path.join(source_img_dir, img_filename)
                        if not os.path.exists(source_img_path): continue
                            
                        dest_img_dir = os.path.join(self.objectnet_dir, category_name, "images")
                        os.makedirs(dest_img_dir, exist_ok=True)
                        dest_img_path = os.path.join(dest_img_dir, os.path.basename(source_img_path))

                        shutil.copy(source_img_path, dest_img_path)
                    except Exception as e:
                        print(f"Could not process annotation {mat_file.name}. Error: {e}")

        finally:
            print(f"Cleaning up temporary directory: {temp_dir}")
            if os.path.exists(temp_dir):
                shutil.rmtree(temp_dir)
        print("ObjectNet3D preparation complete.")
        
    # --- FINALLY CORRECTED METHOD FOR SHAPENET ---
    def prepare_shapenet(self):
        """[5/6] Prepares ShapeNetCore: downloads from HF, filters, and structures models with their screenshots."""
        print("\n[5/6] Processing ShapeNetCore from Hugging Face...")
        os.makedirs(self.shapenet_dir, exist_ok=True)
        
        hf_token = os.environ.get("HF_TOKEN")
        if not hf_token:
            print("Error: Hugging Face token not found. Please set the HF_TOKEN environment variable.")
            return
            
        try:
            login(token=hf_token)
            print("Hugging Face login successful.")
            repo_id = "ShapeNet/ShapeNetCore"
            hf_cache_dir = snapshot_download(repo_id=repo_id, repo_type="dataset")
            print(f"ShapeNet data available in cache: {hf_cache_dir}")
        except Exception as e:
            print(f"Failed to download from Hugging Face Hub: {e}")
            return
            
        temp_obj_path = os.path.join(self.root_dir, "shapenet_temp_model.obj")

        for zip_filename in tqdm(os.listdir(hf_cache_dir), desc="Processing ShapeNet Categories"):
            if not zip_filename.endswith('.zip'):
                continue
            
            category_id = zip_filename.replace('.zip', '')
            category_path = os.path.join(self.shapenet_dir, category_id)
            
            # --- FIX START ---
            # This is the new, robust check.
            # We check if the final destination 'models' folder exists AND is not empty.
            dest_models_dir = os.path.join(category_path, "models")
            if os.path.exists(dest_models_dir) and len(os.listdir(dest_models_dir)) > 0:
                # print(f"Skipping already processed category: {category_id}")
                continue
            # --- FIX END ---
            
            dest_images_dir = os.path.join(category_path, "images")
            os.makedirs(dest_models_dir, exist_ok=True)
            os.makedirs(dest_images_dir, exist_ok=True)
            
            zip_filepath = os.path.join(hf_cache_dir, zip_filename)
            
            try:
                with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
                    all_files = zip_ref.namelist()
                    
                    model_ids = sorted(list(set(
                        [f.split('/')[1] for f in all_files if f.count('/') >= 2]
                    )))
                    print(model_ids)


                    for model_id in tqdm(model_ids, desc=f"Models in {category_id}", leave=False):
                        internal_prefix = f"{category_id}/{model_id}"
                        screenshot_path_prefix = f"{internal_prefix}/screenshots/"

                        if not any(f.startswith(screenshot_path_prefix) for f in all_files):
                            continue
                        
                        model_path_prefix = f"{internal_prefix}/models/"
                        model_file_path_norm = f"{model_path_prefix}model_normalized.obj"
                        
                        model_file_path = model_file_path_norm if model_file_path_norm in all_files else next((f for f in all_files if f.startswith(model_path_prefix) and f.endswith(".obj")), None)
                        
                        if not model_file_path:
                            continue

                        stl_filepath = os.path.join(dest_models_dir, f"{model_id}.stl")
                        if os.path.exists(stl_filepath):
                            continue

                        try:
                            with zip_ref.open(model_file_path) as source_file, open(temp_obj_path, "wb") as target_file:
                                shutil.copyfileobj(source_file, target_file)
                            
                            mesh = trimesh.load_mesh(temp_obj_path)
                            mesh.export(stl_filepath)
                        finally:
                            if os.path.exists(temp_obj_path):
                                os.remove(temp_obj_path)

                        screenshot_files = [f for f in all_files if f.startswith(screenshot_path_prefix) and (f.endswith('.png') or f.endswith('.jpg'))]
                        for i, screenshot_file_path in enumerate(screenshot_files):
                            ext = ".png" if screenshot_file_path.endswith(".png") else ".jpg"
                            img_dest_path = os.path.join(dest_images_dir, f"{model_id}_{i}{ext}")
                            
                            with zip_ref.open(screenshot_file_path) as source, open(img_dest_path, "wb") as target:
                                shutil.copyfileobj(source, target)
            except Exception as e:
                print(f"An error occurred while processing {zip_filename}: {e}")
        
        print("ShapeNetCore preparation complete.")

    def prepare_custom_dataset(self):
        """[6/6] Downloads and structures the custom dataset from Yandex.Disk."""
        print("\n[6/6] Processing Custom Dataset from Yandex.Disk...")
        os.makedirs(self.custom_data_dir, exist_ok=True)
        
        if os.path.exists(self.custom_data_dir) and len(os.listdir(self.custom_data_dir)) > 0:
            print("Custom dataset appears to be prepared. Skipping.")
            return

        files_to_download = {
            "train_data": "https://disk.yandex.ru/d/RRXJu9ZtEmSXzQ",
            "test_data": "https://disk.yandex.ru/d/TmbB7BsGzg1dQQ",
        }
        for key, val in files_to_download.items():
            self.download_and_unzip_yandex_disk(val, self.custom_data_dir)

        print("Custom dataset preparation complete.")
        
    def _download_file(self, url, filename):
        print(f"Downloading {url} to {os.path.basename(filename)}...")
        try:
            if url.startswith('ftp://'):
                import urllib.request
                urllib.request.urlretrieve(url, filename)
                print(f"FTP download of {os.path.basename(filename)} complete.")
            else:
                with requests.get(url, stream=True) as r:
                    r.raise_for_status()
                    total_size = int(r.headers.get('content-length', 0))
                    with open(filename, 'wb') as f, tqdm(
                        total=total_size, unit='iB', unit_scale=True, desc=os.path.basename(filename)
                    ) as bar:
                        for chunk in r.iter_content(chunk_size=8192):
                            size = f.write(chunk)
                            bar.update(size)
        except Exception as e:
            print(f"Error downloading file {url}: {e}")
            if os.path.exists(filename): os.remove(filename)
            raise

    def download_and_unzip_yandex_disk(self, public_url, extract_to_folder):
        print(f"Starting to process Yandex.Disk link: {public_url}")
        base_api_url = "https://cloud-api.yandex.net/v1/disk/public/resources/download?"
        api_url = base_api_url + urlencode(dict(public_key=public_url))
        try:
            response = requests.get(api_url)
            response.raise_for_status()
            download_url = response.json().get("href")
            if not download_url:
                print(f"Error: Could not get a direct download link for {public_url}"); return

            zip_filename = os.path.join(self.root_dir, "temp_yandex_download.zip")
            self._download_file(download_url, zip_filename)

            with zipfile.ZipFile(zip_filename, "r") as zip_ref:
                zip_ref.extractall(extract_to_folder)
            
            os.remove(zip_filename)
        except requests.exceptions.RequestException as e:
            print(f"A network or API error occurred: {e}")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")

# Entry point for this cell: build a manager rooted at "my_3d_datasets" and run
# only the ShapeNet step (the other datasets are not prepared here).
# `manager` is left in the notebook namespace, so later cells can call other steps on it.
if __name__ == "__main__":
    manager = DatasetsManager(root_dir="my_3d_datasets")
    manager.prepare_shapenet()

DatasetsManager initialized. All data will be stored in 'my_3d_datasets'

[5/6] Processing ShapeNetCore from Hugging Face...


Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


Hugging Face login successful.


Fetching 58 files:   0%|          | 0/58 [00:00<?, ?it/s]

ShapeNet data available in cache: /home/dima/.cache/huggingface/hub/datasets--ShapeNet--ShapeNetCore/snapshots/0efb24cbe6828a85771a28335c5f7b5626514d9b


Processing ShapeNet Categories: 100%|██████████| 58/58 [05:33<00:00,  5.75s/it]  

ShapeNetCore preparation complete.



