In [2]:
!pip install requests tqdm trimesh thingi10k numpy-stl numpy



In [None]:
import os
import requests
import zipfile
import shutil
import trimesh
import thingi10k
import numpy as np
from stl import mesh
from tqdm import tqdm
from urllib.parse import urlencode
import warnings

In [None]:

class DatasetsManager:
    """
    A manager class to download, process, and structure 3D datasets for ML tasks.
    """
    def __init__(self, root_dir="data"):
        """
        Initializes the manager with a root directory for all data.
        """
        self.root_dir = root_dir
        self.thingi10k_dir = os.path.join(root_dir, "thingi10k")
        self.modelnet_dir = os.path.join(root_dir, "ModelNet40")
        self.custom_data_dir = os.path.join(root_dir, "custom_dataset")
        os.makedirs(self.root_dir, exist_ok=True)
        print(f"DatasetsManager initialized. All data will be stored in '{self.root_dir}'")

    def prepare_all_datasets(self):
        """Runs the preparation process for all configured datasets."""
        print("\n--- Starting all dataset preparation processes ---")
        self.prepare_thingi10k()
        self.prepare_modelnet40()
        self.prepare_custom_dataset()
        print("\n--- All dataset preparation processes are complete! ---")

    def prepare_thingi10k(self):
        """Prepares the Thingi10k dataset: converts .npz to .stl into a 'models' folder."""
        print("\n[1/3] Processing Thingi10k...")
        models_out_dir = os.path.join(self.thingi10k_dir, "models")
        os.makedirs(models_out_dir, exist_ok=True)

        warnings.filterwarnings("ignore", category=UserWarning, module="thingi10k")
        thingi10k.init()

        for entry in tqdm(thingi10k.dataset(), desc="Converting Thingi10k"):
            file_id = entry["file_id"]
            output_filepath = os.path.join(models_out_dir, f"{file_id}.stl")
            if os.path.exists(output_filepath):
                continue
            
            try:
                with np.load(entry["file_path"]) as data:
                    vertices = np.asarray(data["vertices"], dtype=np.float64)
                    facets = np.asarray(data["facets"], dtype=np.int32)
                
                mesh_data = vertices[facets]
                stl_mesh = mesh.Mesh(np.zeros(mesh_data.shape[0], dtype=mesh.Mesh.dtype))
                stl_mesh.vectors = mesh_data
                stl_mesh.save(output_filepath)
            except Exception as e:
                print(f"Skipping Thingi10k file_id {file_id}. Error: {e}")
        print("Thingi10k preparation complete.")

    def prepare_modelnet40(self):
        """Prepares ModelNet40: downloads, flattens structure, and converts .off to .stl."""
        print("\n[2/3] Processing ModelNet40...")
        url = "http://modelnet.cs.princeton.edu/ModelNet40.zip"
        zip_path = os.path.join(self.root_dir, "ModelNet40.zip")

        if not os.path.exists(self.modelnet_dir):
            self._download_file(url, zip_path)
            print(f"Extracting {zip_path}...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(self.root_dir)
            os.remove(zip_path)

        for category_d in tqdm(os.scandir(self.modelnet_dir), desc="Processing ModelNet40 categories"):
            if not category_d.is_dir() or category_d.name.startswith('__'):
                continue
            
            models_out_dir = os.path.join(category_d.path, "models")
            os.makedirs(models_out_dir, exist_ok=True)

            for subfolder in ["train", "test"]:
                subfolder_path = os.path.join(category_d.path, subfolder)
                if not os.path.exists(subfolder_path):
                    continue
                
                for off_file in os.scandir(subfolder_path):
                    if off_file.name.endswith(".off"):
                        stl_filename = off_file.name.replace(".off", ".stl")
                        stl_filepath = os.path.join(models_out_dir, stl_filename)
                        if os.path.exists(stl_filepath):
                            continue
                        try:
                            mesh = trimesh.load_mesh(off_file.path)
                            mesh.export(stl_filepath)
                        except Exception as e:
                            print(f"Failed to process {off_file.path}: {e}")
                shutil.rmtree(subfolder_path)
        print("ModelNet40 preparation complete.")
        
    def prepare_custom_dataset(self):
        """Downloads and structures the custom dataset from Yandex.Disk."""
        print("\n[3/3] Processing Custom Dataset from Yandex.Disk...")
        os.makedirs(self.custom_data_dir, exist_ok=True)
        
        files_to_download = {
            "train_data": "https://disk.yandex.ru/d/RRXJu9ZtEmSXzQ",
            "test_data": "https://disk.yandex.ru/d/TmbB7BsGzg1dQQ",
        }
        for key, val in files_to_download.items():
            self.download_and_unzip_yandex_disk(val, self.root_dir)

        print("Custom dataset preparation complete.")

    def _download_file(self, url, filename):
        """Helper to download a file with a progress bar."""
        print(f"Downloading {url}...")
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            total_size = int(r.headers.get('content-length', 0))
            with open(filename, 'wb') as f, tqdm(
                total=total_size, unit='iB', unit_scale=True, desc=filename.split('/')[-1]
            ) as bar:
                for chunk in r.iter_content(chunk_size=8192):
                    size = f.write(chunk)
                    bar.update(size)

    def download_and_unzip_yandex_disk(self, public_url, extract_to_folder):
        """
        Downloads and unzips a file from Yandex.Disk using a public link.

        :param public_url: Public URL to the file on Yandex.Disk.
        :param extract_to_folder: The folder where the archive will be extracted.
        """
        print(f"Starting to process link: {public_url}")

        # 1. Get the direct download link from the Yandex.Disk API
        base_api_url = "https://cloud-api.yandex.net/v1/disk/public/resources/download?"
        api_url = base_api_url + urlencode(dict(public_key=public_url))

        try:
            response = requests.get(api_url)
            response.raise_for_status()  # Check for HTTP errors
            download_url = response.json().get("href")

            if not download_url:
                print(f"Error: Could not get a direct download link for {public_url}")
                return

            print("Direct download link obtained.")

            # 2. Download the file
            print("Downloading file...")
            download_response = requests.get(download_url)
            download_response.raise_for_status()

            # Temporary name for the zip file
            zip_filename = "temp_download.zip"

            with open(zip_filename, "wb") as f:
                f.write(download_response.content)
            print(f"File '{zip_filename}' downloaded successfully.")

            # 3. Unzip the archive
            print(f"Unzipping archive to folder '{extract_to_folder}'...")
            if not os.path.exists(extract_to_folder):
                os.makedirs(extract_to_folder)

            with zipfile.ZipFile(zip_filename, "r") as zip_ref:
                zip_ref.extractall(extract_to_folder)
            print(f"Files successfully unzipped to '{extract_to_folder}'.")

            # 4. Remove the downloaded zip archive
            os.remove(zip_filename)
            print(f"Temporary file '{zip_filename}' has been removed.")

        except requests.exceptions.RequestException as e:
            print(f"A network or API error occurred: {e}")
            print("Tip: Make sure your environment can access 'cloud-api.yandex.net'.")
        except Exception as e:
            print(f"An unexpected error occurred: {e}")

if __name__ == "__main__":
    # --- How to use the manager ---
    # 1. Create an instance of the manager
    manager = DatasetsManager(root_dir="my_3d_datasets")

    # 2. Run the preparation for all datasets
    manager.prepare_all_datasets()

DatasetsManager initialized. All data will be stored in 'my_3d_datasets'

[3/3] Processing Custom Dataset from Yandex.Disk...
Starting to process link: https://disk.yandex.ru/d/RRXJu9ZtEmSXzQ
Direct download link obtained.
Downloading file...
File 'temp_download.zip' downloaded successfully.
Unzipping archive to folder 'my_3d_datasets'...
Files successfully unzipped to 'my_3d_datasets'.
Temporary file 'temp_download.zip' has been removed.
Starting to process link: https://disk.yandex.ru/d/TmbB7BsGzg1dQQ
Direct download link obtained.
Downloading file...
File 'temp_download.zip' downloaded successfully.
Unzipping archive to folder 'my_3d_datasets'...
Files successfully unzipped to 'my_3d_datasets'.
Temporary file 'temp_download.zip' has been removed.
Custom dataset preparation complete.
