In [1]:
import os
import shutil
import pandas as pd
import numpy as np
import cv2
import rasterio
import warnings

# Suppress warnings for cleaner output.
# NOTE(review): this ignores every warning category for the rest of the kernel
# session, not only the noisy ones -- numeric overflow and pandas
# chained-assignment warnings raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

# ==========================================
# 1. SETUP PATHS (EDIT THESE)
# ==========================================
# CSV_PATH and IMAGE_DIR are absolute, machine-specific locations and must be
# edited before running elsewhere. inspect_dataset() looks for each image at
# IMAGE_DIR/<Country>/<Country>_<year>_<MM>.tif.
# QUARANTINE_DIR is a relative path, so it is created under the kernel's
# current working directory.
CSV_PATH = 'C:/Users/FA004/Desktop/satimg2/data.csv'   # Path to your data.csv
IMAGE_DIR = 'C:/Users/FA004/Desktop/satimg2/images'    # Path to your images folder
QUARANTINE_DIR = 'corrupted_images_quarantine'         # Where bad images will be copied

# ==========================================
# 2. HELPER TO MOVE BAD FILES
# ==========================================
def quarantine_file(src_path, filename, reason, quarantine_dir=None):
    """Copy a problematic file into a per-reason quarantine subfolder and log why.

    The original file is copied (never moved or deleted), so the source
    dataset is left untouched.

    Args:
        src_path: Full path of the offending file. It may not exist, in which
            case only the log lines are printed.
        filename: Name given to the copy inside the quarantine subfolder.
        reason: Short label describing the failure. It is printed verbatim and,
            after sanitizing, used as the subfolder name.
        quarantine_dir: Root quarantine folder. Defaults to the module-level
            QUARANTINE_DIR when omitted.

    Returns:
        None. Copy failures are reported on stdout rather than raised.
    """
    if quarantine_dir is None:
        quarantine_dir = QUARANTINE_DIR

    # The reason may contain arbitrary text (e.g. an exception message with
    # ':', '/', '\\' or newlines). Keep only characters that are safe in a
    # single path component on every platform, and cap the length so a long
    # message cannot blow the path-length limit.
    cleaned = "".join(
        ch if (ch.isalnum() or ch in "-_.") else "_" for ch in str(reason)
    )
    # Leading/trailing dots are stripped so the name can never be '.' or '..'.
    safe_reason = cleaned[:80].strip(".") or "Unknown"

    # Create specific subfolder for the error type to stay organized
    # (exist_ok avoids the check-then-create race and creates the root too).
    error_folder = os.path.join(quarantine_dir, safe_reason)
    os.makedirs(error_folder, exist_ok=True)

    dst_path = os.path.join(error_folder, filename)

    print(f"❌ REJECTED: {filename} | Reason: {reason}")

    # Copy the file if it exists, otherwise just log it
    if os.path.exists(src_path):
        try:
            shutil.copy2(src_path, dst_path)
        except Exception as e:
            # Best-effort: a failed copy must not abort the whole inspection.
            print(f"   Could not copy file: {e}")
    else:
        print(f"   File path does not exist: {src_path}")

# ==========================================
# 3. MAIN INSPECTION LOOP
# ==========================================
def _find_image_defect(img_path):
    """Run the content checks on one raster and report the first failure found.

    Only the first band is read. Checks, in order: the file can be opened and
    read, the band contains no NaN/Inf values, and the band can be resized
    to 64x64 with OpenCV.

    Args:
        img_path: Path to an existing raster file.

    Returns:
        A ``(reason, detail)`` tuple. ``reason`` is a short, folder-safe label
        and ``detail`` is optional free text (the exception message) or None.
        ``(None, None)`` means the image passed every check.
    """
    try:
        with rasterio.open(img_path) as src:
            # Read the first channel
            image = src.read(1)

        # Defensive check kept from the original logic.
        if image is None:
            return "Image_Is_None", None

        # --- The Critical Corruption Checks ---
        has_nan = bool(np.isnan(image).any())
        has_inf = bool(np.isinf(image).any())
        if has_nan and has_inf:
            return "NaN_and_Infinity", None
        if has_nan:
            return "Contains_NaN", None
        if has_inf:
            return "Contains_Infinity", None

        # Optional: check that resize works on the decoded band.
        try:
            cv2.resize(image, (64, 64), interpolation=cv2.INTER_LINEAR)
        except Exception as e:
            # The exception text is returned separately instead of being
            # embedded in the reason: the reason becomes a folder name, and
            # raw error text (colons, slashes, newlines) is not a valid one.
            return "Resize_Failed", str(e)
    except Exception as e:
        # Catches rasterio opening errors (broken headers, unreadable files)
        # and any unexpected failure in the checks above.
        return "Read_Error_Corrupt_File", str(e)

    return None, None


def inspect_dataset():
    """Validate every image referenced by the CSV and quarantine the bad ones.

    Reads CSV_PATH, applies the same row filter, date parsing and sort order
    used to build the expected filenames, then checks each
    IMAGE_DIR/<Country>/<Country>_<year>_<MM>.tif. Missing or defective files
    are passed to quarantine_file(); a summary is printed at the end.

    Returns:
        None. All results are reported on stdout.
    """
    print(f"Reading CSV from: {CSV_PATH}")
    df = pd.read_csv(CSV_PATH)

    # --- Pre-processing to match filenames ---
    # Keep only rows with strictly positive energy use, population and area.
    # .copy() so the column assignments below act on an independent frame
    # rather than on a slice of the frame returned by read_csv.
    keep = (
        (df['Energy Use per Capita (kWh)'] > 0)
        & (df['Population'] > 0)
        & (df['Area (Sq. Km)'] > 0)
    )
    df = df[keep].copy()

    # Date Processing
    df['date'] = pd.to_datetime(df['Date (month/year)'])
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month

    # Sort by country, then chronologically
    df = df.sort_values(['Country', 'date']).reset_index(drop=True)

    print(f"Inspecting {len(df)} entries...")
    print("-" * 50)

    good_count = 0
    bad_count = 0

    # Only three columns are needed per row, so iterate them directly instead
    # of materialising a Series per row with iterrows().
    for country, year, month in zip(df['Country'], df['year'], df['month']):
        # --- Path Construction Logic ---
        filename = f"{country}_{year}_{month:02d}.tif"
        img_path = os.path.join(IMAGE_DIR, country, filename)

        # 1. Check File Existence
        if not os.path.exists(img_path):
            quarantine_file(img_path, filename, "File_Not_Found")
            bad_count += 1
            continue

        # 2. Check Image Content (Rasterio / NaN / Inf / resize)
        reason, detail = _find_image_defect(img_path)
        if reason is None:
            good_count += 1
            continue

        quarantine_file(img_path, filename, reason)
        if detail:
            # Surface the underlying error so the cause can be diagnosed.
            print(f"   Details: {detail}")
        bad_count += 1

    print("-" * 50)
    print("INSPECTION COMPLETE")
    print(f"✅ Valid Images: {good_count}")
    print(f"❌ Corrupted/Missing: {bad_count}")
    print(f"📁 Check the folder '{QUARANTINE_DIR}' to see the rejected files.")

# Run the inspection when this cell executes: progress and the final summary are
# printed below, and rejected files are copied under QUARANTINE_DIR.
inspect_dataset()

Reading CSV from: C:/Users/FA004/Desktop/satimg2/data.csv
Inspecting 10842 entries...
--------------------------------------------------
❌ REJECTED: Argentina_2025_01.tif | Reason: File_Not_Found
   File path does not exist: C:/Users/FA004/Desktop/satimg2/images\Argentina\Argentina_2025_01.tif
❌ REJECTED: Australia_2010_01.tif | Reason: File_Not_Found
   File path does not exist: C:/Users/FA004/Desktop/satimg2/images\Australia\Australia_2010_01.tif
❌ REJECTED: Australia_2010_02.tif | Reason: File_Not_Found
   File path does not exist: C:/Users/FA004/Desktop/satimg2/images\Australia\Australia_2010_02.tif
❌ REJECTED: Australia_2010_03.tif | Reason: File_Not_Found
   File path does not exist: C:/Users/FA004/Desktop/satimg2/images\Australia\Australia_2010_03.tif
❌ REJECTED: Australia_2010_04.tif | Reason: File_Not_Found
   File path does not exist: C:/Users/FA004/Desktop/satimg2/images\Australia\Australia_2010_04.tif
❌ REJECTED: Australia_2010_05.tif | Reason: File_Not_Found
   File path d

In [2]:
import os
import rasterio
import cv2
import numpy as np

# ==========================================
# 1. SETUP PATHS
# ==========================================
# Absolute, machine-specific locations -- edit before running elsewhere.
# DEST_PNG_DIR is a sibling of SOURCE_TIF_DIR rather than a subfolder of it, so
# the directory walk over SOURCE_TIF_DIR never visits the generated PNGs.
SOURCE_TIF_DIR = 'C:/Users/FA004/Desktop/satimg2/images'  # Your original folder
DEST_PNG_DIR = 'C:/Users/FA004/Desktop/satimg2/images_png_view'  # New folder for PNGs

def convert_tifs_to_png():
    """Export the first band of every TIF under SOURCE_TIF_DIR as an 8-bit PNG.

    The folder layout below SOURCE_TIF_DIR is mirrored under DEST_PNG_DIR.
    Each band is min-max stretched to 0-255 purely for viewing; NaN and
    infinite values are replaced by 0 before the stretch. Files that cannot
    be read or written are reported and skipped.

    Returns:
        None. Progress and totals are reported on stdout.
    """
    # Create the root destination folder if it doesn't exist
    if not os.path.isdir(DEST_PNG_DIR):
        os.makedirs(DEST_PNG_DIR, exist_ok=True)
        print(f"Created destination directory: {DEST_PNG_DIR}")

    print("Starting conversion... this may take a few minutes.")

    count = 0
    failed = 0
    # Walk through the entire TIF directory structure
    for root, _dirs, files in os.walk(SOURCE_TIF_DIR):
        tif_files = [f for f in files if f.lower().endswith(('.tif', '.tiff'))]
        if not tif_files:
            # Do not mirror folders that contain no TIFs.
            continue

        # Determine relative path to maintain folder structure
        relative_path = os.path.relpath(root, SOURCE_TIF_DIR)
        target_folder = os.path.join(DEST_PNG_DIR, relative_path)
        os.makedirs(target_folder, exist_ok=True)

        for file in tif_files:
            tif_path = os.path.join(root, file)
            png_filename = os.path.splitext(file)[0] + ".png"
            png_path = os.path.join(target_folder, png_filename)

            try:
                # Read the first band of the TIF file
                with rasterio.open(tif_path) as src:
                    img = src.read(1)

                # Work in float64: doing the arithmetic in the raster's native
                # dtype wraps around for signed integers (e.g. int16 data whose
                # min is -32768), producing a garbage stretch.
                img = np.nan_to_num(
                    img.astype(np.float64), nan=0.0, posinf=0.0, neginf=0.0
                )

                # Normalize to 0-255 for visibility
                lo, hi = img.min(), img.max()
                if hi > lo:
                    img_normalized = (img - lo) / (hi - lo) * 255.0
                else:
                    # Flat/empty image: nothing to stretch, emit all black.
                    img_normalized = np.zeros_like(img)

                img_8bit = img_normalized.astype(np.uint8)

                # cv2.imwrite signals failure by returning False rather than
                # raising, so the result must be checked explicitly.
                if not cv2.imwrite(png_path, img_8bit):
                    raise IOError(f"cv2.imwrite could not write {png_path}")
                count += 1

                if count % 500 == 0:
                    print(f"Converted {count} images...")

            except Exception as e:
                failed += 1
                print(f"Error converting {tif_path}: {e}")

    print("-" * 30)
    print(f"SUCCESS: {count} images converted to PNG.")
    if failed:
        print(f"WARNING: {failed} images could not be converted (see errors above).")
    print(f"You can find them here: {DEST_PNG_DIR}")

# Run the conversion when this cell executes; PNGs are written under DEST_PNG_DIR
# and progress is printed every 500 converted images.
convert_tifs_to_png()

Created destination directory: C:/Users/FA004/Desktop/satimg2/images_png_view
Starting conversion... this may take a few minutes.
Converted 500 images...
Converted 1000 images...
Converted 1500 images...
Converted 2000 images...
Converted 2500 images...
Converted 3000 images...
Converted 3500 images...
Converted 4000 images...
Converted 4500 images...
Converted 5000 images...
Converted 5500 images...
Converted 6000 images...
Converted 6500 images...
Converted 7000 images...
Converted 7500 images...
Converted 8000 images...
Converted 8500 images...
Converted 9000 images...
Converted 9500 images...
Converted 10000 images...
Converted 10500 images...
Converted 11000 images...
Converted 11500 images...
Converted 12000 images...
Converted 12500 images...
Converted 13000 images...
------------------------------
SUCCESS: 13416 images converted to PNG.
You can find them here: C:/Users/FA004/Desktop/satimg2/images_png_view
