## Standardizing Dataset

This notebook standardizes the datasets available in our database. We will do the following:

**i-** We will create two versions of each GeoJSON file in each folder:

        - One with only the name and location information
        - One with all the information (location + emissions)

**ii-** We will first combine the location data for all the available assets into a single standardized file.

**iii-** We will then combine the files with emission information into a single file.

#### The primary objective of this exercise is to align the nomenclature of the available datasets and make them interoperable with other machines and programs.

In [1]:
import os
import json
import glob
from collections import defaultdict
import shutil


In [2]:

def list_geojson_files(folder):
    """Show the GeoJSON files in ``folder`` and return the one the user picks.

    The files are listed in sorted order so the numbering is stable between
    runs (``os.listdir`` returns entries in arbitrary order). The prompt is
    repeated until a valid number is entered.

    Args:
        folder: Directory searched (non-recursively) for ``*.geojson`` files.

    Returns:
        The path of the chosen file (``folder`` joined with the file name),
        or ``None`` if the folder contains no GeoJSON files.
    """
    files = sorted(f for f in os.listdir(folder) if f.endswith('.geojson'))

    if not files:
        print("❌ No GeoJSON files found in the directory!")
        return None

    print("\nAvailable GeoJSON files:")
    for idx, file in enumerate(files, 1):
        print(f"{idx}. {file}")

    while True:
        raw_choice = input("\nEnter the number of the file you want to process: ")
        try:
            choice = int(raw_choice)
        except ValueError:
            print(f"❌ '{raw_choice.strip()}' is not a number. Please try again.")
            continue
        # Reject 0 and negative numbers explicitly: after the 1-based -> 0-based
        # shift they would silently wrap around to files at the end of the list.
        if 1 <= choice <= len(files):
            return os.path.join(folder, files[choice - 1])
        print(f"❌ Please enter a number between 1 and {len(files)}.")

def load_geojson(file_path):
    """Read the UTF-8 encoded GeoJSON file at ``file_path`` and return the parsed JSON."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def save_geojson(data, output_file):
    """Write ``data`` to ``output_file`` as UTF-8 JSON indented by four spaces."""
    encoder = json.JSONEncoder(indent=4)
    with open(output_file, mode='w', encoding='utf-8') as handle:
        # Stream the document piece by piece, exactly as json.dump does.
        for chunk in encoder.iterencode(data):
            handle.write(chunk)

def select_columns_to_drop(properties_list):
    """Ask the user which columns to drop and return their names.

    The columns are printed with 1-based numbers and the user answers with a
    comma-separated list of those numbers. Blank entries (e.g. a trailing
    comma), non-numeric entries and out-of-range numbers are ignored with a
    warning instead of aborting the whole selection, and a column chosen more
    than once is returned only once.

    Args:
        properties_list: Ordered list of column names to choose from.

    Returns:
        The selected column names in the order they were entered; an empty
        list if the user just presses Enter.
    """
    print("\nAvailable columns in the dataset:")
    for idx, col in enumerate(properties_list, 1):
        print(f"{idx}. {col}")

    user_input = input("\nEnter the numbers of columns to drop (comma-separated, or press Enter to keep all): ")

    selected = []
    for token in user_input.split(','):
        token = token.strip()
        if not token:
            continue  # Empty answer, trailing comma or ",," - nothing to parse.
        try:
            position = int(token)
        except ValueError:
            print(f"⚠️ Ignoring invalid entry: '{token}'")
            continue
        if not 1 <= position <= len(properties_list):
            print(f"⚠️ Ignoring out-of-range column number: {position}")
            continue
        column = properties_list[position - 1]
        if column not in selected:
            selected.append(column)
    return selected

def drop_columns_from_geojson(data, columns_to_drop):
    """Remove the given property keys from every feature of a GeoJSON dict.

    ``data`` is modified in place and also returned for convenience.
    Features whose ``properties`` member is missing or ``null`` (both are
    allowed by the GeoJSON specification) are left untouched.

    Args:
        data: Parsed GeoJSON dictionary with an optional ``"features"`` list.
        columns_to_drop: Iterable of property names to remove; names that a
            feature does not have are ignored.

    Returns:
        The same ``data`` object that was passed in.
    """
    for feature in data.get("features") or []:
        properties = feature.get("properties")
        if not isinstance(properties, dict):
            continue  # null / missing properties: nothing to drop
        for col in columns_to_drop:
            properties.pop(col, None)
    return data

def process_geojson(input_folder):
    """Interactively clean one GeoJSON file from ``input_folder``.

    Steps:
      1. Let the user pick a ``*.geojson`` file in ``input_folder``.
      2. Copy it to ``<name>_backup.geojson`` in the same folder.
      3. Show every property key found in the file and ask which to drop.
      4. Write the result to ``<name>_cleaned.geojson`` in the same folder.

    The selected file itself is never modified.

    Args:
        input_folder: Directory containing the GeoJSON files to choose from.

    Returns:
        None. Returns early, doing nothing, if no file could be selected.
    """
    file_path = list_geojson_files(input_folder)

    if not file_path:
        return

    # Load original GeoJSON
    geojson_data = load_geojson(file_path)

    # Build the derived file names from the extension only. A plain
    # str.replace(".geojson", ...) would also rewrite any other occurrence of
    # ".geojson" in the path (for example in a directory name).
    stem, extension = os.path.splitext(file_path)

    # Backup original file
    backup_file = f"{stem}_backup{extension}"
    shutil.copy(file_path, backup_file)
    print(f"\n✅ Backup created: {backup_file}")

    # Get unique columns in the dataset. "properties" may legally be null in
    # GeoJSON, so treat a null/missing value as "no columns".
    all_columns = set()
    for feature in geojson_data.get("features") or []:
        all_columns.update((feature.get("properties") or {}).keys())

    all_columns = sorted(all_columns)  # Sort for better user experience

    # Ask user which columns to drop
    columns_to_drop = select_columns_to_drop(all_columns)

    if columns_to_drop:
        print(f"\n📌 Dropping columns: {columns_to_drop}")
        geojson_data = drop_columns_from_geojson(geojson_data, columns_to_drop)
    else:
        print("\n✅ No columns were dropped.")

    # Save the new cleaned GeoJSON
    output_file = f"{stem}_cleaned{extension}"
    save_geojson(geojson_data, output_file)
    print(f"\n✅ Cleaned GeoJSON saved as: {output_file}")


In [9]:
# Folder that is scanned (non-recursively) for *.geojson files.
input_folder = "Asset_Data_IGP/Waste"  # Change this to the folder containing your GeoJSON files
# Interactive step: prompts for the file to process and the columns to drop,
# then writes "<name>_backup.geojson" and "<name>_cleaned.geojson" next to it.
# NOTE: files written by earlier runs also end in ".geojson", so they are
# offered again in the file list on the next run.
process_geojson(input_folder)



Available GeoJSON files:
1. plastic_waste_IGP.geojson
2. plastic_waste_IGP_backup.geojson
3. plastic_waste_IGP_cleaned.geojson
4. solid_waste_disposal_IGP.geojson

✅ Backup created: Asset_Data_IGP/Waste\plastic_waste_IGP_backup_backup.geojson

Available columns in the dataset:
1. Latitude
2. Longitude
3. area
4. country
5. id
6. name

📌 Dropping columns: ['area']

✅ Cleaned GeoJSON saved as: Asset_Data_IGP/Waste\plastic_waste_IGP_backup_cleaned.geojson


In [None]:
# Input directory and output file for the merge step.
# NOTE: this rebinds `input_folder`, replacing the value set for the cleaning
# step above. Files are looked up exactly one level below it
# (<input_folder>/<asset type folder>/*.geojson).
input_folder = "IGP/"  # Change this to your main folder
output_file = "merged_output.geojson"

# Prefix used to build each feature's secondary ID ("AssetType_ID").
# Keys are compared with the lower-cased name of the sub-folder a file lives
# in; folders without an entry here fall back to the prefix "gen".
# NOTE(review): the prefix is chosen by folder name only - the country is not
# part of the lookup. Confirm the codes below are the intended ones.
id_prefixes = {
    "coal": "cpi",
    "steel": "cpp",
    "cement": "cpc",
    "brick kilns": "bkp"
}

def load_geojson(file_path):
    """Parse the UTF-8 encoded GeoJSON file at ``file_path`` and return its content.

    NOTE(review): a function with this name and the same behaviour is already
    defined in an earlier cell of this notebook; this definition shadows it.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        document = json.loads(source.read())
    return document

def get_all_columns(geojson_files):
    """Collect every property key used by any feature in the given files.

    Args:
        geojson_files: Iterable of paths to GeoJSON files.

    Returns:
        A sorted list of the unique property names. Sorting keeps the column
        order identical from run to run; converting the set straight to a
        list would give an order that can change between interpreter
        sessions.
    """
    all_columns = set()
    for file in geojson_files:
        data = load_geojson(file)
        # "features" / "properties" may be null or absent; treat both as empty.
        for feature in data.get('features') or []:
            all_columns.update((feature.get('properties') or {}).keys())
    return sorted(all_columns)

def merge_geojson_files(input_folder, output_path=None, prefixes=None):
    """Combine the GeoJSON files below ``input_folder`` into one standardized file.

    Every ``*.geojson`` file located exactly one directory level below
    ``input_folder`` is read; the name of that sub-directory is used as the
    asset type. Each feature is copied into the output with:

    * every property key found in any input file (missing values become 0),
    * ``Unique_ID``: ``"IGP-<n>"``, numbered across all features from 1,
    * ``AssetType_ID``: ``"<prefix><n>"``, where the prefix comes from
      ``prefixes`` (looked up by lower-cased folder name, default ``"gen"``)
      and ``n`` counts features per (folder, country) pair.

    Files and columns are processed in sorted order so that the generated IDs
    and the property order are reproducible between runs.

    Args:
        input_folder: Root directory whose sub-directories hold the files.
        output_path: Where to write the merged file. Defaults to the
            notebook-level ``output_file``.
        prefixes: Mapping of lower-cased folder name to ID prefix. Defaults
            to the notebook-level ``id_prefixes``.

    Returns:
        None. The merged FeatureCollection is written to ``output_path``.
    """
    # Fall back to the notebook-level settings when not passed explicitly.
    if output_path is None:
        output_path = output_file
    if prefixes is None:
        prefixes = id_prefixes

    merged_features = []
    unique_id_counter = 1  # Unique ID counter
    assettype_counters = defaultdict(lambda: defaultdict(int))  # {assettype: {country: count}}

    # glob returns matches in arbitrary order; sort so ID assignment is stable.
    all_geojson_files = sorted(glob.glob(os.path.join(input_folder, "*", "*.geojson")))
    all_columns = sorted(get_all_columns(all_geojson_files))

    for file in all_geojson_files:
        folder_name = os.path.basename(os.path.dirname(file))  # Get asset type from folder name
        data = load_geojson(file)

        for feature in data.get('features') or []:
            # GeoJSON allows "properties": null - treat it as no properties.
            properties = feature.get('properties') or {}
            # A feature without geometry must carry null, not an empty object.
            geometry = feature.get('geometry')

            # Fill missing columns with 0
            standardized_properties = {col: properties.get(col, 0) for col in all_columns}

            # Extract country code from the raw properties: the standardized
            # dict has already replaced a missing country with 0, which made
            # the "UNK" fallback unreachable.
            country = properties.get("country", "UNK")

            # Assign unique ID
            standardized_properties["Unique_ID"] = f"IGP-{unique_id_counter}"

            # Assign AssetType_ID based on asset type and country.
            # NOTE(review): numbering restarts for every (folder, country)
            # pair but the country is not part of the ID text, so the same
            # AssetType_ID can occur once per country - confirm this is intended.
            asset_prefix = prefixes.get(folder_name.lower(), "gen")  # Default to "gen" if not in mapping
            assettype_counters[folder_name][country] += 1
            standardized_properties["AssetType_ID"] = f"{asset_prefix}{assettype_counters[folder_name][country]}"

            # Create standardized feature
            merged_features.append({
                "type": "Feature",
                "geometry": geometry,
                "properties": standardized_properties
            })

            unique_id_counter += 1

    # Create final merged GeoJSON
    merged_geojson = {
        "type": "FeatureCollection",
        "features": merged_features
    }

    # Save to file
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(merged_geojson, f, indent=4)

    print(f"✅ Merged GeoJSON saved as {output_path}")

# Merge every GeoJSON file found one folder level below `input_folder` and
# write the result to `output_file`; both names must already be defined.
merge_geojson_files(input_folder)
