In [34]:
import pandas as pd

def _as_text(series):
    """Return the values of ``series`` as a list of strings.

    Missing values (NaN/None/NA) and the literal string "nan" become "" so
    that no "nan" placeholder leaks into the output.
    """
    return [
        "" if missing or text == "nan" else text
        for text, missing in zip(map(str, series), series.isna())
    ]


def _join_non_empty(text_columns, separator, n_rows):
    """Join parallel lists of strings row by row, skipping empty parts."""
    if not text_columns:
        # None of the source columns exist in the input: target stays empty.
        return [""] * n_rows
    return [
        separator.join(part for part in row if part)
        for row in zip(*text_columns)
    ]


def transform_ogd_to_dublin_core(original_df):
    """Re-shape an OGD metadata frame into Dublin Core style columns.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Input frame. Only the columns named in the mapping tables below are
        read; any of them may be absent, in which case the corresponding
        output column is left empty. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``original_df`` and alphabetically
        sorted columns. Every value is a string:

        * one column per Dublin Core target; when several source columns feed
          the same target (e.g. "Contributor", "Relation") their non-empty
          values are joined with " | " instead of overwriting each other;
        * "Subject", "Publisher", "Identifier" and "Creator" are built from
          the source columns listed in ``combined_fields``;
        * every source column marked "Unmappable" is carried over as
          ``unmapped_<source column>``.

        Missing input values are written as "" (never the text "nan"). The
        set of output columns does not depend on the input, so an empty frame
        yields the full, empty schema.
    """
    # OGD -> Dublin Core mapping. Each key may appear only once in a dict
    # literal (a repeated key silently keeps the last value), so the extra
    # roles of cdos_state_ministry (Creator, Publisher) live in
    # combined_fields below.
    ogd_to_dublin_core = {
        "title": "Title",
        "short_description": "Alternative Title/Abstract",
        "cdos_state_ministry": "Contributor",
        "group_sector_sector_resource_keyword": "Subject",
        "long_description": "Description",
        "note": "Note",
        "ministry_department_state_department_cdos_state_ministry": "Publisher",
        "created": "Date Created",
        "published_date": "Date Issued",
        "changed": "Date Modified",
        "external_contributor": "Contributor",
        "resource_category": "Type",
        "file_format": "Format",
        "field_reference_url": "Source",
        "language": "Language",
        "domain": "Spatial Coverage",
        "duration_of_date": "Temporal Coverage",
        "rights": "Rights",
        "govt_type": "Jurisdiction",
        "released_under": "Rights Statement",
        "license": "License",
        "access_type": "Access Rights",
        "frequency": "Accrual Periodicity",
        "granularity": "Coverage",
        "datafile_url": "Relation",
        "is_visualized": "Unmappable",
        "odgp_view_count": "Unmappable",
        "ogdp_download_count": "Unmappable",
        "api_request_count": "Unmappable",
        "field_show_export": "Unmappable",
        "is_rated": "Unmappable",
        "external_api_reference": "Unmappable",
        "field_from_api": "Relation",
        "catalog_title": "Unmappable",
        "field_resource_type": "Unmappable",
        "sector_resource": "Subject",
        "field_high_value_dataset": "High Value Dataset Category",
        "is_api_available": "Unmappable",
        "image_url": "Depiction",
        "has_part": "Has Part",
        "is_part_of": "Is Part Of",
        "is_referenced_by": "Is Referenced By",
        "is_replaced_by": "Is Replaced By",
        "is_version_of": "Is Version Of",
    }

    # Targets assembled from an explicit list of source columns:
    # target -> (source columns in join order, separator).
    # These take precedence over any entry for the same target in
    # ogd_to_dublin_core.
    combined_fields = {
        "Subject": (("sector", "sector_resource"), " | "),
        # Identifier is a plain concatenation (no separator).
        "Identifier": (("domain", "node_alias"), ""),
        "Publisher": (
            ("ministry_department", "state_department", "cdos_state_ministry"),
            " | ",
        ),
        "Creator": (("cdos_state_ministry",), " | "),
    }

    n_rows = len(original_df.index)
    available = set(original_df.columns)
    text_cache = {}

    def text_columns(names):
        """Stringified values of the requested source columns that exist."""
        found = []
        for name in names:
            if name in available:
                if name not in text_cache:
                    text_cache[name] = _as_text(original_df[name])
                found.append(text_cache[name])
        return found

    output = {}

    # Group the directly mapped sources by target so that targets with more
    # than one source are merged rather than overwritten.
    sources_by_target = {}
    for ogd_col, dublin_col in ogd_to_dublin_core.items():
        if dublin_col == "Unmappable":
            output[f"unmapped_{ogd_col}"] = _join_non_empty(
                text_columns([ogd_col]), "", n_rows
            )
        elif dublin_col not in combined_fields:
            sources_by_target.setdefault(dublin_col, []).append(ogd_col)

    for dublin_col, sources in sources_by_target.items():
        output[dublin_col] = _join_non_empty(text_columns(sources), " | ", n_rows)

    for dublin_col, (sources, separator) in combined_fields.items():
        output[dublin_col] = _join_non_empty(
            text_columns(sources), separator, n_rows
        )

    # Plain lists are used as column data so no index alignment takes place;
    # the original index is attached as-is and columns are sorted by name.
    return pd.DataFrame(output, index=original_df.index, columns=sorted(output))

# Example usage:
# transformed_df = transform_ogd_to_dublin_core(original_dataframe)

In [35]:
# Read the sample dataset CSV into ogd_df.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
ogd_df = pd.read_csv(
    filepath_or_buffer="/home/prajna/civicdatalab/nic-metadata/nic-metadata-cleaning/sample_datasets_metadata/sample_dataset_nic.csv"
)

In [36]:
# Convert the loaded frame to the Dublin Core column layout.
transformed_df = transform_ogd_to_dublin_core(original_df=ogd_df)

In [37]:
# Write the transformed frame to a CSV file, path relative to the current
# working directory; the index is written too (pandas default).
transformed_df.to_csv(path_or_buf="sample_dataset_nic_dublin_core.csv")