In [1]:
import pandas as pd


In [None]:
# OGD to Dublin Core metadata mapping object.
#
# Keys are OGD field names; values are the Dublin Core term each field maps to.
# The sentinel value "Unmappable" marks OGD fields with no Dublin Core equivalent.
# Several OGD fields may share one Dublin Core term (e.g. "Subject", "Relation").
#
# NOTE: a Python dict keeps only the LAST value for a repeated key, so every OGD
# field name must appear exactly once. "cdos_state_ministry" used to be listed
# twice (Creator, then Contributor), which silently discarded the Creator mapping.
ogd_to_dublin_core = {
    # Descriptive metadata
    "title": "Title",
    "short_description": "Alternative Title/Abstract",
    "cdos_state_ministry": "Creator",
    "group_sector_sector_resource_keyword": "Subject",
    "long_description": "Description",
    "note": "Note",
    "ministry_department_state_department_cdos_state_ministry": "Publisher",
    # Dates
    "created": "Date Created",
    "published": "Date Issued",
    "changed": "Date Modified",
    "external_contributor": "Contributor",
    # Type, format and identification
    "resource_category": "Type",
    "file_format": "Format",
    "domain_node_alias": "Identifier",
    "field_reference_url": "Source",
    "language": "Language",
    # Coverage
    "domain": "Spatial Coverage",
    "duration_of_date": "Temporal Coverage",
    # Rights and access
    "rights": "Rights",
    "govt_type": "Jurisdiction",
    "released_under": "Rights Statement",
    "license": "License",
    "access_type": "Access Rights",
    "frequency": "Accrual Periodicity",
    "granularity": "Coverage",
    "datafile_url": "Relation",
    # Portal-specific fields
    "is_visualized": "Unmappable",
    "ogdp_view_count": "Unmappable",  # was misspelled "odgp_view_count"
    "ogdp_download_count": "Unmappable",
    "api_request_count": "Unmappable",
    "field_show_export": "Unmappable",
    "is_rated": "Unmappable",
    "external_api_reference": "Unmappable",
    "field_from_api": "Relation",
    "catalog_title": "Unmappable",
    "field_resource_type": "Unmappable",
    "sector_resource": "Subject",
    "field_high_value_dataset": "High Value Dataset Category",
    "is_api_available": "Unmappable",
    "image_url": "Depiction",
    # Relations between resources
    "has_part": "Has Part",
    "is_part_of": "Is Part Of",
    "is_referenced_by": "Is Referenced By",
    "is_replaced_by": "Is Replaced By",
    "is_version_of": "Is Version Of",
}

SyntaxError: expression expected after dictionary key and ':' (3751421064.py, line 21)

In [3]:
# Mapping from the columns actually present in the OGD CSV export to Dublin Core
# terms. Columns with no Dublin Core equivalent carry the sentinel "Unmappable".
ogd_to_dublin_core_updated = {
    # Titles and responsible parties
    "title": "Title",
    "catalog_title": "Alternative Title",
    "cdos_state_ministry": "Creator",
    "ministry_department": "Contributor",
    "state_department": "Contributor",
    # Subject and description
    "sector": "Subject",
    "sector_resource": "Subject",
    "note": "Description",
    # Dates
    "published_date": "Date Issued",
    "changed": "Date Modified",
    "created": "Date Created",
    # Type, format, identification and provenance
    "resource_category": "Type",
    "file_format": "Format",
    "node_alias": "Identifier",
    "datafile_url": "Source",
    # Coverage and update cadence
    "domain": "Spatial Coverage",
    "frequency": "Accrual Periodicity",
    "granularity": "Coverage",
    # Related resources and remaining mapped fields
    "datafile": "Relation",
    "field_from_api": "Relation",
    "govt_type": "Jurisdiction",
    "field_high_value_dataset": "High Value Dataset Category",
    "file_size": "Extent",
    # Portal-specific columns without a Dublin Core counterpart
    **dict.fromkeys(
        (
            "is_visualized",
            "ogdp_view_count",
            "ogdp_download_count",
            "api_request_count",
            "field_show_export",
            "is_rated",
            "external_api_reference",
            "field_resource_type",
            "is_api_available",
        ),
        "Unmappable",
    ),
}

In [None]:
def _join_nonempty(existing, new):
    """Join two string Series element-wise, inserting ';' only where both sides are non-empty."""
    both_present = existing.ne('') & new.ne('')
    # Where at most one side is non-empty, plain concatenation yields that side (or '').
    return (existing + ';' + new).where(both_present, existing + new)


def convert_ogd_to_dublin_core(path, mapping_object, output_path='dublin_core_mapped.csv'):
    """Convert an OGD CSV into a Dublin Core mapped CSV.

    Each input column is looked up in ``mapping_object``:

    * a column mapped to a Dublin Core field is written under that field name,
      with missing values replaced by ``''``; when several input columns map to
      the same field, their values are stringified and joined per row with
      ``';'`` (empty values are skipped);
    * a column mapped to ``"Unmappable"``, or absent from the mapping, is kept
      unchanged under the name ``Unmapped_<column>``.

    Mapped fields come first in the output (in order of first appearance in the
    input), followed by the ``Unmapped_`` columns.

    Parameters
    ----------
    path : str, path object or file-like object
        Source OGD CSV, passed straight to ``pandas.read_csv``.
    mapping_object : dict
        Maps OGD column names to Dublin Core field names or ``"Unmappable"``.
    output_path : str, path object, file-like object or None
        Destination handed to ``DataFrame.to_csv``. Defaults to
        ``'dublin_core_mapped.csv'`` in the current working directory.

    Returns
    -------
    None or str
        Whatever ``DataFrame.to_csv`` returns: ``None`` when a destination is
        given, or the CSV text when ``output_path`` is ``None``.
    """
    # Read the OGD CSV (pandas is imported once at the top of the notebook)
    ogd_df = pd.read_csv(path)

    mapped_columns = {}    # Dublin Core field name -> Series of values
    unmapped_columns = {}  # "Unmapped_<column>"    -> original Series

    for ogd_column in ogd_df.columns:
        if ogd_column in mapping_object:
            dublin_core_field = mapping_object[ogd_column]
        else:
            # Column not in mapping - treat as unmapped
            dublin_core_field = "Unmappable"

        if dublin_core_field == "Unmappable":
            unmapped_columns[f"Unmapped_{ogd_column}"] = ogd_df[ogd_column]
            continue

        values = ogd_df[ogd_column].fillna('')
        if dublin_core_field in mapped_columns:
            # Stringify BOTH sides before testing for emptiness: checking the
            # truthiness of raw values would discard legitimate 0 / False entries.
            values = _join_nonempty(
                mapped_columns[dublin_core_field].astype(str),
                values.astype(str),
            )
        mapped_columns[dublin_core_field] = values

    # Build the frame in one go; mapped fields override a same-named unmapped column.
    dublin_core_df = pd.DataFrame({**unmapped_columns, **mapped_columns})

    # Reorder columns to put standard Dublin Core fields first
    standard_dc_fields = [col for col in dublin_core_df.columns if not col.startswith('Unmapped_')]
    unmapped_fields = [col for col in dublin_core_df.columns if col.startswith('Unmapped_')]
    dublin_core_df = dublin_core_df[standard_dc_fields + unmapped_fields]

    return dublin_core_df.to_csv(output_path)


    