# **Metadata**

In [3]:
def print_file_variables(self):
    """Print the keys of ``self.variables`` as a single list."""
    print([name for name in self.variables.keys()])

In [None]:
def get_file_variables(self):
    """Return the keys of ``self.variables`` as a list."""
    return [name for name in self.variables.keys()]

In [None]:
import pandas as pd #noqa
import polars as pl #noqa
import xarray as xr #noqa
def check_dataframe_type(self):
    """
    Print which kind of container ``self`` is.

    Recognised types, checked in this order: pandas DataFrame, Polars
    DataFrame, xarray Dataset. Anything else is reported as unknown.
    """
    # The checks stay in an if/elif chain so that each library is only
    # consulted when the previous checks did not match.
    if isinstance(self, pd.DataFrame):
        message = "This is a Pandas DataFrame."
    elif isinstance(self, pl.DataFrame):
        message = "This is a Polars DataFrame."
    elif isinstance(self, xr.Dataset):
        message = "This is a xarray Dataset"
    else:
        message = "Unknown DataFrame/Dataset type."
    print(message)

## netCDF

### Get

### Print


Variable

In [45]:
def print_netCDF_metadata(self, variables=None, attributes=None):
    """
    Print the values and attributes of coordinate variables.

    Every requested name is looked up in ``self.coords``. When a lookup or
    attribute access fails (KeyError / AttributeError) the error is printed
    and processing continues with the next name.

    Args:
        variables (list): Names to look up in ``self.coords``. If None, every key of ``self.coords`` is used.
        attributes (list): Attribute names to print. If None, all attributes are printed.

    Returns:
        None
    """
    names = self.coords.keys() if variables is None else variables
    for var_name in names:
        try:
            var = self.coords[var_name]
            print(f"Variable: {var_name}")
            if var.attrs:
                print(f" Values: {var.values}")
                print(" Attributes:")
                for key, value in var.attrs.items():
                    # Skip attributes the caller did not ask for.
                    if attributes is not None and key not in attributes:
                        continue
                    print(f"   {key}: {value}")
            else:
                if var.values is None:
                    print("No values were found")
                else:
                    print(f"   Values: {var.values}")
                print("   No attributes were found for this variable.")
        except (KeyError, AttributeError) as err:
            print(f"Error occurred while retrieving metadata for variable {var_name}: {str(err)}")

Global

In [6]:
def get_attrs(self):
    """Return the ``attrs`` mapping of ``self``."""
    return self.attrs

def print_global_metadata(self, attributes=None):
    """
    Print the global attributes of the dataset, one ``name : value`` per line.

    Args:
        attributes (list): Attribute names to print. If None, all attributes will be printed.
    """
    global_attrs = get_attrs(self)
    if not global_attrs:
        print("No Global Attributes were found.")
        return
    for attr_name, attr_value in global_attrs.items():
        # With no filter everything is shown; otherwise only requested names.
        if attributes is None or attr_name in attributes:
            print(attr_name, ":", attr_value)


### Write

Input

In [59]:
import os
import pathlib
import os.path
def insert_netCDF_metadata_input(self, variables=None, attributes=None, new_file=False, filename="new_file.nc"):
    """
    Interactively ask for attribute values and store them on the given variables.

    For every variable/attribute pair the user is prompted via ``input()``
    and the answer is stored in ``self[variable].attrs[attribute]``.

    Parameters:
    - variables (list): List of variable names. If None, every key of ``self.coords`` is used.
    - attributes (list): List of attribute names. If None, a built-in default list is used.
    - new_file (bool): If True, the dataset is written to ``filename`` afterwards.
    - filename (str): Target netCDF file; only consulted when ``new_file`` is True.

    Raises:
    - KeyError: If a variable was not found.
    - FileExistsError: If ``new_file`` is True and ``filename`` already exists.
    - ValueError: If ``new_file`` is True and ``filename`` does not end in ".nc".
    """
    if attributes is None:
        # Fallback attribute names used when the caller does not supply any.
        attributes = [
            "Units", "Long_Name", "Standard_Name/Short_Name",
            "Valid_Min", "Valid_Max", "Missing_Value",
            "Fill_Value", "Scale_Factor", "Add_Offset",
            "Coordinates", "Axis", "Description"
        ]
    if variables is None:
        variables = self.coords.keys()

    for coord_name in variables:
        for attribute in attributes:
            prompt = f"{coord_name}: {attribute} - Enter value: "
            try:
                self[coord_name].attrs[attribute] = input(prompt)
            except KeyError as err:
                raise KeyError(f"Variable {coord_name} not found.") from err

    if not new_file:
        return
    # The existence check comes before the extension check.
    if os.path.isfile(filename):
        raise FileExistsError(f"{filename} already exists. Please change it or delete it.")
    if pathlib.Path(filename).suffix != ".nc":
        raise ValueError("Invalid filename. Please provide a valid filename.")
    write_netcdf_file(self, filename)

def write_netcdf_file(self, filename):
    """
    Write the dataset to ``filename`` unless a file with that name already exists.

    An existing file is never overwritten: a message is printed instead and
    nothing is written.

    Parameters:
    - filename (str): Name of the netCDF file.
    """
    if os.path.isfile(filename):
        print(f"{filename} already exists. Please change it or delete it.")
        return
    self.to_netcdf(filename)

Dictionary

In [7]:
import os
import pathlib
def insert_netCDF_metadata_dict(self, dictionary, variables=None, new_file=False, filename="new_file.nc"):
    """
    Copy every key/value pair of a dictionary into the attributes of the given variables.

    Parameters:
    - self: The dataset object; indexed by variable name and expected to expose ``coords``.
    - dictionary: The attribute names and values to store on each variable.
    - variables: Names of the variables to update. If None, every key of ``self.coords`` is used.
    - new_file: If True, the dataset is written to ``filename`` afterwards.
    - filename: Target netCDF file; only consulted when ``new_file`` is True.

    Raises:
    - ValueError: If dictionary is None.
    - AttributeError: If dictionary is not a dictionary.
    - FileExistsError: If ``new_file`` is True and ``filename`` already exists.
    - ValueError: If ``new_file`` is True and ``filename`` does not end in ".nc".

    Returns:
    - None
    """
    if dictionary is None:
        raise ValueError("Please provide a dictionary.")
    if variables is None:
        variables = self.coords.keys()
    if not isinstance(dictionary, dict):
        raise AttributeError(f"{dictionary} is not a dictionary.")

    for name in variables:
        for attr_name, attr_value in dictionary.items():
            self[name].attrs[attr_name] = attr_value

    if not new_file:
        return
    # The existence check comes before the extension check.
    if os.path.isfile(filename):
        raise FileExistsError(f"{filename} already exists. Please change it or delete it.")
    if pathlib.Path(filename).suffix != ".nc":
        raise ValueError("Invalid filename. Please provide a valid filename.")
    write_netcdf_file(self, filename)

Json

In [40]:
import json
import jsonschema
from jsonschema.exceptions import ValidationError 

def insert_netCDF_metadata_json(self, json_file, new_file=False, filename="new_file.nc"):
    """
    Insert per-variable metadata read from a JSON file into the dataset.

    The JSON document must be an object that maps variable names to objects
    whose values are all strings, for example
    ``{"time": {"Units": "s", "Axis": "T"}}``.

    Args:
        self: The dataset object; indexed by variable name.
        json_file (str): The path to the JSON file containing the metadata.
        new_file (bool, optional): If True, the dataset is written to ``filename`` afterwards. Defaults to False.
        filename (str, optional): The name of the new netCDF file. Defaults to "new_file.nc".

    Raises:
        IOError: If the JSON file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        jsonschema.exceptions.ValidationError: If the JSON content does not match the expected layout.
        KeyError: If a variable named in the JSON file is not found.
        FileExistsError: If ``new_file`` is True and ``filename`` already exists.
        ValueError: If ``new_file`` is True and ``filename`` does not end in ".nc".

    Returns:
        None: The function modifies the attributes of the dataset directly.
    """
    # Outer level: variable name -> attribute object.
    # Inner level: attribute name -> string value.
    schema = {
        "type": "object",
        "patternProperties": {
            ".*": {
                "type": "object",
                "patternProperties": {
                    ".*": {"type": "string"},
                },
                "additionalProperties": False,
            },
        },
        "additionalProperties": False,
    }
    try:
        with open(json_file, 'r') as file:
            metadata = json.load(file)
    except IOError as e:
        # Keep the original OS error attached so the real reason stays visible.
        raise IOError("Error opening JSON file. Please check if the file exists or if there are any permission issues.") from e

    # Validate JSON against schema; a ValidationError propagates unchanged so
    # callers keep its structured details (path, failing validator, ...).
    jsonschema.validate(instance=metadata, schema=schema)

    for var, attributes in metadata.items():
        try:
            var_attrs = self[var].attrs
        except KeyError as e:
            raise KeyError(f"Variable {var} not found.") from e
        for attr, value in attributes.items():
            var_attrs[attr] = value

    if new_file:
        if os.path.isfile(filename):
            raise FileExistsError(f"{filename} already exists. Please change it or delete it.")
        if pathlib.Path(filename).suffix != ".nc":
            raise ValueError("Invalid filename. Please provide a valid filename.")
        write_netcdf_file(self, filename)

All

In [17]:
def insert_netCDF_metadata(self, via="input", **kwargs):
    """
    Insert metadata into the netCDF file using the chosen input method.

    Parameters:
        via (str, optional): The method of providing metadata. Can be "dict", "json", or "input"
            (case-insensitive). Defaults to "input".
        **kwargs: Additional keyword arguments forwarded to the specific method.

    Raises:
        ValueError: If `via` is not a valid metadata input, or if the selected
            method fails for any reason. In the latter case the original
            exception is attached as ``__cause__``.
    """
    via_lower = via.lower()
    try:
        if via_lower == "dict":
            insert_netCDF_metadata_dict(self, **kwargs)
        elif via_lower == "json":
            insert_netCDF_metadata_json(self, **kwargs)
        elif via_lower == "input":
            insert_netCDF_metadata_input(self, **kwargs)
        else:
            raise ValueError(f"{via} is not a valid metadata input.")
    except Exception as e:
        # Every failure is reported as ValueError; chaining keeps the real
        # exception type and traceback available for debugging.
        raise ValueError(f"Error inserting netCDF metadata: {str(e)}") from e

### Testing

### Global Metadata

In [None]:
def insert_netCDF_global_metadata_input(self, attributes=None):
    """
    Interactively ask for global attribute values and store them in ``self.attrs``.

    Invalid input (``attributes`` not a list, or a non-string entry) is not
    raised to the caller; the problem is printed instead. Entries processed
    before an invalid one are kept.

    Args:
        attributes (list): List of attributes to insert. If None, default attributes will be used.

    Returns:
        None
    """
    if attributes is None:
        # Fallback attribute names used when the caller does not supply any.
        attributes = [
            "Title", "Institution", "Source",
            "History", "References", "Conventions",
            "Creator_Author", "Project", "Description"
        ]
    try:
        if not isinstance(attributes, list):
            raise ValueError("attributes must be a list")
        for name in attributes:
            if isinstance(name, str):
                self.attrs[name] = input(f"{name} - Enter value: ")
            else:
                raise ValueError("attributes must contain only strings")
    except ValueError as err:
        print(f"An error occurred: {err}")

In [None]:
import os
import pathlib
def insert_netCDF_global_metadata_dict(self, dictionary, new_file=False, filename="new_file.nc"):
    """
    Copy every key/value pair of a dictionary into the dataset's global attributes.

    Args:
        self: The dataset object; must expose an ``attrs`` mapping.
        dictionary (dict): The global attributes to store.
        new_file (bool, optional): If True, the dataset is written to ``filename`` afterwards. Default is False.
        filename (str, optional): The name of the new netCDF file to be created. Default is "new_file.nc".

    Raises:
        TypeError: If the dictionary parameter is not a dictionary.
        FileExistsError: If ``new_file`` is True and ``filename`` already exists.
        ValueError: If ``new_file`` is True and ``filename`` does not end in ".nc".

    Returns:
        None. The function doesn't return any value.
    """
    if not isinstance(dictionary, dict):
        raise TypeError(f"{dictionary} is not a dictionary.")

    for attr_name in dictionary:
        self.attrs[attr_name] = dictionary[attr_name]

    if not new_file:
        return
    # The existence check comes before the extension check.
    if os.path.isfile(filename):
        raise FileExistsError(f"{filename} already exists. Please change it or delete it.")
    if pathlib.Path(filename).suffix != ".nc":
        raise ValueError("Invalid filename. Please provide a valid filename.")
    write_netcdf_file(self, filename)

In [None]:
import jsonschema
from jsonschema.exceptions import ValidationError
import os
import pathlib
import json 
def insert_netCDF_global_metadata_json(self, json_file, new_file=False, filename="new_file.nc"):
    """
    Insert global metadata read from a JSON file into the dataset's attributes.

    The JSON document must be a flat object whose values are all strings,
    for example ``{"Title": "My dataset", "Institution": "ACME"}``. Each
    pair is stored in ``self.attrs``.

    Args:
        self: The dataset object; must expose an ``attrs`` mapping.
        json_file (str): The path to the JSON file containing the metadata.
        new_file (bool, optional): If True, the dataset is written to ``filename`` afterwards. Default is False.
        filename (str, optional): Specifies the name of the new netCDF file. Default is "new_file.nc".

    Raises:
        FileNotFoundError: If the JSON file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
        jsonschema.exceptions.ValidationError: If the JSON content does not match the expected layout.
        ValueError: If ``new_file`` is True and ``filename`` does not end in ".nc".
        FileExistsError: If ``new_file`` is True and ``filename`` already exists.

    Returns:
        None
    """
    schema = {
        "type": "object",
        "patternProperties": {
            ".*": { "type": "string" }
        },
        "additionalProperties": False
    }

    try:
        with open(json_file, 'r') as file:
            metadata = json.load(file)
    except FileNotFoundError as e:
        raise FileNotFoundError("Error opening JSON file. Please check if the file exists or if there are any permission issues.") from e
    except json.JSONDecodeError as e:
        # JSONDecodeError needs the document and position as well as the
        # message; reuse them from the original error.
        raise json.JSONDecodeError(
            "Error decoding JSON file. Please check if the file contains valid JSON.",
            e.doc,
            e.pos,
        ) from e

    # Validate JSON against schema; a ValidationError propagates unchanged so
    # callers keep its structured details.
    jsonschema.validate(instance=metadata, schema=schema)

    # Store the validated metadata as global attributes.
    for key, value in metadata.items():
        self.attrs[key] = value

    if new_file:
        # Here the extension is checked before the existence of the file.
        if pathlib.Path(filename).suffix != ".nc":
            raise ValueError("Invalid filename. Please provide a valid filename.")
        if os.path.isfile(filename):
            raise FileExistsError(f"{filename} already exists. Please change it or delete it.")
        write_netcdf_file(self, filename)

## Parquet

In [15]:
import pyarrow.parquet as pq #noqa
import pandas as pd
# Load the Titanic dataset and collect its column names as plain strings.
df = pd.read_parquet("Titanic.parquet")
wdwd = list(map(str, df.columns.tolist()))
print(wdwd)

['Gender', 'Age', 'Siblings_on_Board', 'Parents_on_Board', 'Ticket_Price', 'Port_of_Embarkation', 'Class', 'Adult/Child', 'Alone', 'Survived']


In [18]:
import pyarrow.parquet as pq #noqa
import pandas as pd
# Load the Titanic dataset and collect the dtype of every column as a string.
df = pd.read_parquet("Titanic.parquet")
tipos_de_dados_legiveis = list(map(str, df.dtypes.to_list()))
print(tipos_de_dados_legiveis)

['category', 'float64', 'int8', 'int8', 'float64', 'category', 'category', 'category', 'bool', 'int64']


In [8]:
# Pair every column name with its dtype string as a mutable [name, dtype] list.
lista_de_listas = [list(pair) for pair in zip(wdwd, tipos_de_dados_legiveis)]
print(lista_de_listas)

[['Gender', 'category'], ['Age', 'float64'], ['Siblings_on_Board', 'int8'], ['Parents_on_Board', 'int8'], ['Ticket_Price', 'float64'], ['Port_of_Embarkation', 'category'], ['Class', 'category'], ['Adult/Child', 'category'], ['Alone', 'bool'], ['Survived', 'int64']]


In [9]:
# Replace the 'category' dtype label with 'string' in place; other labels are kept.
for pair in lista_de_listas:
    pair[1] = 'string' if pair[1] == 'category' else pair[1]

In [10]:
# Show the [name, dtype] pairs after the 'category' -> 'string' replacement.
print(lista_de_listas)

[['Gender', 'string'], ['Age', 'float64'], ['Siblings_on_Board', 'int8'], ['Parents_on_Board', 'int8'], ['Ticket_Price', 'float64'], ['Port_of_Embarkation', 'string'], ['Class', 'string'], ['Adult/Child', 'string'], ['Alone', 'bool'], ['Survived', 'int64']]


In [11]:
import pyarrow as pa
# Define the schema for the PyArrow Table.
# Per-column metadata, listed in the same order as the entries of
# lista_de_listas (entry i describes column i).
field_metadata = [
    {"Description": "The passenger's Gender"},
    {"Description": "The passenger's Age", "Calculation": "No"},
    {
        "Description": "Number of sibilings that the passenger had on board",
        "Calculation": "No",
    },
    {
        "Description": "Number of parents that the passenger had on board",
        "Calculation": "No",
    },
    {"Description": "Ticket's Price", "Calculation": "No"},
    {"Description": "The port were the passenger embarked"},
    {"Description": "The passenger's class on the ship"},
    {"Description": "If the passenger is child or not"},
    {"Description": "If the passenger is travelling alone or not"},
    {
        "Description": "If the passenger survived or not",
        "Calculation": "No",
    },
]

# Each field takes its name and type from lista_de_listas by position.
table_schema = pa.schema(
    [
        pa.field(lista_de_listas[idx][0], lista_de_listas[idx][1], metadata=meta)
        for idx, meta in enumerate(field_metadata)
    ]
)

In [32]:
# Build a PyArrow Table from the pandas DataFrame.
table = pa.Table.from_pandas(df)
# Cast it to the annotated schema so the fields carry the metadata.
table = table.cast(table_schema)
first_field = table.field(0)
print(str(first_field))
# Last expression of the cell: shows the type of the field's metadata object.
type(first_field.metadata)

pyarrow.Field<Gender: string>


dict

In [83]:
# The keys and values of the field metadata are decoded from UTF-8 bytes
# into regular strings.
converted_metadata = {}
for raw_key, raw_value in table.field(0).metadata.items():
    converted_metadata[raw_key.decode('utf-8')] = raw_value.decode('utf-8')
print(converted_metadata)
for word, value in converted_metadata.items():
    print(f'{word} : {value}')

{'Description': "The passenger's Gender"}
Description : The passenger's Gender


In [85]:
import re
# Read the column name straight from the field instead of splitting its
# string representation, which breaks for names containing "<" or ":".
teste = table.field(0).name
print(teste)


Gender


In [90]:
import re
# Read the column name straight from the field instead of splitting its
# string representation, which breaks for names containing "<" or ":".
teste = table.field(0).name
print(teste)
# The metadata keys and values are decoded from UTF-8 bytes into regular strings.
converted_metadata = {key.decode('utf-8'): value.decode('utf-8') for key, value in table.field(0).metadata.items()}
for word, value in converted_metadata.items():
    print(f'    {word} : {value}')

Gender
    Description : The passenger's Gender


In [100]:
# Print every column name followed by its decoded metadata entries.
for i in range(table.num_columns):
    field = table.field(i)
    # A field without metadata reports None; treat that as "no entries".
    raw_metadata = field.metadata or {}
    metadata = {key.decode('utf-8'): value.decode('utf-8') for key, value in raw_metadata.items()}
    # Use the field's own name rather than parsing its string representation,
    # which would break for column names containing "<" or ":".
    print(field.name)
    for word, value in metadata.items():
        print(f'    {word} : {value}')

Gender
    Description : The passenger's Gender
Age
    Description : The passenger's Age
    Calculation : No
Siblings_on_Board
    Description : Number of sibilings that the passenger had on board
    Calculation : No
Parents_on_Board
    Description : Number of parents that the passenger had on board
    Calculation : No
Ticket_Price
    Description : Ticket's Price
    Calculation : No
Port_of_Embarkation
    Description : The port were the passenger embarked
Class
    Description : The passenger's class on the ship
Adult/Child
    Description : If the passenger is child or not
Alone
    Description : If the passenger is travelling alone or not
Survived
    Description : If the passenger survived or not
    Calculation : No
