## Imports

In [2]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import json
import jsonschema
from jsonschema.exceptions import ValidationError
import os
import pathlib
import os.path
import polars as pl 
import xarray as xr
import matplotlib.pyplot as plt
import requests
import seaborn as sns
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
import time
import os
import numpy as np 

# netCDF

## Variable Metadata

### Read

In [3]:
def read_netCDF_metadata(self, variables=None, attributes=None):
    """
    Get metadata for variables.

    Args:
        variables (list): List of variable names. If None, metadata for all variables will be retrieved.
        attributes (list): List of attribute names. If None, all attributes will be retrieved.

    Returns:
        None
    """
    def read_variable_metadata(var_name, var):
        """
        Print metadata for a variable.

        Args:
            var_name (str): Name of the variable.
            var (Variable): Variable object.

        Returns:
            None
        """
        print(f"Variable: {var_name}")
        if not var.attrs:
            if var.values is not None:
                print(f"    Values: {var.values}")
            else:
                print("No values were found")
            print("    No attributes were found for this variable.")
        else:
            print(f"    Values: {var.values}")
            print("    Attributes:")
            for key, value in var.attrs.items():
                if attributes is None or key in attributes:
                    print(f"     {key}: {value}")

    if variables is None:
        variables = get_file_variables(self)
    for var_name in variables:
        try:
            coord_var = self.coords[var_name]
            read_variable_metadata(var_name, coord_var)
        except (KeyError, AttributeError) as e:
            print(f"Error occurred while retrieving metadata for variable {var_name}: {str(e)}")

### Insert

#### Input

In [4]:
def insert_netCDF_metadata_input(self, variables=None, attributes=None, new_file=False, filename="new_file.nc",):
    """
    This function prompts the user to input metadata for the specified variables in a netCDF file.
    
    Parameters:
    - filename (str): Name of the netCDF file.
    - variables (list): List of variable names. If None, all coordinate variables are used.
    - attributes (list): List of attribute names. If None, default attributes are used.
    - new_file (bool): If True, a new netCDF file is created. If False, the existing file is used.
    
    Raises:
    - KeyError: If a variable was not found.
    - FileExistsError: If the specified file already exists.
    - ValueError: If the filename is invalid.
    """
    
    # Define default attributes if not provided
    default_attributes = [
        "Units", "Long_Name", "Standard_Name/Short_Name", 
        "Valid_Min", "Valid_Max", "Missing_Value", 
        "Fill_Value", "Scale_Factor", "Add_Offset", 
        "Coordinates", "Axis", "Description"
    ]
    if attributes is None:
        attributes = default_attributes

    if variables is None:
        variables = get_file_variables(self)

    for coord_name in variables:
        try:
            for attribute in attributes:
                self[coord_name].attrs[attribute] = input(f"{coord_name}: {attribute} - Enter value: ")
        except KeyError as e:
            raise KeyError(f"Variable {coord_name} not found.") from e
    if new_file:
        export_to_file(self,filename)
    read_netCDF_metadata(self)

#### Dictionary

In [5]:
def insert_netCDF_metadata_dict(self, dictionary, variables=None, new_file=False, filename="new_file.nc"):
    """
    Insert metadata into a netCDF file using a dictionary.

    Parameters:
    - self: The netCDF object.
    - dictionary: A dictionary containing the metadata to be inserted.
    - filename: The name of the netCDF file to be created or modified.
    - variables: A list of variables to insert the metadata into. If None, all variables will be used.
    - new_file: If True, a new file will be created. If False, the metadata will be inserted into an existing file.

    Raises:
    - ValueError: If dictionary is None.
    - AttributeError: If dictionary is not a dictionary.
    - FileExistsError: If the specified file already exists.
    - ValueError: If the filename is invalid.

    Returns:
    - None
    """
    if dictionary is None:
        raise ValueError("Please provide a dictionary.")
    if variables is None:
        variables = get_file_variables(self)
    if isinstance(dictionary, dict):
        for var in variables:
            for key, value in dictionary.items():
                self[var].attrs[key] = value
    else:
        raise AttributeError(f"{dictionary} is not a dictionary.")
    if new_file:
        export_to_file(self,filename)
    read_netCDF_metadata(self)

#### Json

In [6]:
def insert_netCDF_metadata_json(self, json_file, new_file=False, filename="new_file.nc"):
    """
    Inserts metadata from a JSON file into a netCDF file.

    Args:
        self: The instance of the class that the function belongs to.
        json_file (str): The path to the JSON file containing the metadata.
        new_file (bool, optional): A boolean flag indicating whether a new netCDF file should be created. Defaults to False.
        filename (str, optional): The name of the new netCDF file. Defaults to "new_file.nc".

    Raises:
        FileNotFoundError: If the specified filename already exists.

    Returns:
        None: The function modifies the attributes of the netCDF file directly.
    """
    schema = {
        "type": "object",
        "patternProperties": {
            ".*": {
                "type": "object",
                "patternProperties": {
                    ".*": {
                        "type": "string",
                    }
                }
            },
            "additionalProperties": False
            }   
        }   
    try:
        with open(json_file, 'r') as file:
            metadata = json.load(file)
    except IOError:
        raise IOError("Error opening JSON file. Please check if the file exists or if there are any permission issues.")
    try:
        # Validate JSON against schema
        jsonschema.validate(instance=metadata, schema=schema)
    except ValidationError as e:
        raise ValidationError(str(e))
    for var, attributes in metadata.items():
        for attr, value in attributes.items():
            self[var].attrs[attr] = value    
    if new_file:
        export_to_file(self,filename)
    read_netCDF_metadata(self)

#### All

In [7]:
def insert_netCDF_metadata(self, via="input", **kwargs):
    """
    Insert metadata into the netCDF file.

    Parameters:
        via (str, optional): The method of providing metadata. Can be "dict", "json", or "input". Defaults to "input".
        **kwargs: Additional keyword arguments for the specific method.

    Raises:
        ValueError: If `via` is not a valid metadata input.
    """
    via_lower = via.lower()
    try:
        if via_lower == "dict":
            insert_netCDF_metadata_dict(self, **kwargs)
        elif via_lower == "json":
            insert_netCDF_metadata_json(self, **kwargs)
        elif via_lower == "input":
            insert_netCDF_metadata_input(self, **kwargs)
        else:
            raise ValueError(f"{via} is not a valid metadata input.")
    except Exception as e:
        raise ValueError(f"Error inserting netCDF metadata: {str(e)}")

## Global Metadata

### Read

In [8]:
def get_attrs(self):
    return self.attrs

def read_global_metadata(self, attributes=None):
    """
    Print the global metadata attributes of the dataset.

    Args:
        attributes (list): List of attribute names to print. If None, all attributes will be printed.
    """
    attrs = get_attrs(self)
    if not attrs:
        print("No Global Attributes were found.")
    else:
        if attributes is None:
            for attr_name, attr_value in attrs.items():
                print(attr_name, ":", attr_value)
        else:
            for attr_name, attr_value in attrs.items():
                if attr_name in attributes:
                    print(attr_name, ":", attr_value)

### Insert

#### Input

In [9]:
def insert_netCDF_global_metadata_input(self, attributes=None, new_file=False, filename="new_file.nc"):
    """
    Insert global netCDF metadata attributes.

    Args:
        attributes (list): List of attributes to insert. If None, default attributes will be used.

    Returns:
        None
    """
    default_attributes = [
        "Title", "Institution", "Source",
        "History", "References", "Conventions",
        "Creator_Author", "Project", "Description"
    ]
    if attributes is None:
        attributes = default_attributes
    try:
        if not isinstance(attributes, list):
            raise ValueError("attributes must be a list")
        for attribute in attributes:
            if not isinstance(attribute, str):
                raise ValueError("attributes must contain only strings")
            self.attrs[attribute] = input(f"{attribute} - Enter value: ")
    except ValueError as e:
        print(f"An error occurred: {e}")
    if new_file:
        export_to_file(self, filename)

#### Dictionary

In [10]:
def insert_netCDF_global_metadata_dict(self, dictionary, new_file=False, filename="new_file.nc"):
    """
    Inserts a dictionary of global netCDF metadata into a netCDF file.

    Args:
        self: The instance of the class that the function belongs to.
        dictionary (dict): The dictionary of global netCDF metadata to be inserted.
        new_file (bool, optional): A boolean flag indicating whether to create a new netCDF file or not. Default is True.
        filename (str, optional): The name of the new netCDF file to be created. Default is "new_file.nc".

    Raises:
        TypeError: If the dictionary parameter is not a valid dictionary.
        FileNotFoundError: If the filename is invalid.
        FileExistsError: If the specified filename already exists.

    Returns:
        None. The function doesn't return any value.
    """
    if not isinstance(dictionary, dict):
        raise TypeError(f"{dictionary} is not a dictionary.")
    
    for key, value in dictionary.items():
        self.attrs[key] = value
    if new_file:
        export_to_file(self, filename)

#### Json

In [11]:
def insert_netCDF_global_metadata_json(self, json_file, new_file=False, filename="new_file.nc"):
    """
    Inserts global metadata from a JSON file into a netCDF file.

    Args:
        self: The instance of the class calling the function.
        json_file (str): The path to the JSON file containing the metadata.
        new_file (bool, optional): Indicates whether a new netCDF file should be created. Default is False.
        filename (str, optional): Specifies the name of the new netCDF file. Default is "new_file.nc".

    Raises:
        FileNotFoundError: If there is an error opening the JSON file.
        json.JSONDecodeError: If there is an error decoding the JSON file.
        ValueError: If the filename is invalid.
        FileExistsError: If the filename already exists.
        ValidationError: If the JSON file does not match the specified schema.

    Returns:
        None
    """
    schema = {
        "type": "object",
        "patternProperties": {
            ".*": { "type": "string" }
        },
        "additionalProperties": False
    }

    try:
        with open(json_file, 'r') as file:
            metadata = json.load(file)
    except FileNotFoundError:
        raise FileNotFoundError("Error opening JSON file. Please check if the file exists or if there are any permission issues.")
    except json.JSONDecodeError:
        raise json.JSONDecodeError("Error decoding JSON file. Please check if the file contains valid JSON.")
    
    try:
        # Validate JSON against schema
        jsonschema.validate(instance=metadata, schema=schema)
    except ValidationError as e:
        raise ValidationError(str(e))
    if new_file:
        export_to_file(self, filename)

# Parquet

### Read

In [12]:
def read_parquet_metadata(self, attributes=None, cols=None):
    """
    Reads the metadata of a Parquet file and prints the attributes of each column.

    Args:
        attributes (list, optional): A list of attributes to filter the metadata. If not provided, all attributes will be printed.
        cols (list, optional): A list of column names to filter the columns. If not provided, metadata of all columns will be printed.

    Returns:
        None

    Example Usage:
        # Example 1: Read metadata of all columns
        read_parquet_metadata()

        # Example 2: Read metadata of specific columns
        read_parquet_metadata(cols=['column1', 'column2'])

        # Example 3: Read metadata of specific attributes
        read_parquet_metadata(attributes=['attribute1', 'attribute2'])

        # Example 4: Read metadata of specific columns and attributes
        read_parquet_metadata(cols=['column1', 'column2'], attributes=['attribute1', 'attribute2'])
    """
    if isinstance(self, pd.DataFrame):
        self = pa.Table.from_pandas(self)
    if cols is None:
        for i in range(self.num_columns):
            field = self.field(i)
            col = field.name
            print(col)
            if field.metadata is None:
                print("    No attributes were found for this column.")
            else:
                metadata = {key.decode('utf-8'): value.decode('utf-8') for key, value in field.metadata.items()}
                for key, value in metadata.items():
                    if attributes is None or key in attributes:
                        print(f"    {key}: {value}")
    else:
        for i in range(self.num_columns):
            field = self.field(i)
            col = field.name
            if col in cols:
                print(col)
                if field.metadata is None:
                    print("    No attributes were found for this column.")
                else:
                    metadata = {key.decode('utf-8'): value.decode('utf-8') for key, value in field.metadata.items()}
                    if attributes:
                        for attr in attributes:
                            if attr in metadata:
                                print(f"    {attr}: {metadata[attr]}")
                            else:
                                print(f"    The '{attr}' attribute was not found in this column's metadata.")
                    else:
                        for key, value in metadata.items():
                            print(f"    {key}: {value}") 
        # TODO: Check why the else statement is much bigger than the if statement

### Insert

#### Input

In [13]:
def insert_parquet_metadata_input(self, attributes=None, cols=None, new_file=False, filename="new_file.parquet"):
    """
    Inserts metadata into a Parquet file.

    Args:
        attributes (list, optional): A list of attribute names to be used as metadata keys. Default value is ['Description', 'Units', 'Data Source', 'Valid Range or Categories'].
        cols (list, optional): A list of column names for which metadata needs to be inserted. Default value is all the columns in the DataFrame.

    Returns:
        pyarrow.Table: A Parquet table with metadata inserted.
    """
    default_attributes = ['Description', 'Units', 'Data Source', 'Valid Range or Categories']
    if attributes is None:
        attributes = default_attributes
    if cols is None:
        cols = list(self.columns)  # Suggestion 1: Use list(self.columns) instead of self.columns.tolist()
    metadata = []
    columns = self.columns  # Suggestion 2: Store self.columns in a variable
    cols_set = set(cols)  # Suggestion 3: Convert cols to a set for faster lookup
    for col in columns:
        if col in cols_set:
            col_metadata = {}
            for attribute in attributes:
                data = input(f"{col}: {attribute} - Enter value: ")
                col_metadata[attribute] = data
            metadata.append(col_metadata)
        else:
            metadata.append(None)
    dtypes = self.dtypes  # Suggestion 4: Get all column data types at once
    dtypes = ["string" if dtype == "category" else str(dtype) for dtype in dtypes]
    cols_dtypes = zip(columns, dtypes, metadata)
    schema = [pa.field(col, pa.type_for_alias(dtype), metadata=meta) for col, dtype, meta in cols_dtypes]
    table_schema = pa.schema(schema)
    table = pa.Table.from_pandas(self, schema=table_schema)
    if new_file:
        export_to_file(table, filename)
    return table

#### Dictionary

In [14]:
def insert_parquet_metadata_dict(self, dictionary, cols=None, new_file=False, filename="new_file.parquet"):
    """
    Insert metadata into a netCDF file using a dictionary.

    Parameters:
    - self: The netCDF object.
    - dictionary: A dictionary containing the metadata to be inserted.
    - filename: The name of the netCDF file to be created or modified.
    - variables: A list of variables to insert the metadata into. If None, all variables will be used.
    - new_file: If True, a new file will be created. If False, the metadata will be inserted into an existing file.

    Raises:
    - ValueError: If dictionary is None.
    - AttributeError: If dictionary is not a dictionary.
    - FileExistsError: If the specified file already exists.
    - ValueError: If the filename is invalid.

    Returns:
    - None
    """
    if dictionary is None:
        raise ValueError("Please provide a dictionary.")
    if cols is None:
        cols = list(self.columns)
    columns = self.columns  # Suggestion 2: Store self.columns in a variable
    dtypes = self.dtypes  # Suggestion 4: Get all column data types at once
    dtypes = ["string" if dtype == "category" else str(dtype) for dtype in dtypes]
    metadata = []
    if isinstance(dictionary, dict):
        cols_set = set(cols)  # Suggestion 3: Convert cols to a set for faster lookup
        for col in columns:
            if col in cols_set:
                metadata.append(dictionary)
            else:
                metadata.append(None)
        cols_dtypes = zip(columns, dtypes, metadata)
        schema = [pa.field(col, pa.type_for_alias(dtype), metadata=meta) for col, dtype, meta in cols_dtypes]
        table_schema = pa.schema(schema)
        table = pa.Table.from_pandas(self, schema=table_schema)
        if new_file:
            export_to_file(table, filename)
        return table  
    else:
        raise AttributeError(f"{dictionary} is not a dictionary.")  

#### Json

In [15]:
def insert_parquet_metadata_json(self, json_file, new_file=False, filename="new_file.parquet"):
    """
    Inserts metadata from a JSON file into a netCDF file.

    Args:
        self: The instance of the class that the function belongs to.
        json_file (str): The path to the JSON file containing the metadata.
        new_file (bool, optional): A boolean flag indicating whether a new netCDF file should be created. Defaults to False.
        filename (str, optional): The name of the new netCDF file. Defaults to "new_file.nc".

    Raises:
        FileNotFoundError: If the specified filename already exists.

    Returns:
        None: The function modifies the attributes of the netCDF file directly.
    """
    schema = {
    "type": "object",
    "patternProperties": {
        ".*": {
            "type": "object",
            "patternProperties": {
                ".*": {
                    "type": "string",
                }
            }
        },
        "additionalProperties": False
        }   
    }   
    try:
        with open(json_file, 'r') as file:
            json_data = json.load(file)
    except IOError:
        raise IOError("Error opening JSON file. Please check if the file exists or if there are any permission issues.")
    try:
        # Validate JSON against schema
        jsonschema.validate(instance=json_data, schema=schema)
    except ValidationError as e:
        raise ValidationError(str(e))
    cols_dtypes = get_df_cols_dtypes(self)
    cols_dtypes = [[col, "string"] if dtype == "category" else [col, str(dtype)] for col, dtype in cols_dtypes]
    metadata = []
    for col in cols_dtypes:
        if col[0] in json_data:
            col_metadata = json_data[col[0]]
            metadata.append(col_metadata)
        else:
            metadata.append(None)
    cols_dtypes = zip(cols_dtypes, metadata)
    schema = [pa.field(col_dtype[0], pa.type_for_alias(col_dtype[1]), metadata=meta) for col_dtype, meta in cols_dtypes]
    table_schema = pa.schema(schema)
    table = pa.Table.from_pandas(self, schema=table_schema)
    if new_file:
        export_to_file(table, filename)
    return table

# Export to File

In [16]:
def get_file_extension(filename):
    suffix = pathlib.Path(filename).suffix
    return suffix

In [17]:
def export_to_file(self, filename):
    suffixs = [".nc", ".parquet"]
    if not os.path.isfile(filename):
        if get_file_extension(filename) in suffixs:
            if get_file_extension(filename) == ".nc":
                self.to_netcdf(filename)
            elif get_file_extension(filename) == ".parquet":
                pq.write_table(self, filename, compression=None)        
        else:
            raise ValueError(f"Invalid file extension. Please provide a valid filename. Valid file extesions {suffixs}.")
    else:
        raise FileExistsError(f"{filename} already exists. Please change it or delete it.")

In [18]:
def get_file_variables(self):
    variables = list(self.variables.keys())
    return variables

In [19]:
def get_dtypes(self):
    dtypes = (str(dtype) for dtype in self.dtypes)
    return dtypes

In [20]:
def get_dtype(self, col):
    return self[col].dtype

In [21]:
def get_cols(self):
    cols = list(self.columns)
    return cols

In [22]:
def get_cols_dtypes(self):
    cols = get_cols(self)
    dtypes = get_dtypes(self)
    cols_dtypes = list(zip(cols, dtypes))
    return cols_dtypes

In [23]:
def convert_python_type(min_value, max_value):
            if isinstance(min_value, (int, np.integer)) and isinstance(max_value, (int, np.integer)):
                return int(min_value), int(max_value)
            elif isinstance(min_value, (float, np.floating)) and isinstance(max_value, (float, np.floating)):
                return float(min_value), float(max_value)
            elif isinstance(min_value, (np.bool_, bool)) and isinstance(max_value, (np.bool_, bool)):
                return bool(min_value), bool(max_value)
            else:
                return min_value, max_value

In [24]:
# Example usage:
col_min = np.bool_(True)  # Replace this line with self[column].min() in your code
col_max = np.bool_(False)  # Replace this line with self[column].max() in your code

# Convert to Python types
col_min, col_max = convert_python_type(col_min, col_max)

print(type(col_min), type(col_max))
if isinstance(col_min, int) and isinstance(col_max, int):
    if col_min >= -128 and col_max <= 127:
         print("a")
    elif col_min >= -32768 and col_max <= 32767:
         print("b")
    elif col_min >= -2147483648 and col_max <= 2147483647:
         print("c")
    else:
         print("d")

<class 'bool'> <class 'bool'>
a


In [25]:
import numpy as np

# Example numpy bool value
numpy_bool_value = np.bool_(False)  # Replace this with your numpy.bool_ value

# Convert numpy.bool_ to bool
python_bool_value = bool(numpy_bool_value)

print(type(python_bool_value))  # Check the type after conversion

<class 'bool'>


In [26]:
def get_best_dtypes(self, change_dtype=False):
    dtype_mapping = {
        (-128, 127): 'int8',
        (-32768, 32767): 'int16',
        (-2147483648, 2147483647): 'int32',
        (np.finfo(np.float16).min, np.finfo(np.float16).max): 'float16',
        (np.finfo(np.float32).min, np.finfo(np.float32).max): 'float32',
        (np.finfo(np.float64).min, np.finfo(np.float64).max): 'float64',
        (False, True): 'bool'
    }

    for column in self.columns:
        if self[column].dtype != "category":
            col_min = self[column].min()
            col_max = self[column].max()
            col_min, col_max = convert_python_type(col_min, col_max)
            print(col_min, col_max)
            print(column, type(col_min), type(col_max))
            if isinstance(col_min, int) and isinstance(col_max, int):
                if col_min >= -128 and col_max <= 127:
                    return "int8"
                elif col_min >= -32768 and col_max <= 32767:
                    return "int16"
                elif col_min >= -2147483648 and col_max <= 2147483647:
                    return "int32"
                else:
                    return "int64"
            elif isinstance(col_min, float) and isinstance(col_max, float):
                if col_min >= np.finfo(np.float16).min and col_min <= np.finfo(np.float16).max:
                    return "float16"
                elif col_max >= np.finfo(np.float32).min and col_max <= np.finfo(np.float32).max:
                    return "float32"
                else:
                    return "float64"
        else:
            return "category"
        # else:
        #     print("Debug4")
        #     dtype = 'categorical'
        # print(f'The best dtype for {column} is {dtype}')
    #     else:
    #         dtype = dtype_mapping.get((col_min, col_max), 'category')
    #         self[column] = self[column].astype(dtype)

    # print('New Dtypes')
    # print(self.dtypes)
    # return self

#get_best_dtypes(df)

In [27]:
def get_best_dtype(self, column, change_dtype=False):
        if self[column].dtype != "category":
            col_min = self[column].min()
            col_max = self[column].max()
            col_min, col_max = convert_python_type(col_min, col_max)
            if isinstance(col_min, bool) and isinstance(col_max, bool):
                return "bool"
            elif isinstance(col_min, int) and isinstance(col_max, int):
                if col_min >= -128 and col_max <= 127:
                    return "int8"
                elif col_min >= -32768 and col_max <= 32767:
                    return "int16"
                elif col_min >= -2147483648 and col_max <= 2147483647:
                    return "int32"
                else:
                    return "int64"
            elif isinstance(col_min, float) and isinstance(col_max, float):
                if col_min >= np.finfo(np.float16).min and col_min <= np.finfo(np.float16).max:
                    return "float16"
                elif col_max >= np.finfo(np.float32).min and col_max <= np.finfo(np.float32).max:
                    return "float32"
                else:
                    return "float64"
        else:
            return "category"

In [53]:
df = pd.read_parquet("./aa.parquet")
df.memory_usage().idxmax()
df.head()

Unnamed: 0,Gender,Age,Siblings_on_Board,Parents_on_Board,Ticket_Price,Port_of_Embarkation,Class,Adult/Child,Alone,Survived
0,M,22.0,1,0,7.25,Southampton,Third,Adult,False,0
1,F,38.0,1,0,71.28,Cherbourg,First,Adult,False,1
2,F,26.0,0,0,7.92,Southampton,Third,Adult,True,1
3,F,35.0,1,0,53.1,Southampton,First,Adult,False,1
4,M,35.0,0,0,8.05,Southampton,Third,Adult,True,0


In [29]:
def get_col_memory_usage(self, col, unit="kb"):
    memory_usage = self[col].memory_usage(deep=True)
    if unit == "kb":
        return f"{round(memory_usage / 1024, 2)} kb"    
    elif unit == "mb":
        return f"{round(memory_usage / (1024**2), 2)} mb"
    else:
        return memory_usage

In [91]:
def get_memory_usage(self, unit="kb"):
    total = 0
    for col in self.columns:
        memory_usage = self[col].memory_usage(deep=True)
        total += memory_usage
    if unit == "kb":
        return f"{round(total / 1024, 2)} kb"    
    elif unit == "mb":
        return f"{round(total / (1024**2), 2)} mb"
    else:
        return total

In [92]:
get_memory_usage(df)

'29.26 kb'

In [30]:
def get_null_count(self, col):
    return self[col].isnull().sum()

In [82]:
def get_null_count_percentage(self, col):
    value = self[col].isnull().sum()
    value = round((value/len(self[col])) * 100, 2)  
    return f"{value}%"

In [31]:
def get_unique_values_count(self, col):
    return self[col].nunique()

In [32]:
def get_max_value(self, col):
    if self[col].dtype != "category":
        value = self[col].max()
        return value
    else:
        value = self[col].value_counts().idxmax()
        return value

In [33]:
def get_max_value_count(self, col):
    if self[col].dtype != "category":
        value = self[col].max()
        value = self[self[col] == value][col].count()
        return value
    else:
        value = self[col].value_counts().max()
        return value

In [78]:
def get_max_value_percentage(self, col):
    if self[col].dtype != "category":
        value = self[col].max()
        value = self[self[col] == value][col].count()
        value = round((value/len(self[col])) * 100, 2)
        return f"{value}%"
    else:
        value = self[col].value_counts().max()
        value = round((value/len(self[col])) * 100, 2)
        return f"{value}%"
# TODO: Display the name of the value and col, add tip about numerical values.  

In [34]:
def get_min_value(self, col):
    if self[col].dtype != "category":
        value = self[col].min()
        return value
    else:
        value = self[col].value_counts().idxmin()
        return value

In [35]:
def get_min_value_count(self, col):
    if self[col].dtype != "category":
        value = self[col].min()
        value = self[self[col] == value][col].count()
        return value
    else:
        value = self[col].value_counts().min()
        return value

In [67]:
def get_min_value_percentage(self, col):
    if self[col].dtype != "category":
        value = self[col].min()
        value = self[self[col] == value][col].count()
        value = round((value/len(self[col])) * 100, 2)
        return f"{value}%"
    else:
        value = self[col].value_counts().min()
        value = round((value/len(self[col])) * 100, 2)
        return f"{value}%"

In [68]:
get_min_value_percentage(df, "Survived")

'61.62%'

In [84]:
dataframe = []
for col in df.columns:
    col_info = [
        col,
        get_dtype(df, col),
        get_best_dtype(df, col),
        get_col_memory_usage(df, col),
        get_null_count(df, col),
        get_null_count_percentage(df, col),
        get_unique_values_count(df, col),
        get_max_value(df, col),
        get_max_value_count(df, col),
        get_max_value_percentage(df, col),
        get_min_value(df, col),
        get_min_value_count(df, col),
        get_min_value_percentage(df, col)
    ]
    dataframe.append(col_info)

column_names = [
    'Column',
    'Dtype',
    'Recommend_Dtype',
    'Memory',
    'Missing_Values',
    'Percentage_of_Missing_Values',
    'Distinct_Values',
    'Most_Common/Max_Value',
    'Occurrences_of_Max_Value',
    'Percentages_of_Occurrences_of_Max_Value',
    'Less_Common/Min_Value',
    'Occurrences_of_Min_Value',
    'Percentage_of_Occurrences_of_Min_Value'
]

dataframe = pd.DataFrame(dataframe, columns=column_names)
dataframe.head(len(df.columns))

Unnamed: 0,Column,Dtype,Recommend_Dtype,Memory,Missing_Values,Percentage_of_Missing_Values,Distinct_Values,Most_Common/Max_Value,Occurrences_of_Max_Value,Percentages_of_Occurrences_of_Max_Value,Less_Common/Min_Value,Occurrences_of_Min_Value,Percentage_of_Occurrences_of_Min_Value
0,Gender,category,category,1.21 kb,0,0.0%,2,M,577,64.76%,F,314,35.24%
1,Age,float64,float16,7.09 kb,177,19.87%,88,80.0,1,0.11%,0.42,1,0.11%
2,Siblings_on_Board,int8,int8,1.0 kb,0,0.0%,7,8,7,0.79%,0,608,68.24%
3,Parents_on_Board,int8,int8,1.0 kb,0,0.0%,7,6,1,0.11%,0,678,76.09%
4,Ticket_Price,float64,float16,7.09 kb,0,0.0%,236,512.33,3,0.34%,0.0,15,1.68%
5,Port_of_Embarkation,category,category,1.3 kb,2,0.22%,3,Southampton,644,72.28%,Queenstown,77,8.64%
6,Class,category,category,1.28 kb,0,0.0%,3,Third,491,55.11%,Second,184,20.65%
7,Adult/Child,category,category,1.22 kb,0,0.0%,2,Adult,808,90.68%,Child,83,9.32%
8,Alone,bool,bool,1.0 kb,0,0.0%,2,True,537,60.27%,False,354,39.73%
9,Survived,int64,int8,7.09 kb,0,0.0%,2,1,342,38.38%,0,549,61.62%


# Telegram

## Send images

In [40]:
base_url = "https://api.telegram.org/bot6148622889:AAFHdvQ_CxImlx1VEXE_vYhg4_2NFXk1OyU/sendPhoto"
def send_images_via_telegram(file_path):
    my_file = open(file_path, 'rb')
    parameters = {
    "chat_id" : "-935188347",
    "caption" : "This is a caption"
    }
    files = {   
    "photo" : my_file
    }
    resp = requests.post(base_url, data=parameters, files=files)
    print(resp.status_code)

# Slack

## Send images

In [41]:
slack_token = "xoxp-5452682117826-5446024818310-5452588829571-92e60adc3ecd07a736b6faea910b8831"
channel_id = "C05DAGDAPEX"
client = WebClient(token=slack_token)
def send_images_via_slack(file_path):
    try:
        response = client.files_upload(
                channels=channel_id,
                file=file_path
                )
        print(response)
    except SlackApiError as e:
        print(f"Error uploading file: {e.response['error']}")

# Testing

## netCDF

In [42]:
# df = xr.open_dataset("example.nc")
# read_netCDF_metadata(df)

In [43]:
# variables = ["lat", "lon"]
# attributes= ["Units"]
# insert_netCDF_metadata(df, via="input", variables=variables, attributes=attributes, new_file=True, filename="wdwee.nbc")

In [44]:
# wdwee = xr.open_dataset("wdwee.nc")
# read_netCDF_metadata(wdwee)

## Parquet

In [45]:
# df = pq.read_table("./updated_Titanic.parquet")
# read_parquet_metadata(df)

In [46]:
# df = pd.read_parquet("./Titanic.parquet")
# get_df_cols_dtypes(df)
# df = insert_parquet_metadata_json(df, "./bosses.json", True, "ddwd.parquet")
# df = pq.read_table("./ddwd.parquet")
# read_parquet_metadata(df)

Gender
    units: Celsius
    description: Temperature dwd
Age
    No attributes were found for this column.
Siblings_on_Board
    No attributes were found for this column.
Parents_on_Board
    No attributes were found for this column.
Ticket_Price
    No attributes were found for this column.
Port_of_Embarkation
    No attributes were found for this column.
Class
    No attributes were found for this column.
Adult/Child
    No attributes were found for this column.
Alone
    units: Pascal
    description: Pressure readings
Survived
    No attributes were found for this column.


In [47]:
# df = xr.open_dataset("./dw.nc")
# get_file_variables(df)

['lat', 'lon', 'los', 'pair', 'ref', 'rep']