## Imports

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import json
import jsonschema
from jsonschema.exceptions import ValidationError
import os
from tqdm.notebook  import tqdm
import pathlib
import os.path
import polars as pl 
import xarray as xr
import matplotlib.pyplot as plt
import requests
import seaborn as sns
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
import time
import os
import numpy as np
from IPython.display import display

In [2]:
class Metadata:
    def read_netCDF_metadata(self, variables=None, attributes=None):
        """
        Read and print metadata information from a NetCDF file.

        Args:
            variables (list, optional): A list of variable names to retrieve metadata for. If not specified, all variables in the NetCDF file will be retrieved.
            attributes (list, optional): A list of attribute names to retrieve for each variable. If not specified, all attributes for each variable will be retrieved.

        Returns:
            None

        Example Usage:
            # Read metadata for all variables in the NetCDF file
            read_netCDF_metadata()

            # Read metadata for specific variables in the NetCDF file
            read_netCDF_metadata(variables=['temperature', 'humidity'])

            # Read metadata for specific attributes of all variables in the NetCDF file
            read_netCDF_metadata(attributes=['units', 'long_name'])
        """

        def read_variable_metadata(var_name, var):
            print(f"Variable: {var_name}")
            if not var.attrs:
                if var.values is not None:
                    print(f"    Values: {var.values}")
                else:
                    print("No values were found")
                print("    No attributes were found for this variable.")
            else:
                print(f"    Values: {var.values}")
                print("    Attributes:")
                for key, value in var.attrs.items():
                    if attributes is None or key in attributes:
                        print(f"     {key}: {value}")

        if variables is None:
            variables = Statistics.get_file_variables(self)
        for var_name in variables:
            try:
                coord_var = self.coords[var_name]
                read_variable_metadata(var_name, coord_var)
            except (KeyError, AttributeError) as e:
                print(f"Error occurred while retrieving metadata for variable {var_name}: {str(e)}")
    def insert_netCDF_metadata_input(self, variables=None, attributes=None, new_file=False, filename="new_file.nc"):
        """
        Prompt the user for attribute values and store them on dataset variables.

        ``self`` is the dataset object itself; each variable is reached through
        ``self[name].attrs``.

        Parameters:
        - variables (list): List of variable names. If None, the names returned by
          ``Statistics.get_file_variables`` are used.
        - attributes (list): List of attribute names to prompt for. If None, a default
          list of attributes is used.
        - new_file (bool): If True, the dataset is exported with ``File.export_to_file``.
        - filename (str): Target file name used when ``new_file`` is True.

        Raises:
        - KeyError: If a variable was not found.
        - FileExistsError / ValueError: Propagated from the export step when
          ``new_file`` is True.
        """
        if attributes is None:
            attributes = [
                "Units", "Long_Name", "Standard_Name/Short_Name",
                "Valid_Min", "Valid_Max", "Missing_Value",
                "Fill_Value", "Scale_Factor", "Add_Offset",
                "Coordinates", "Axis", "Description",
            ]

        if variables is None:
            # The bare name ``get_file_variables`` is not defined at module
            # level; the helper lives on ``Statistics``.
            variables = Statistics.get_file_variables(self)

        for coord_name in variables:
            try:
                # Resolve the variable before prompting so a missing variable
                # fails immediately instead of after the user typed a value.
                target_attrs = self[coord_name].attrs
            except KeyError as e:
                raise KeyError(f"Variable {coord_name} not found.") from e
            for attribute in attributes:
                target_attrs[attribute] = input(f"{coord_name}: {attribute} - Enter value: ")
        if new_file:
            File.export_to_file(self, filename)
        # Echo the resulting metadata so the user can review what was stored.
        Metadata.read_netCDF_metadata(self)
    def insert_netCDF_metadata_dict(self, dictionary, variables=None, new_file=False, filename="new_file.nc"):
        """
        Insert metadata into a netCDF file using a dictionary.

        Parameters:
        - self: The netCDF object.
        - dictionary: A dictionary containing the metadata to be inserted.
        - filename: The name of the netCDF file to be created or modified.
        - variables: A list of variables to insert the metadata into. If None, all variables will be used.
        - new_file: If True, a new file will be created. If False, the metadata will be inserted into an existing file.

        Raises:
        - ValueError: If dictionary is None.
        - AttributeError: If dictionary is not a dictionary.
        - FileExistsError: If the specified file already exists.
        - ValueError: If the filename is invalid.

        Returns:
        - None
        """
        if dictionary is None:
            raise ValueError("Please provide a dictionary.")
        if variables is None:
            variables = get_file_variables(self)
        if isinstance(dictionary, dict):
            for var in variables:
                for key, value in dictionary.items():
                    self[var].attrs[key] = value
        else:
            raise AttributeError(f"{dictionary} is not a dictionary.")
        if new_file:
            File.export_to_file(self,filename)
        Metadata.read_netCDF_metadata(self)
    def insert_netCDF_metadata_json(self, json_file, new_file=False, filename="new_file.nc"):
        """
        Inserts metadata from a JSON file into a netCDF file.

        Args:
            self: The instance of the class that the function belongs to.
            json_file (str): The path to the JSON file containing the metadata.
            new_file (bool, optional): A boolean flag indicating whether a new netCDF file should be created. Defaults to False.
            filename (str, optional): The name of the new netCDF file. Defaults to "new_file.nc".

        Raises:
            FileNotFoundError: If the specified filename already exists.

        Returns:
            None: The function modifies the attributes of the netCDF file directly.
        """
        schema = {
            "type": "object",
            "patternProperties": {
                ".*": {
                    "type": "object",
                    "patternProperties": {
                        ".*": {
                            "type": "string",
                        }
                    }
                },
                "additionalProperties": False
                }   
            }   
        try:
            with open(json_file, 'r') as file:
                metadata = json.load(file)
        except IOError:
            raise IOError("Error opening JSON file. Please check if the file exists or if there are any permission issues.")
        try:
            # Validate JSON against schema
            jsonschema.validate(instance=metadata, schema=schema)
        except ValidationError as e:
            raise ValidationError(str(e))
        for var, attributes in metadata.items():
            for attr, value in attributes.items():
                self[var].attrs[attr] = value    
        if new_file:
            File.export_to_file(self,filename)
        Metadata.read_netCDF_metadata(self)
    def insert_netCDF_metadata(self, via="input", **kwargs):
        """
        Insert metadata into the netCDF file.

        Parameters:
            via (str, optional): The method of providing metadata. Can be "dict", "json", or "input". Defaults to "input".
            **kwargs: Additional keyword arguments for the specific method.

        Raises:
            ValueError: If `via` is not a valid metadata input.
        """
        via_lower = via.lower()
        try:
            if via_lower == "dict":
                self.insert_netCDF_metadata_dict(self, **kwargs)
            elif via_lower == "json":
                self.insert_netCDF_metadata_json(self, **kwargs)
            elif via_lower == "input":
                self.insert_netCDF_metadata_input(self, **kwargs)
            else:
                raise ValueError(f"{via} is not a valid metadata input.")
        except Exception as e:
            raise ValueError(f"Error inserting netCDF metadata: {str(e)}")
    def get_attrs(self):
        return self.attrs
    def read_global_metadata(self, attributes=None):
        """
        Print the global metadata attributes of the dataset.

        Args:
            attributes (list): List of attribute names to print. If None, all attributes will be printed.
        """
        attrs = Metadata.get_attrs(self)
        if not attrs:
            print("No Global Attributes were found.")
        else:
            if attributes is None:
                for attr_name, attr_value in attrs.items():
                    print(attr_name, ":", attr_value)
            else:
                for attr_name, attr_value in attrs.items():
                    if attr_name in attributes:
                        print(attr_name, ":", attr_value)
    def insert_netCDF_global_metadata_input(self, attributes=None, new_file=False, filename="new_file.nc"):
        """
        Insert global metadata into a netCDF file.

        Args:
            attributes (list, optional): A list of attribute names for which the user will be prompted to enter values. 
                If not provided, a default list of attributes will be used.
            new_file (bool, optional): A boolean indicating whether a new file should be created. 
                If True, the metadata will be exported to a file specified by the filename parameter. 
                Default is False.
            filename (str, optional): The name of the file to which the metadata should be exported if new_file is True. 
                Default is "new_file.nc".

        Returns:
            None. The function modifies the metadata of the netCDF file and optionally exports it to a new file.
        """
        default_attributes = [
            "Title", "Institution", "Source",
            "History", "References", "Conventions",
            "Creator_Author", "Project", "Description"
        ]
        if attributes is None:
            attributes = default_attributes
        try:
            if not isinstance(attributes, list):
                raise ValueError("attributes must be a list")
            for attribute in attributes:
                if not isinstance(attribute, str):
                    raise ValueError("attributes must contain only strings")
                self.attrs[attribute] = input(f"{attribute} - Enter value: ")
        except ValueError as e:
            print(f"An error occurred: {e}")
        if new_file:
            File.export_to_file(self, filename)
        Metadata.read_global_metadata(self)
    def insert_netCDF_global_metadata_dict(self, dictionary, new_file=False, filename="new_file.nc"):
        """
        Insert global metadata into a netCDF file.

        Args:
            self (NetCDFFile): An instance of the NetCDFFile class.
            dictionary (dict): A dictionary containing the global metadata to be inserted into the netCDF file.
            new_file (bool, optional): A boolean flag indicating whether to export the modified netCDF file to a new file. Default is False.
            filename (str, optional): The filename of the new netCDF file to be exported. Default is 'new_file.nc'.

        Raises:
            TypeError: If the dictionary input is not of type dict.

        Returns:
            None. The function modifies the netCDF file by inserting the global metadata attributes. If new_file is True, it also exports the modified netCDF file to a new file.
        """
        if not isinstance(dictionary, dict):
            raise TypeError(f"{dictionary} is not a dictionary.")
        
        for key, value in dictionary.items():
            self.attrs[key] = value
        if new_file:
            File.export_to_file(self, filename)
        Metadata.read_global_metadata(self)
    def insert_netCDF_global_metadata_json(self, json_file, new_file=False, filename="new_file.nc"):
        """
        Inserts global metadata from a JSON file into a netCDF file.

        Args:
            self: The instance of the class calling the function.
            json_file (str): The path to the JSON file containing the metadata.
            new_file (bool, optional): Indicates whether a new netCDF file should be created. Default is False.
            filename (str, optional): Specifies the name of the new netCDF file. Default is "new_file.nc".

        Raises:
            FileNotFoundError: If there is an error opening the JSON file.
            json.JSONDecodeError: If there is an error decoding the JSON file.
            ValueError: If the filename is invalid.
            FileExistsError: If the filename already exists.
            ValidationError: If the JSON file does not match the specified schema.

        Returns:
            None
        """
        schema = {
            "type": "object",
            "patternProperties": {
                ".*": { "type": "string" }
            },
            "additionalProperties": False
        }

        try:
            with open(json_file, 'r') as file:
                metadata = json.load(file)
        except FileNotFoundError:
            raise FileNotFoundError("Error opening JSON file. Please check if the file exists or if there are any permission issues.")
        except json.JSONDecodeError:
            raise json.JSONDecodeError("Error decoding JSON file. Please check if the file contains valid JSON.")
        
        try:
            # Validate JSON against schema
            jsonschema.validate(instance=metadata, schema=schema)
        except ValidationError as e:
            raise ValidationError(str(e))
        if new_file:
            File.export_to_file(self, filename)
        Metadata.read_global_metadata(self)
    def read_parquet_metadata(self, attributes=None, cols=None):
        """
        Reads the metadata of a Parquet file and prints the attributes of each column.

        Args:
            attributes (list, optional): A list of attributes to filter the metadata. If not provided, all attributes will be printed.
            cols (list, optional): A list of column names to filter the columns. If not provided, metadata of all columns will be printed.

        Returns:
            None

        Example Usage:
            # Example 1: Read metadata of all columns
            read_parquet_metadata()

            # Example 2: Read metadata of specific columns
            read_parquet_metadata(cols=['column1', 'column2'])

            # Example 3: Read metadata of specific attributes
            read_parquet_metadata(attributes=['attribute1', 'attribute2'])

            # Example 4: Read metadata of specific columns and attributes
            read_parquet_metadata(cols=['column1', 'column2'], attributes=['attribute1', 'attribute2'])
        """
        if isinstance(self, pd.DataFrame):
            self = pa.Table.from_pandas(self)
        if cols is None:
            for i in range(self.num_columns):
                field = self.field(i)
                col = field.name
                print(col)
                if field.metadata is None:
                    print("    No attributes were found for this column.")
                else:
                    metadata = {key.decode('utf-8'): value.decode('utf-8') for key, value in field.metadata.items()}
                    if attributes:
                        for attr in attributes:
                            if attr in metadata:
                                print(f"    {attr}: {metadata[attr]}")
                            else:
                                print(f"    The '{attr}' attribute was not found in this column's metadata.")
                    else:
                        for key, value in metadata.items():
                            print(f"    {key}: {value}") 
        else:
            for i in range(self.num_columns):
                field = self.field(i)
                col = field.name
                if col in cols:
                    print(col)
                    if field.metadata is None:
                        print("    No attributes were found for this column.")
                    else:
                        metadata = {key.decode('utf-8'): value.decode('utf-8') for key, value in field.metadata.items()}
                        if attributes:
                            for attr in attributes:
                                if attr in metadata:
                                    print(f"    {attr}: {metadata[attr]}")
                                else:
                                    print(f"    The '{attr}' attribute was not found in this column's metadata.")
                        else:
                            for key, value in metadata.items():
                                print(f"    {key}: {value}")
    def insert_parquet_metadata_input(self, attributes=None, cols=None, new_file=False, filename="new_file.parquet"):
        """
        Prompt the user for column metadata and build an Arrow table carrying it.

        ``self`` is the pandas DataFrame to convert.

        Args:
            attributes (list, optional): Attribute names to prompt for on each selected column. If not provided, default attributes are used.
            cols (list, optional): Columns to prompt for. If not provided, every column of the DataFrame is used. Columns not listed get no metadata.
            new_file (bool, optional): If True, the table is exported with ``File.export_to_file``. Default is False.
            filename (str, optional): Target file name used when new_file is True. Default is "new_file.parquet".

        Returns:
            pyarrow.Table: A table built from the DataFrame with the entered metadata attached to its fields.

        Example Usage:
            # Prompt for metadata on all columns of a DataFrame
            table = Metadata.insert_parquet_metadata_input(df)

            # Prompt for specific columns/attributes and export to a new Parquet file
            table = Metadata.insert_parquet_metadata_input(df, attributes=['Description', 'Units'], cols=['col1', 'col2'], new_file=True, filename='metadata.parquet')
        """
        def arrow_type(col, dtype):
            # Categorical columns are written as plain strings.
            alias = "string" if dtype == "category" else str(dtype)
            try:
                return pa.type_for_alias(alias)
            except ValueError:
                # pandas dtype names such as "object" or "datetime64[ns]" are
                # not pyarrow aliases; infer the Arrow type from the data.
                return pa.Array.from_pandas(self[col]).type

        if attributes is None:
            attributes = ['Description', 'Units', 'Data Source', 'Valid Range or Categories']
        if cols is None:
            cols = list(self.columns)
        selected = set(cols)
        columns = list(self.columns)
        # Resolve every Arrow type before prompting so an unsupported column
        # fails before the user has typed anything.
        arrow_types = [arrow_type(col, dtype) for col, dtype in zip(columns, self.dtypes)]
        fields = []
        for col, col_type in zip(columns, arrow_types):
            col_metadata = None
            if col in selected:
                col_metadata = {
                    attribute: input(f"{col}: {attribute} - Enter value: ")
                    for attribute in attributes
                }
            fields.append(pa.field(col, col_type, metadata=col_metadata))
        table = pa.Table.from_pandas(self, schema=pa.schema(fields))
        if new_file:
            File.export_to_file(table, filename)
        return table
    def insert_parquet_metadata_dict(self, dictionary, cols=None, new_file=False, filename="new_file.parquet"):
        """
        Inserts metadata into a Parquet file based on a given dictionary.

        Args:
            dictionary (dict): A dictionary containing the metadata to be inserted into the Parquet file.
            cols (list, optional): A list of column names to specify which columns the metadata should be inserted into. 
                If not provided, metadata will be inserted into all columns. Default is None.
            new_file (bool, optional): A boolean value indicating whether to create a new Parquet file with the inserted metadata. 
                Default is False.
            filename (str, optional): The name of the new Parquet file to be created. Default is "new_file.parquet".

        Returns:
            pyarrow.Table: A Parquet table with the inserted metadata.

        Raises:
            ValueError: If the dictionary parameter is not provided.
            AttributeError: If the dictionary parameter is not a dictionary.

        Example Usage:
            # Create a DataFrame
            df = pd.DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'c']})

            # Define a dictionary with metadata
            metadata_dict = {'A': 'This is column A', 'B': 'This is column B'}

            # Insert metadata into the Parquet file
            df.insert_parquet_metadata_dict(metadata_dict, new_file=True, filename='new_file.parquet')
        """

        if dictionary is None:
            raise ValueError("Please provide a dictionary.")
        if cols is None:
            cols = list(self.columns)
        columns = self.columns
        dtypes = self.dtypes
        dtypes = ["string" if dtype == "category" else str(dtype) for dtype in dtypes]
        metadata = []
        if isinstance(dictionary, dict):
            cols_set = set(cols)
            for col in columns:
                if col in cols_set:
                    metadata.append(dictionary)
                else:
                    metadata.append(None)
            cols_dtypes = zip(columns, dtypes, metadata)
            schema = [pa.field(col, pa.type_for_alias(dtype), metadata=meta) for col, dtype, meta in cols_dtypes]
            table_schema = pa.schema(schema)
            table = pa.Table.from_pandas(self, schema=table_schema)
            if new_file:
                File.export_to_file(table, filename)
            return table  
        else:
            raise AttributeError(f"{dictionary} is not a dictionary.")
    def insert_parquet_metadata_json(self, json_file, new_file=False, filename="new_file.parquet"):
        """
        Inserts metadata from a JSON file into a Parquet file.

        Args:
            json_file (str): The path to the JSON file containing the metadata.
            new_file (bool, optional): Indicates whether a new Parquet file should be created. Defaults to False.
            filename (str, optional): The name of the new Parquet file. Defaults to "new_file.parquet".

        Returns:
            pyarrow.Table: The Parquet table with the updated metadata.

        Raises:
            IOError: If there is an error opening the JSON file.
            ValidationError: If the JSON data does not match the predefined schema.
        """
        schema = {
            "type": "object",
            "patternProperties": {
                ".*": {
                    "type": "object",
                    "patternProperties": {
                        ".*": {
                            "type": "string",
                        }
                    }
                },
                "additionalProperties": False
            }
        }
        try:
            with open(json_file, 'r') as file:
                json_data = json.load(file)
        except IOError:
            raise IOError("Error opening JSON file. Please check if the file exists or if there are any permission issues.")
        try:
            # Validate JSON against schema
            jsonschema.validate(instance=json_data, schema=schema)
        except ValidationError as e:
            raise ValidationError(str(e))
        cols_dtypes = Statistics.get_cols_dtypes(self)
        cols_dtypes = [[col, "string"] if dtype == "category" else [col, str(dtype)] for col, dtype in cols_dtypes]
        metadata = []
        for col in cols_dtypes:
            if col[0] in json_data:
                col_metadata = json_data[col[0]]
                metadata.append(col_metadata)
            else:
                metadata.append(None)
        cols_dtypes = zip(cols_dtypes, metadata)
        schema = [pa.field(col_dtype[0], pa.type_for_alias(col_dtype[1]), metadata=meta) for col_dtype, meta in cols_dtypes]
        table_schema = pa.schema(schema)
        table = pa.Table.from_pandas(self, schema=table_schema)
        if new_file:
            File.export_to_file(table, filename)
        return table 

In [3]:
import os
import pandas as pd
import xarray as xr

class File:
    """Static helpers for reading data files and exporting data objects."""

    # pandas reader function name per lower-cased file extension. The name is
    # resolved with getattr only when that format is requested.
    _PANDAS_READERS = {
        ".csv": "read_csv",
        ".parquet": "read_parquet",
        ".json": "read_json",
        ".xlsx": "read_excel",
        ".xml": "read_xml",
        ".feather": "read_feather",
        ".html": "read_html",
    }

    @staticmethod
    def get_file_extension(path):
        """Return the extension of ``path`` including the leading dot ('' if there is none)."""
        return os.path.splitext(path)[1]

    @staticmethod
    def read_file(path, **kwargs):
        """
        Read a file with the reader that matches its extension.

        Args:
            path: The path to the file. The extension is matched case-insensitively.
            **kwargs: Additional keyword arguments passed to the underlying reader.

        Returns:
            Whatever the selected reader returns: the result of the matching
            ``pandas.read_*`` function, or of ``xarray.open_dataset`` for ``.nc`` files.

        Raises:
            ValueError: If ``path`` is not an existing file.
            RuntimeError: If the format is not supported or the reader fails;
                the original exception is chained as the cause.
        """
        if not os.path.isfile(path):
            raise ValueError("Invalid file path.")

        try:
            extension = File.get_file_extension(path).lower()
            if extension == ".nc":
                return xr.open_dataset(path, **kwargs)
            reader_name = File._PANDAS_READERS.get(extension)
            if reader_name is None:
                raise ValueError(
                    f"Unsupported file format for {path}. Supported formats: "
                    "CSV, Parquet, JSON, Excel (.xlsx), XML, Feather, HTML, NetCDF (.nc)"
                )
            return getattr(pd, reader_name)(path, **kwargs)
        except Exception as e:
            raise RuntimeError(f"Error in reading the file {path}: {e}") from e

    @staticmethod
    def export_to_file(data, filename):
        """
        Export ``data`` to ``filename``; see the module-level ``export_to_file``.

        Provided on the class because callers in this file invoke the export as
        ``File.export_to_file(obj, filename)``.
        """
        export_to_file(data, filename)


def get_file_extension(filename):
    """Return the final suffix of ``filename`` (including the dot), via ``pathlib``."""
    return pathlib.Path(filename).suffix


def export_to_file(self, filename):
    """
    Exports data to a file with a specified filename.

    Args:
        self: The data object to export. For ``.nc`` targets its ``to_netcdf``
            method is called; for ``.parquet`` targets it is passed to
            ``pq.write_table``.
        filename (str): The name of the file to export the data to.

    Raises:
        FileExistsError: If the file already exists (checked before the extension).
        ValueError: If the file extension is not ``.nc`` or ``.parquet``.
    """
    suffixs = [".nc", ".parquet"]
    if os.path.isfile(filename):
        raise FileExistsError(f"{filename} already exists. Please change it or delete it.")
    # Use the module-level helper: ``self`` is the data being exported and has
    # no ``get_file_extension`` method of its own.
    extension = get_file_extension(filename)
    if extension not in suffixs:
        raise ValueError(f"Invalid file extension. Please provide a valid filename. Valid file extesions {suffixs}.")
    if extension == ".nc":
        self.to_netcdf(filename)
    else:
        pq.write_table(self, filename, compression=None)

In [4]:
class Statistics:
    def get_file_variables(self):
        """
        Get the variables of the file.

        Returns:
            list: A list of variables in the file.
        """
        variables = list(self.variables.keys())
        return variables
    def get_dtypes(self, cols=None, output=True):
        """
        Get the data types of the specified columns.

        Args:
            cols (list): List of column names. If None, all columns will be used.
            output (bool): If True, print the data types. Default is True.

        Returns:
            list: List of data types.

        """
        if cols is None:
            cols = self.columns
        if output:
            for col in cols:
                print(f"{col} dtype is {self[col].dtype.name}")
        dtypes = [self[col].dtype.name for col in cols]
        return dtypes
    def get_cols(self):
        """
        Get the column names of the DataFrame.

        Returns:
            list: A list of column names.
        """
        try:
            return self.columns.tolist()
        except Exception as e:
            print(f"Error occurred while accessing self.columns: {e}")
            return []
    def get_cols_dtypes(self, cols=None, get_df=True):
        """
        Returns the data types of the specified columns in a DataFrame.

        Args:
            cols (list, optional): A list of column names to get the data types for. If not provided, it gets the data types for all columns in the DataFrame.
            get_df (bool, optional): A boolean flag indicating whether to return the data types as a DataFrame. Default is True.

        Returns:
            If get_df is True, returns a DataFrame with the column names and their data types.
            If get_df is False, returns a dictionary with column names as keys and their corresponding data types as values.
        
        Raises:
            ValueError: If the number of columns and the number of data types do not match.
        """
        if cols is None:
            cols = self.columns
        dtypes = []
        for col in cols:
            dtypes.append(Statistics.get_dtypes(self, [col], output=False))
        if len(cols) != len(dtypes):
            raise ValueError("Number of columns and number of data types do not match.")
        cols_dtypes = {col: dtype for col, dtype in zip(cols, dtypes)}
        if get_df:
            cols_info = [[col, str(dtype).strip("[]'")] for col, dtype in zip(cols, dtypes)]
            columns_name = ["Column_Name", "Dtype"]
            dataframe = pd.DataFrame(cols_info, columns=columns_name)
            return dataframe
        return cols_dtypes
    def convert_python_type(min_value, max_value):
        """
        Convert the minimum and maximum values of a given type to the appropriate Python data type.

        Args:
            min_value: The minimum value of a given type.
            max_value: The maximum value of a given type.

        Returns:
            A tuple containing the converted min_value and max_value.

        Raises:
            ValueError: If min_value and max_value are not of the same type or if they are not of a valid numeric or boolean type.
        """
        if type(min_value) != type(max_value):
            raise ValueError("min_value and max_value must be of the same type")

        if not isinstance(min_value, (int, np.integer, float, np.floating, np.bool_, bool)):
            raise ValueError("Invalid input: min_value must be numeric or boolean.")
        if not isinstance(max_value, (int, np.integer, float, np.floating, np.bool_, bool)):
            raise ValueError("Invalid input: max_value must be numeric or boolean.")

        if isinstance((min_value, max_value), (int, np.integer)):
            return int(min_value), int(max_value)
        elif isinstance((min_value, max_value), (float, np.floating)):
            return float(min_value), float(max_value)
        elif isinstance((min_value, max_value), (np.bool_, bool)):
            return bool(min_value), bool(max_value)
        else:
            return min_value, max_value
    def get_best_dtypes(self, cols=None, convert=False, output=True, show_df=False, get_dict=False):
        """
        Determines the best data type for each column in a DataFrame.

        Args:
            cols (list, optional): A list of column names. If not provided, all columns in the DataFrame will be considered.
            convert (bool, optional): Indicates whether to convert the columns to the best data type. Default is False.
            output (bool, optional): Indicates whether to print the best data type for each column. Default is True.
            show_df (bool, optional): Indicates whether to return a DataFrame with the column names and their best data types. Default is False.

        Returns:
            str or DataFrame or None: 
                - If convert and show_df parameters are False, returns the best data type for each column as a string.
                - If convert parameter is True, returns the modified DataFrame with columns converted to the best data types.
                - If show_df parameter is True, returns a DataFrame with the column names and their best data types.
                - Otherwise, returns None.

        Raises:
            Exception: If an error occurs while processing a column.
        """
        if cols is None:
            cols = self.columns
        if show_df:
            output = False
            dataframe = []
            dataframe1 = Statistics.get_cols_dtypes(self, get_df=True)
        if get_dict:
            num_of_memory = {}
        for col in cols:
            try:
                is_numeric = pd.api.types.is_numeric_dtype(self[col])
                is_bool = pd.api.types.is_bool_dtype(self[col])
                is_integer = pd.api.types.is_integer_dtype(self[col])
                is_float = pd.api.types.is_float_dtype(self[col])

                if is_numeric:
                    col_min = self[col].min()
                    col_max = self[col].max()
                    col_min, col_max = Statistics.convert_python_type(col_min, col_max)

                    if is_bool:
                        col_dtype = "bool"
                    elif is_integer:
                        if col_min >= -128 and col_max <= 127:
                            col_dtype = "int8"
                        elif col_min >= -32768 and col_max <= 32767:
                            col_dtype = "int16"
                        elif col_min >= -2147483648 and col_max <= 2147483647:
                            col_dtype = "int32"
                        else:
                            col_dtype = "int64"
                    elif is_float:
                        if col_min >= np.finfo(np.float16).min and col_min <= np.finfo(np.float16).max:
                            col_dtype = "float16"
                        elif col_max >= np.finfo(np.float32).min and col_max <= np.finfo(np.float32).max:
                            col_dtype = "float32"
                        else:
                            col_dtype = "float64"
                    else:
                        col_dtype = "category"

                    if output:
                        print(f"The best dtype for {col} is {col_dtype}")
                        if col_dtype == 'int8':
                            if self[col].nunique(dropna=False) == 2:
                                print("But consider changing it to bool, has you have 2 unique values so you can map the numbers to be True or False")
                            if convert:
                                self[col] = self[col].astype(col_dtype)
                    elif show_df:
                        col_info = [col, col_dtype]
                        dataframe.append(col_info)
                        if convert:
                            self[col] = self[col].astype(col_dtype)
                    elif convert:
                        self[col] = self[col].astype(col_dtype)
                    else:
                        return col_dtype

                else:
                    col_dtype = "category"
                    if output:
                        print(f"The best dtype for {col} is {col_dtype}")
                        if self[col].nunique(dropna=False) == 2:
                            print("But consider changing it to bool, has you have 2 unique values so you can map the numbers to be True or False")
                        if convert:
                            self[col] = self[col].astype(col_dtype)
                    elif show_df:
                        col_info = [col, col_dtype]
                        dataframe.append(col_info)
                        if convert:
                            self[col] = self[col].astype(col_dtype)
                    elif convert:
                        self[col] = self[col].astype(col_dtype)
                    else:
                        return col_dtype

            except Exception as e:
                print(f"Error on processing columm {col}: {e}")

        if show_df and convert:
            dataframe = pd.DataFrame(dataframe, columns=["Column_Name", "Best_Dtype"])
            dataframe = dataframe1.merge(dataframe, how="inner", on="Column_Name")
            display(dataframe)
            return self
        elif convert:
            return self
        elif show_df:
            dataframe1 = Statistics.get_cols_dtypes(self, get_df=True)
            dataframe = pd.DataFrame(dataframe, columns=["Column_Name", "Best_Dtype"])
            dataframe = dataframe1.merge(dataframe, how="inner", on="Column_Name")
            return dataframe
    def get_memory_usage(self, cols=None, output=True, get_total=True, show_df=False, unit="kb", use_deep=True, get_dict=False):
        """
        Calculate the memory usage of each column in a DataFrame and provide options to display the results, calculate the total memory usage, and return the information as a DataFrame or dictionary.

        Parameters:
        - cols (optional): A list of column names to calculate the memory usage for. If not provided, memory usage will be calculated for all columns in the DataFrame.
        - output (optional): A boolean flag indicating whether to print the memory usage for each column. Default is True.
        - get_total (optional): A boolean flag indicating whether to calculate the total memory usage. Default is True.
        - show_df (optional): A boolean flag indicating whether to return the memory usage as a DataFrame. Default is False.
        - unit (optional): The unit of memory usage to be displayed. Supported values are "kb" (kilobytes), "mb" (megabytes), and "b" (bytes). Default is "kb".
        - use_deep (optional): A boolean flag indicating whether to include the memory usage of referenced objects. Default is True.
        - get_dict (optional): A boolean flag indicating whether to return the memory usage as a dictionary. Default is False.

        Returns:
        - If output parameter is True, the memory usage for each column will be printed.
        - If get_total parameter is True, the total memory usage will be returned as a float.
        - If show_df parameter is True, a DataFrame with the column names and memory usage will be returned.
        - If get_dict parameter is True, a dictionary with the column names as keys and memory usage as values will be returned.
        """

        if cols is None:
            cols = self.columns
        supported_bytes = ["kb", "mb", "b"]
        assert unit in supported_bytes, f"{unit} not supported. Units supported is bytes(b), kilobytes(kb) and megabytes(mb)."
        if get_total:
            total = 0
        if show_df:
            dataframe = []
            output = False
        if get_dict:
            get_total = False
            num_of_memory = {}
            num_of_memory.update([("unit", unit)])
        conversion_factors = {
            "kb": 1024,
            "mb": 1024**2,
            "b": 1
        }
        conversion_factor = conversion_factors[unit]
        for col in cols:
            memory_usage = self[col].memory_usage(deep=use_deep)
            value = round(memory_usage / conversion_factor, 2)
            if output:
                print(f"Column: {col} uses {value}{unit}.")
            if get_total:
                total += value   
            if show_df:
                col_info = [col, value]
                dataframe.append(col_info)
            if get_dict:
                num_of_memory.update([(col, value)])    
        if show_df:
            collums = ["Col_Name", f"Memory_Usage({unit})"]
            if get_total:
                dataframe.append(["Total", total])
            dataframe = pd.DataFrame(dataframe, columns=collums)
            if get_total:
                n_rows = len(self.columns) + 1
                display(dataframe.head(n_rows))
                return total
            else:
                return dataframe
        if output:   
                print(f"Total: {total} {unit}")
        if get_total:
            return total
        if get_dict:
            return num_of_memory
    def get_memory_usage_percentage(self, cols=None, output=True, unit="kb", get_total=True, show_df=False, use_deep=True, get_dict=False):
        """
        Calculate the memory usage percentage of each column in a DataFrame.

        Args:
            cols (list, optional): A list of column names. If not provided, all columns in the DataFrame will be considered.
            output (bool, optional): Indicates whether to print the memory usage percentage for each column. Default is True.
            unit (str, optional): The unit of memory usage to be displayed. Supported units are bytes (b), kilobytes (kb), and megabytes (mb). Default is kb.
            get_total (bool, optional): Indicates whether to calculate the total memory usage percentage. Default is True.
            show_df (bool, optional): Indicates whether to return a DataFrame with the column names and their memory usage percentages. Default is False.
            use_deep (bool, optional): Indicates whether to use deep memory usage calculation. Default is True.
            get_dict (bool, optional): Indicates whether to return a dictionary with column names as keys and their memory usage percentages as values. Default is False.

        Returns:
            float or DataFrame or None: Depending on the parameters, the method returns the total memory usage percentage as a float, a DataFrame with the column names and their memory usage percentages, or None.
        """
        if cols is None:
            cols = self.columns
        supported_bytes = ["kb", "mb", "b"]
        assert unit in supported_bytes, f"{unit} not supported. Units supported is bytes(b), kilobytes(kb) and megabytes(mb)."
        if get_total:
            total = 0
        if show_df:
            dataframe = []
            output = False
        if get_dict:
            get_total = False
            percentage_of_memory = {}
            percentage_of_memory.update([("unit", unit)])
        for col in cols:
            total_usage = Statistics.get_memory_usage(self, output=False)
            col_usage = Statistics.get_memory_usage(self, [col], output=False, unit=unit, use_deep=use_deep)
            value = round((col_usage/total_usage) * 100, 2)
            if output:
                print(f"Column: {col} uses {value}{unit}.")
            if get_total:
                total += value   
            if show_df:
                col_info = [col, f"{value}%"]
                dataframe.append(col_info)
            if get_dict:
                percentage_of_memory.update([(col, f"{value}%")])
        if show_df:
            collums = ["Col_Name", f"Percentage_of_Memory_Usage({unit})"]
            if get_total:
                dataframe.append(["Total", f"{total}%"])
            dataframe = pd.DataFrame(dataframe, columns=collums)
            if get_total:
                n_rows = len(self.columns) + 1
                display(dataframe.head(n_rows))
                return total
            else:
                return dataframe
        if get_total:
            if output:   
                print(f"Total: {total} {unit}")
            return total
        if get_dict:
            if output:   
                print(f"Total: {total} {unit}")
            return percentage_of_memory
    def get_nulls_count(self, cols=None, output=True, show_df=False, get_total=True, get_dict=False):
        """
        Calculate the number of null values in each column of a DataFrame.

        Args:
            cols (list, optional): A list of column names to calculate the number of null values for. If not provided, all columns in the DataFrame will be considered.
            output (bool, optional): A boolean flag indicating whether to print the number of null values for each column. Default is True.
            show_df (bool, optional): A boolean flag indicating whether to return a DataFrame with the column names and their corresponding null value counts. Default is False.
            get_total (bool, optional): A boolean flag indicating whether to return the total number of null values in the DataFrame. Default is True.
            get_dict (bool, optional): A boolean flag indicating whether to return a dictionary with column names as keys and their corresponding null value counts as values. Default is False.

        Returns:
            DataFrame or int or dict: Depending on the input parameters, the method returns:
                - If show_df is True, a DataFrame with the column names and their corresponding null value counts.
                - If get_total is True, the total number of null values in the DataFrame.
                - If get_dict is True, a dictionary with column names as keys and their corresponding null value counts as values.
        """
        if cols is None:
            cols = self.columns
        if get_total:
            total = 0
        if show_df:
            dataframe = [] 
            output = False
        if get_dict:
            get_total = False
            num_of_nulls = {}
        for col in cols:
            value = self[col].isnull().sum() 
            if output:
                print(f"The number of null values in {col} is {value}")
            if get_total:
                total += value   
            if show_df:
                col_info = [col, value]
                dataframe.append(col_info)
            if get_dict:
                num_of_nulls.update([(col, value)])
        if show_df:
            collums = ["Col_Name", "Null_Values"]
            if get_total:
                dataframe.append(["Total", total])
            dataframe = pd.DataFrame(dataframe, columns=collums)
            if get_total:
                n_rows = len(dataframe.columns)
                display(dataframe.head(n_rows))
                return total
            else:
                return dataframe
        if get_total:
            if output:   
                print(f"In this dataframe are missing a total {total} of null values.")
            return total
        if get_dict:
            return num_of_nulls

    def get_null_percentage(self, cols=None, output=True, show_df=False, get_total=True, get_dict=False):
        """
        Calculate the percentage of null values in each column of a DataFrame.

        Args:
            cols (list, optional): A list of column names. If not provided, all columns in the DataFrame will be considered.
            output (bool, optional): Indicates whether to print the percentage of null values in each column. Default is True.
            show_df (bool, optional): Indicates whether to return a DataFrame with the column names and their percentage of null values. Default is False.
            get_total (bool, optional): Indicates whether to return the total percentage of null values in the DataFrame. Default is True.
            get_dict (bool, optional): Indicates whether to return a dictionary with column names as keys and their corresponding percentage of null values as values. Default is False.

        Returns:
            If output is True, the percentage of null values in each column is printed.
            If show_df is True, a DataFrame with the column names and their percentage of null values is returned.
            If get_total is True, the total percentage of null values in the DataFrame is returned.
            If get_dict is True, a dictionary with column names as keys and their corresponding percentage of null values as values is returned.
        """
        if cols is None:
            cols = self.columns
        if get_total:
            total = 0
        if show_df:
            dataframe = [] 
            output = False
        if get_dict:
            get_total = False
            percentage_of_nulls = {}
        for col in cols:
            value = round((Statistics.get_nulls_count(self, [col], False)/len(self[col])) * 100, 2)
            if output:
                print(f"The percentage of null values in {col} is {value}%")
            if get_total:
                total += value   
            if show_df:
                col_info = [col, f"{value}%"]
                dataframe.append(col_info)
            if get_dict:
                percentage_of_nulls.update([(col, f"{value}%")])
        if show_df:
            collums = ["Col_Name", "Percentage_of_Null_Values"]
            if get_total:
                dataframe.append(["Total", f"{total}%"])
            dataframe = pd.DataFrame(dataframe, columns=collums)
            if get_total:
                n_rows = len(self.columns) + 1
                display(dataframe.head(n_rows))
                return total
            else:
                return dataframe
        elif get_total:
            if output:   
                print(f"{total}% of the values in this dataframe are missing.")
            return total
        elif get_dict:
            return percentage_of_nulls
    def get_num_of_unique_values(self, cols=None, output=True, show_df=False):
        """
        Calculate the number of unique values in specified columns of a DataFrame.

        Args:
            cols (list, optional): A list of column names. If not provided, all columns in the DataFrame will be considered.
            output (bool, optional): A boolean flag indicating whether to print the number of unique values. Default is True.
            show_df (bool, optional): A boolean flag indicating whether to return a DataFrame with the column names and their corresponding number of unique values. Default is False.

        Returns:
            dict or DataFrame: If `show_df` is True, a DataFrame is returned with the column names and their corresponding number of unique values.
                               Otherwise, a dictionary is returned with the column names as keys and the number of unique values as values.
        """

        if cols is None:
            cols = self.columns
        if show_df:
            dataframe = []  
            output = False
        num_of_uniques = {}
        for col in cols:
            try:
                num_unique_values = self[col].nunique()
                num_of_uniques.update([(col, num_unique_values)])
                if output:
                    print(f"The number of unique values in {col} is {num_unique_values}")
                if show_df:
                    col_info = [col, num_unique_values]
                    dataframe.append(col_info)
            except KeyError:
                print(f"Column {col} does not exist in the DataFrame.")
        if show_df:
            columns = ["Col_Name", "Unique_Values"]
            dataframe = pd.DataFrame(dataframe, columns=columns)
            return dataframe
        else:
            return num_of_uniques
    def get_max_values(self, cols=None, output=True, show_df=False):
        """
        Find the maximum values or the most common values in each column of a DataFrame.

        Args:
            cols (list, optional): A list of column names. If not provided, all columns in the DataFrame will be considered.
            output (bool, optional): Indicates whether to print the maximum values. Default is True.
            show_df (bool, optional): Indicates whether to return a DataFrame with the column names and their maximum values. Default is False.

        Returns:
            dict or DataFrame: If show_df is False, a dictionary is returned with column names as keys and their corresponding maximum values or most common values as values.
                               If show_df is True, a DataFrame is returned with the column names and their maximum values or most common values.
        """
        if cols is None:
            cols = self.columns
        max_values = {}
        for col in cols:
            try:
                if not pd.api.types.is_categorical_dtype(self[col]) and not pd.api.types.is_bool_dtype(self[col]):
                    value = self[col].max()
                    max_values.update([(col, value)])
                else:
                    value = self[col].mode()[0]
                    max_values.update([(col, value)])
                if output:
                    if not pd.api.types.is_categorical_dtype(self[col]) and not pd.api.types.is_bool_dtype(self[col]):
                        print(f"The maximum value in {col} is {value}")
                    else:
                        print(f"The most common value in {col} is {value}")
            except KeyError:
                print(f"Column {col} does not exist in the DataFrame.")
        if show_df:
            dataframe = []
            for col in cols:
                col_info = [col, max_values[col]]
                dataframe.append(col_info)
            columns = ["Col_Name", "Max_Values/Most_Common"]
            dataframe = pd.DataFrame(dataframe, columns=columns)
            return dataframe
        else:
            return max_values
    def get_max_values_count(self, cols=None, output=True, show_df=False):
        """
        Returns the number of occurrences of the maximum value or the most common value in each column of a DataFrame.

        Args:
            cols (list, optional): A list of column names. If not provided, all columns in the DataFrame will be considered.
            output (bool, optional): Indicates whether to print the number of occurrences of the maximum value or the most common value in each column. Default is True.
            show_df (bool, optional): Indicates whether to return a DataFrame with the column names and the number of occurrences of the maximum value or the most common value. Default is False.

        Returns:
            DataFrame or dict: If show_df is True, returns a DataFrame with the column names and the number of occurrences of the maximum value or the most common value. Otherwise, returns a dictionary with the column names as keys and the number of occurrences of the maximum value or the most common value as values.
        """
        if cols is None:
            cols = self.columns
        max_values_count = {}
        for col in cols:
            try:
                if not pd.api.types.is_categorical_dtype(self[col]) and not pd.api.types.is_bool_dtype(self[col]):
                    value = self[col].max()
                    value = self[col].eq(value).sum()  
                    max_values_count.update([(col, value)])
                else:
                    value = self[col].value_counts().iat[0]  
                    max_values_count.update([(col, value)])
                if output:
                    if not pd.api.types.is_categorical_dtype(self[col]) and not pd.api.types.is_bool_dtype(self[col]):
                        print(f"The number of ocurrences of the max value in {col} is {value}")
                    else:
                        print(f"The number of ocurrences of the most common value in {col} is {value}")
            except KeyError:
                print(f"Column {col} does not exist in the DataFrame.")
        if show_df:
            dataframe = []
            for col in cols:
                col_info = [col, max_values_count[col]]
                dataframe.append(col_info)
            columns = ["Col_Name", "Max_Values/Most_Common Count"]
            dataframe = pd.DataFrame(dataframe, columns=columns)
            return dataframe
        else:
            return max_values_count
    def get_max_values_percentage(self, cols=None, output=True, show_df=False):
        """
        Calculates the percentage of the maximum value or the most common value in each column of a DataFrame.

        Args:
            cols (list, optional): A list of column names. If not provided, all columns in the DataFrame will be considered.
            output (bool, optional): Indicates whether to print the percentage of the maximum value or the most common value. Default is True.
            show_df (bool, optional): Indicates whether to return a DataFrame with the column names and their corresponding percentages. Default is False.

        Returns:
            dict or DataFrame: If `show_df` is True, it returns a DataFrame with the column names and their corresponding percentages. 
                               Otherwise, it returns a dictionary with column names as keys and their corresponding percentages as values.

        Raises:
            KeyError: If a column specified in `cols` does not exist in the DataFrame.
        """
        if cols is None:
            cols = self.columns
        max_values_percentage = {}
        for col in cols:
            try:
                if not pd.api.types.is_categorical_dtype(self[col]) and not pd.api.types.is_bool_dtype(self[col]):
                    value = self[col].max()
                    value = self[col].eq(value).sum()
                    value = (value / self[col].count()) * 100
                    value = round(value, 2)
                    max_values_percentage.update([(col, value)])
                else:
                    value = self[col].value_counts().iat[0]
                    value = (value / self[col].count()) * 100
                    value = round(value, 2)
                    max_values_percentage.update([(col, value)])
                if output:
                    if not pd.api.types.is_categorical_dtype(self[col]) and not pd.api.types.is_bool_dtype(self[col]):
                        print(f"The percentage of max value in {col} is {value} %")
                        print("Tip: It's possible for the percentage of max values being lower than the percentage of min values. So don't take this function seriously if you are using it for numerical columns.")
                    else:
                        print(f"The percentage of most common value in {col} is {value} %")
            except KeyError:
                print(f"Column {col} does not exist in the DataFrame.")
        if show_df:
            dataframe = []
            for col in cols:
                col_info = [col, f"{max_values_percentage[col]}%"]
                dataframe.append(col_info)
            columns = ["Col_Name", "Max_Values/Most_Common Percentage"]
            dataframe = pd.DataFrame(dataframe, columns=columns)
            return dataframe
        else:
            return max_values_percentage
    def get_min_values(self, cols=None, output=True, show_df=False):
        """
        Retrieve the minimum (or least common) value of the requested columns.

        Categorical and boolean columns report their least frequent value (the
        last entry of ``value_counts()``); every other column reports
        ``Series.min()``.

        Args:
            cols (list, optional): Column names to inspect. If not provided, all
                columns of the DataFrame are used.
            output (bool, optional): Whether to print one line per column.
                Default is True.
            show_df (bool, optional): Whether to return the result as a DataFrame.
                Default is False.

        Returns:
            dict or DataFrame: If show_df is False, a dictionary mapping column
                names to their minimum / least common value. If show_df is True,
                a DataFrame with the columns "Col_Name" and
                "Min_Values/Less_Common".

        Note:
            A column that does not exist is reported with a printed message and
            left out of the result; no KeyError is raised.
        """
        if cols is None:
            cols = self.columns
        min_values = {}
        for col in cols:
            try:
                series = self[col]
            except KeyError:
                print(f"Column {col} does not exist in the DataFrame.")
                continue
            # isinstance() replaces the deprecated pd.api.types.is_categorical_dtype.
            by_frequency = isinstance(series.dtype, pd.CategoricalDtype) or pd.api.types.is_bool_dtype(series)
            if by_frequency:
                counts = series.value_counts()
                # An all-null column has no counts; report None instead of raising IndexError.
                value = counts.index[-1] if len(counts) else None
            else:
                value = series.min()
            min_values[col] = value
            if output:
                if by_frequency:
                    print(f"The less common value in {col} is {value}")
                else:
                    print(f"The minimum value in {col} is {value}")
        if show_df:
            # Built from the collected results so that skipped columns cannot raise here.
            return pd.DataFrame(list(min_values.items()), columns=["Col_Name", "Min_Values/Less_Common"])
        return min_values
    def get_min_values_count(self, cols=None, output=True, show_df=False):
        """
        Count the occurrences of the minimum (or least common) value per column.

        For categorical and boolean columns the count of the least frequent value
        is reported (last entry of ``value_counts()``); for every other column
        the number of cells equal to ``Series.min()`` is reported.

        Args:
            cols (list, optional): Column names to inspect. If not provided, all
                columns of the DataFrame are used.
            output (bool, optional): Whether to print one line per column.
                Default is True.
            show_df (bool, optional): Whether to return the result as a DataFrame.
                Default is False.

        Returns:
            dict or DataFrame: If show_df is False, a dictionary mapping column
                names to counts. If show_df is True, a DataFrame with the columns
                "Col_Name" and "Min_Values/Less_Common Count".

        Note:
            A column that does not exist is reported with a printed message and
            left out of the result; no KeyError is raised.
        """
        if cols is None:
            cols = self.columns
        min_values_count = {}
        for col in cols:
            try:
                series = self[col]
            except KeyError:
                print(f"Column {col} does not exist in the DataFrame.")
                continue
            # isinstance() replaces the deprecated pd.api.types.is_categorical_dtype.
            by_frequency = isinstance(series.dtype, pd.CategoricalDtype) or pd.api.types.is_bool_dtype(series)
            if by_frequency:
                counts = series.value_counts()
                # An all-null column has no counts at all: report zero occurrences.
                value = counts.iat[-1] if len(counts) else 0
            else:
                value = series.eq(series.min()).sum()
            min_values_count[col] = value
            if output:
                if by_frequency:
                    print(f"The number of ocurrences of the less common value in {col} is {value}")
                else:
                    print(f"The number of ocurrences of the min value in {col} is {value}")
        if show_df:
            # Built from the collected results so that skipped columns cannot raise here.
            return pd.DataFrame(list(min_values_count.items()), columns=["Col_Name", "Min_Values/Less_Common Count"])
        return min_values_count
    def get_min_values_percentage(self, cols=None, output=True, show_df=False):
        """
        Compute the share of the minimum (or least common) value per column.

        The share is the number of occurrences divided by the number of non-null
        cells of the column, times 100, rounded to two decimals. Categorical and
        boolean columns use their least frequent value; every other column uses
        ``Series.min()``.

        Args:
            cols (list, optional): Column names to inspect. If not provided, all
                columns of the DataFrame are used.
            output (bool, optional): Whether to print the percentage of each
                column. Default is True.
            show_df (bool, optional): Whether to return the result as a DataFrame.
                Default is False.

        Returns:
            dict or DataFrame: If show_df is False, a dictionary mapping column
                names to percentages (NaN when the column has no non-null cell).
                If show_df is True, a DataFrame with the columns "Col_Name" and
                "Min_Values/Less_Common Percentage", percentages formatted as
                strings ending in "%".

        Note:
            A column that does not exist is reported with a printed message and
            left out of the result; no KeyError is raised.
        """
        if cols is None:
            cols = self.columns
        min_values_percentage = {}
        for col in cols:
            try:
                series = self[col]
            except KeyError:
                print(f"Column {col} does not exist in the DataFrame.")
                continue
            # isinstance() replaces the deprecated pd.api.types.is_categorical_dtype.
            by_frequency = isinstance(series.dtype, pd.CategoricalDtype) or pd.api.types.is_bool_dtype(series)
            if by_frequency:
                counts = series.value_counts()
                occurrences = counts.iat[-1] if len(counts) else 0
            else:
                occurrences = series.eq(series.min()).sum()
            non_null = series.count()
            # Avoid dividing by zero for columns that only contain nulls.
            value = round((occurrences / non_null) * 100, 2) if non_null else float("nan")
            min_values_percentage[col] = value
            if output:
                if by_frequency:
                    print(f"The percentage of less common value in {col} is {value} %")
                else:
                    print(f"The percentage of min value in {col} is {value} %")
                    print("Tip: It's possible for the percentage of max values being lower than the percentage of min values. So don't take this function seriously if you are using it for numerical columns.")
        if show_df:
            # Built from the collected results so that skipped columns cannot raise here.
            rows = [[col, f"{value}%"] for col, value in min_values_percentage.items()]
            return pd.DataFrame(rows, columns=["Col_Name", "Min_Values/Less_Common Percentage"])
        return min_values_percentage
    def get_dataframe_mem_insight(self, transpose=False):
        """
        Generate memory insights for each column in a given dataframe.

        Args:
            self (pandas.DataFrame): The dataframe for which memory insights are to be generated.
            transpose (bool, optional): A flag indicating whether the resulting dataframe should be transposed. Default is False.

        Returns:
            pandas.DataFrame: A dataframe containing information such as column name, data type, recommended data type, memory usage, number of missing values, percentage of missing values, and number of distinct values.
        """
        dataframe = []
        for col in self.columns:
            col_info = [
                col,
                str(Statistics.get_dtypes(self, [col], False)).strip("[]'"),
                Statistics.get_best_dtypes(self, [col], False, False),
                Statistics.get_memory_usage(self, [col], False),
                f"{Statistics.get_memory_usage_percentage(self, [col], False)}%",
                Statistics.get_nulls_count(self, [col], False),
                f"{Statistics.get_null_percentage(self, [col], False)}%",
                Statistics.get_num_of_unique_values(self, [col], False)
            ]
            dataframe.append(col_info)
    
        column_names = [
            'Column',
            'Dtype',
            'Recommend_Dtype',
            'Memory',
            'Memory_Percentage',
            'Missing_Values',
            'Percentage_of_Missing_Values',
            'Distinct_Values'
        ]
        dataframe = pd.DataFrame(dataframe, columns=column_names)
        if transpose:
            dataframe = dataframe.transpose()
            dataframe.columns = dataframe.iloc[0]
            dataframe = dataframe[1:]
        return dataframe.head(len(self.columns))
    def get_dataframe_values_insight(self, transpose=False):
        """
        Generates insights about the values in each column of a given dataframe.

        Args:
            self (pandas.DataFrame): The dataframe for which insights are to be generated.
            transpose (bool, optional): A boolean flag indicating whether to transpose the resulting dataframe. Default is False.

        Returns:
            pandas.DataFrame: A dataframe containing insights about the values in each column of the input dataframe. The number of rows in the resulting dataframe is equal to the number of columns in the input dataframe.
        """
        dataframe = []
        for col in self.columns:
            col_info = [
                col,
                str(Statistics.get_dtypes(self, [col], False)).strip("[]'"),
                list(Statistics.get_num_of_unique_values(self, [col], False).values())[0],
                list(Statistics.get_max_values(self, [col], False).values())[0],
                list(Statistics.get_max_values_count(self, [col], False).values())[0],
                f"{list(Statistics.get_max_values_percentage(self, [col], False).values())[0]}%",
                list(Statistics.get_min_values(self, [col], False).values())[0],
                list(Statistics.get_min_values_count(self, [col], False).values())[0],
                f"{list(Statistics.get_min_values_percentage(self, [col], False).values())[0]}%",
                Statistics.get_nulls_count(self, [col], False),
                f"{Statistics.get_null_percentage(self, [col], False)}%" 
            ]
            dataframe.append(col_info)

        column_names = [
            'Column',
            'Dtype',
            'Distinct_Values',
            'Most_Common/Max_Value',
            'Occurrences_of_Max_Value',
            'Percentages_of_Occurrences_of_Max_Value',
            'Less_Common/Min_Value',
            'Occurrences_of_Min_Value',
            'Percentage_of_Occurrences_of_Min_Value',
            'Missing_Values',
            'Percentage_of_Missing_Values'
        ]
        dataframe = pd.DataFrame(dataframe, columns=column_names)
        if transpose:
            dataframe = dataframe.transpose()
            dataframe.columns = dataframe.iloc[0]
            dataframe = dataframe[1:]
        return dataframe.head(len(self.columns))
    def find(self, conditions, AND=True, OR=False):
        """
        Filter the DataFrame with a list of conditions combined by AND or OR.

        Args:
            conditions (list): Non-empty list of conditions. Each one is used as a
                row selector (for example a boolean Series such as ``df["a"] > 1``).
            AND (bool, optional): Combine the conditions with ``&``. Default is True.
            OR (bool, optional): Combine the conditions with ``|``. Default is False.

        Returns:
            DataFrame: The rows of the DataFrame selected by the combined condition.

        Raises:
            TypeError: If conditions is not a list.
            ValueError: If both AND and OR are True simultaneously.
            ValueError: If neither AND nor OR is True.
            ValueError: If conditions is empty.
        """
        if not isinstance(conditions, list):
            raise TypeError(f"{conditions} has to be a list")
        if OR and AND:
            raise ValueError("Both AND and OR cannot be True simultaneously.")
        if not (AND or OR):
            raise ValueError("Either AND or OR must be True.")
        if not conditions:
            # Fail with a clear message instead of an IndexError on conditions[0].
            raise ValueError("conditions must contain at least one condition.")

        combined_condition = conditions[0]
        for condition in conditions[1:]:
            if AND:
                combined_condition = combined_condition & condition
            else:
                combined_condition = combined_condition | condition

        return self[combined_condition]
    def find_replace(self, conditions, replace_with, AND=True, OR=False):
        """
        Find rows in a DataFrame that meet certain conditions and replace values in a specified column with a new value.

        Args:
            conditions (dict): A dictionary specifying the conditions to filter the DataFrame. The keys are column names and the values are either a single value or a lambda function that returns True or False.
            replace_with (tuple): A tuple containing the name of the column to replace values in and the new value to replace with.
            AND (bool, optional): A boolean flag indicating whether to use the AND operator when evaluating multiple conditions. Default is True.
            OR (bool, optional): A boolean flag indicating whether to use the OR operator when evaluating multiple conditions. Default is False.

        Returns:
            None: The method modifies the DataFrame in-place and does not return any value.
        """
        new_dataset = Statistics.find(self, conditions, AND, OR)
        self.loc[new_dataset.index, replace_with[0]] = replace_with[1]
        return self
    def find_delete(self, conditions, AND=True, OR=False):
        """
        Find rows in the DataFrame that meet certain conditions, delete those rows from the DataFrame, and return the modified DataFrame.

        Args:
            conditions (list): A list of conditions to filter the rows of the DataFrame.
            AND (bool, optional): A boolean flag indicating whether the conditions should be combined using the logical AND operator. Default is True.
            OR (bool, optional): A boolean flag indicating whether the conditions should be combined using the logical OR operator. Default is False.

        Returns:
            pandas.DataFrame: The modified DataFrame after deleting the rows that meet the conditions.
        """
        new_dataset = Statistics.find(self, conditions, AND, OR)
        self = self.drop(new_dataset.index)
        return self

In [142]:
#from statistics import Statistics
class Cleaning:
    def capitalize_cols_name(self, cols = None):
        """
        Capitalize the column names of the DataFrame.

        Args:
            cols (list, optional): Column names to capitalize. If None, all columns
                are processed. Labels that are not strings are left unchanged.

        Raises:
            ValueError: If any of the specified columns is not present in the DataFrame.

        Returns:
            pandas.DataFrame: A new DataFrame with the renamed columns.
        """
        if cols is None:
            cols = self.columns
        else:
            missing_cols = set(cols) - set(self.columns)
            if missing_cols:
                raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        # Only string labels can be capitalized; skipping the rest avoids a TypeError.
        renamed = {col: col.capitalize() for col in cols if isinstance(col, str)}
        # rename() already returns a new DataFrame, so no separate copy is needed.
        return self.rename(columns=renamed)
    def lower_cols_name(self, cols = None):
        """
        Convert the column names of the DataFrame to lowercase.

        Args:
            cols (list, optional): Column names to convert. If None, all columns
                are processed. Labels that are not strings are left unchanged.

        Raises:
            ValueError: If any of the specified columns is not present in the DataFrame.

        Returns:
            pandas.DataFrame: A new DataFrame with the renamed columns.
        """
        if cols is None:
            cols = self.columns
        else:
            missing_cols = set(cols) - set(self.columns)
            if missing_cols:
                raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        # Only string labels can be lowercased; skipping the rest avoids a TypeError.
        renamed = {col: col.lower() for col in cols if isinstance(col, str)}
        # rename() already returns a new DataFrame, so no separate copy is needed.
        return self.rename(columns=renamed)
    def upper_cols_name(self, cols=None):
        """
        Convert the column names of a DataFrame to uppercase.

        Args:
            cols (list, optional): Column names to convert. If not provided, all
                columns are processed. Labels that are not strings are left unchanged.

        Raises:
            ValueError: If any of the specified column names are not present in the DataFrame.

        Returns:
            pandas.DataFrame: A new DataFrame with the renamed columns.
        """
        if cols is None:
            cols = self.columns
        else:
            missing_cols = set(cols) - set(self.columns)
            if missing_cols:
                raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        # Only string labels can be uppercased; skipping the rest avoids a TypeError.
        renamed = {col: col.upper() for col in cols if isinstance(col, str)}
        # rename() already returns a new DataFrame, so no separate copy is needed.
        return self.rename(columns=renamed)
    def remove_cols_character(self, cols=None, characters=['_'], add_new_character=False, new_character=" "):
        """
        Remove (or replace) specified characters in the column names of a DataFrame.

        Matching is case-insensitive and done one character at a time, so every
        entry of ``characters`` is expected to be a single character.

        Args:
            cols (list, optional): Column names to process. If None, all columns are
                processed. Labels that are not strings are left unchanged.
            characters (list, optional): Characters to remove from the column names.
                Defaults to ['_'].
            add_new_character (bool, optional): If True, each removed character is
                replaced by ``new_character``. Defaults to False.
            new_character (str, optional): Replacement text used when
                ``add_new_character`` is True. Defaults to " " (space).

        Raises:
            ValueError: If any of the specified columns is not present in the DataFrame.

        Returns:
            pandas.DataFrame: A new DataFrame with the cleaned column names.
        """
        if cols is None:
            cols = self.columns
        else:
            missing_cols = set(cols) - set(self.columns)
            if missing_cols:
                raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        targets = {character.lower() for character in characters}
        replacement = new_character if add_new_character else ""
        new_columns = {}
        for col in cols:
            if not isinstance(col, str):
                continue
            # Rebuild the name in a single pass; editing by index while the string
            # shrinks (or grows) would hit the wrong positions after the first match.
            new_columns[col] = "".join(
                replacement if letter.lower() in targets else letter for letter in col
            )
        # rename() already returns a new DataFrame, so no separate copy is needed.
        return self.rename(columns=new_columns)
    def round_rows_value(self, cols=None, decimals=2):
        """
        Round the numerical values in specified columns of a DataFrame.

        A column is treated as numerical when the dtype name reported by
        ``Statistics.get_dtypes`` is not "categorical", "bool" or "object".

        Args:
            cols (list, optional): Column names to process. If None, all columns are
                processed. Defaults to None.
            decimals (int, optional): Number of decimal places to round to. Defaults to 2.

        Raises:
            ValueError: If any of the specified columns is not present in the DataFrame.

        Returns:
            pandas.DataFrame: A copy of the DataFrame with the numerical columns rounded.
        """
        if cols is None:
            cols = self.columns
        else:
            missing_cols = set(cols) - set(self.columns)
            if missing_cols:
                raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        # get_dtypes is indexed with [0] exactly as the fill/interpolate methods of this
        # class do; comparing its whole return value against the strings never matched,
        # so categorical/bool/object columns were not filtered out.
        numerical_cols = [
            col for col in cols
            if str(Statistics.get_dtypes(self, [col], False)[0]) not in ["categorical", "bool", "object"]
        ]
        dataframe = self.copy()
        for col in numerical_cols:
            # Series.map is used per column because DataFrame.applymap is deprecated.
            dataframe[col] = self[col].map(lambda x: round(x, decimals) if isinstance(x, (int, float)) else x)
        return dataframe
    def remove_rows_character(self, cols=None, characters=[','], add_new_character=False, new_character=" "):
        """
        Remove (or replace) specified characters in the string values of the given columns.

        Matching is case-insensitive and done one character at a time, so every
        entry of ``characters`` is expected to be a single character. Values that
        are not strings are left untouched.

        Args:
            cols (list, optional): Column names to process. If None, all columns are
                processed. Defaults to None.
            characters (list, optional): Characters to remove from the values.
                Defaults to [','].
            add_new_character (bool, optional): If True, each removed character is
                replaced by ``new_character``. Defaults to False.
            new_character (str, optional): Replacement text used when
                ``add_new_character`` is True. Defaults to " ".

        Raises:
            ValueError: If any of the specified columns is not present in the DataFrame.

        Returns:
            pandas.DataFrame: A copy of the DataFrame with the cleaned values.
        """
        if cols is None:
            cols = self.columns
        else:
            missing_cols = set(cols) - set(self.columns)
            if missing_cols:
                raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        targets = {character.lower() for character in characters}
        replacement = new_character if add_new_character else ""

        def clean(text):
            # Single pass over the text: editing by index while the string changes
            # length would hit the wrong positions after the first match.
            return "".join(replacement if letter.lower() in targets else letter for letter in text)

        dataframe = self.copy()
        for col in cols:
            col_position = dataframe.columns.get_loc(col)
            for row_position, value in enumerate(self[col]):
                if isinstance(value, str):
                    # Positional write: the enumerate counter is a position, not an
                    # index label, so .at[...] would be wrong for non-default indexes.
                    dataframe.iat[row_position, col_position] = clean(value)
        return dataframe
    def capitalize_rows_string(self, cols = None):
        """
        Capitalize the string values in the specified columns.

        Args:
            cols (list, optional): Column names to process. If None, all columns are
                processed. Values that are not strings are left unchanged.

        Raises:
            ValueError: If any of the specified columns is not present in the DataFrame.

        Returns:
            DataFrame: A copy of the DataFrame with capitalized string values.
        """
        if cols is None:
            cols = self.columns
        else:
            missing_cols = set(cols) - set(self.columns)
            if missing_cols:
                raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        dataframe = self.copy()
        for col in cols:
            # Series.map is used per column because DataFrame.applymap is deprecated.
            dataframe[col] = self[col].map(lambda x: x.capitalize() if isinstance(x, str) else x)
        return dataframe
    def lower_rows_string(self, cols=None):
        """
        Convert the string values in specified columns of a DataFrame to lowercase.

        Args:
            cols (list, optional): Column names to process. If None, all columns are
                processed. Values that are not strings are left unchanged.

        Raises:
            ValueError: If any of the specified columns is not present in the DataFrame.

        Returns:
            pandas.DataFrame: A copy of the DataFrame with lowercase string values.
        """
        if cols is None:
            cols = self.columns
        else:
            missing_cols = set(cols) - set(self.columns)
            if missing_cols:
                raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        dataframe = self.copy()
        for col in cols:
            # Series.map is used per column because DataFrame.applymap is deprecated.
            dataframe[col] = self[col].map(lambda x: x.lower() if isinstance(x, str) else x)
        return dataframe
    def upper_rows_string(self, cols=None):
        """
        Convert the string values in specified columns of a DataFrame to uppercase.

        Args:
            cols (list, optional): Column names to process. If None, all columns are
                processed. Values that are not strings are left unchanged.

        Raises:
            ValueError: If any of the specified columns is not present in the DataFrame.

        Returns:
            pandas.DataFrame: A copy of the DataFrame with uppercase string values.
        """
        if cols is None:
            cols = self.columns
        else:
            missing_cols = set(cols) - set(self.columns)
            if missing_cols:
                raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        dataframe = self.copy()
        for col in cols:
            # Series.map is used per column because DataFrame.applymap is deprecated.
            dataframe[col] = self[col].map(lambda x: x.upper() if isinstance(x, str) else x)
        return dataframe
    def remove_rows_with_missing_values(self, cols=None):
        """
        Drop the rows of the DataFrame that contain missing values.

        Args:
            cols (list, optional): When given, only missing values in these columns
                cause a row to be dropped. When omitted, a missing value in any
                column causes the row to be dropped.

        Returns:
            pandas.DataFrame: A new DataFrame without the affected rows.
        """
        if cols is not None:
            return self.dropna(subset=cols)
        return self.dropna(axis=0)
    def interpolate_rows_with_missing_values(self, cols=None):
        """
        Fill missing values by interpolation, or by the mode for non-numerical columns.

        Columns whose dtype name (as reported by ``Statistics.get_dtypes``) is
        "categorical", "bool" or "object" are filled with their most frequent
        value; all other columns are filled with ``Series.interpolate()``.

        Args:
            cols (list, optional): Column names to process. If not provided, all
                columns are processed.

        Returns:
            pandas.DataFrame: A copy of the DataFrame with missing values filled.

        Raises:
            ValueError: If any of the specified columns are not present in the DataFrame.
        """
        dataframe = self.copy()
        if cols is None:
            cols = self.columns
        else:
            missing_cols = set(cols) - set(self.columns)
            if missing_cols:
                raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        for col in cols:
            dtype = str(Statistics.get_dtypes(self, [col], False)[0])
            if dtype in ["categorical", "bool", "object"]:
                modes = self[col].mode()
                # mode() is empty for an all-null column; there is nothing to fill
                # with, so the column is left as it is instead of failing on [0].
                if not modes.empty:
                    dataframe[col] = self[col].fillna(modes.iloc[0])
            else:
                dataframe[col] = self[col].interpolate()
        return dataframe
    def foward_fill_rows_with_missing_values(self, cols = None):
        """
        Forward fill missing values with the last known non-null value in the column.

        Args:
            cols (list, optional): Column names to forward fill. If not provided,
                all columns are processed.

        Raises:
            ValueError: If any of the specified columns is not present in the DataFrame.

        Returns:
            pandas.DataFrame: A new DataFrame with missing values forward filled.
        """
        if cols is None:
            return self.ffill()
        missing_cols = set(cols) - set(self.columns)
        if missing_cols:
            raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        dataframe = self.copy()
        cols = list(cols)
        # DataFrame.ffill() has no ``subset`` argument, so fill the selected
        # columns separately and write them back into the copy.
        dataframe[cols] = self[cols].ffill()
        return dataframe
    def split_rows_string(self, col, new_cols, separator=",", delete_col=True, save_remain=True):
        """
        Split the values of a column into new columns based on a separator.

        The first entry of ``new_cols`` receives the first piece of each value.
        Every further entry at position ``i`` is only created when ``save_remain``
        is True and receives the pieces from position ``i`` onwards, joined back
        together with ``separator``.

        Args:
            col (str): The name of the column to be split.
            new_cols (list): Names of the new columns.
            separator (str, optional): The separator used to split the values. Defaults to ",".
            delete_col (bool, optional): If True, the original column is dropped.
                If False, the original column is overwritten with the piece at
                position ``len(new_cols)`` (empty strings where no such piece
                exists). Defaults to True.
            save_remain (bool, optional): If True, the remaining pieces are stored in
                the columns after the first one. Defaults to True.

        Returns:
            pandas.DataFrame: A copy of the DataFrame with the split columns added.
        """
        dataframe = self.copy()
        pieces = dataframe[col].str.split(separator, expand=True)
        for i, new_col in enumerate(new_cols):
            if i == 0:
                dataframe[new_col] = pieces[0].fillna('')
            elif save_remain:
                remainder = pieces.iloc[:, i:]
                if remainder.shape[1]:
                    # Join only the pieces a row really has; joining the padding of
                    # shorter rows would leave trailing separators ("e," for "d,e").
                    dataframe[new_col] = remainder.apply(lambda row: separator.join(row.dropna()), axis=1)
                else:
                    dataframe[new_col] = ''
        if delete_col:
            dataframe = dataframe.drop([col], axis=1)
        else:
            next_piece = len(new_cols)
            # Guard against a KeyError when no row has that many pieces.
            dataframe[col] = pieces[next_piece].fillna('') if next_piece in pieces.columns else ''
        return dataframe
    def backward_fill_rows_with_missing_values(self, cols = None):
        """
        Backward fill missing values with the next valid value in each column.

        Args:
            cols (list, optional): Column names to backward fill. If not provided,
                missing values in all columns are filled.

        Raises:
            ValueError: If any of the specified columns is not present in the DataFrame.

        Returns:
            pandas.DataFrame: A new DataFrame with missing values backward filled.
        """
        if cols is None:
            return self.bfill()
        missing_cols = set(cols) - set(self.columns)
        if missing_cols:
            raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        dataframe = self.copy()
        cols = list(cols)
        # DataFrame.bfill() has no ``subset`` argument, so fill the selected
        # columns separately and write them back into the copy.
        dataframe[cols] = self[cols].bfill()
        return dataframe
    def fill_rows_with_missing_values_mean(self, cols=None, decimals=2):
        """
        Fill missing values with the rounded mean of the respective column.

        Columns whose dtype name (as reported by ``Statistics.get_dtypes``) is
        "categorical", "bool" or "object" have no mean and are filled with their
        most frequent value instead.

        Args:
            cols (list, optional): Column names to process. If None, all columns are
                processed. Defaults to None.
            decimals (int, optional): Number of decimal places the mean is rounded to.
                Defaults to 2.

        Returns:
            pandas.DataFrame: A copy of the DataFrame with missing values filled.

        Raises:
            ValueError: If any of the specified columns are not present in the DataFrame.
        """
        dataframe = self.copy()
        if cols is None:
            cols = self.columns
        else:
            missing_cols = set(cols) - set(self.columns)
            if missing_cols:
                raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        for col in cols:
            dtype = str(Statistics.get_dtypes(self, [col], False)[0])
            if dtype in ["categorical", "bool", "object"]:
                modes = self[col].mode()
                # mode() is empty for an all-null column; there is nothing to fill
                # with, so the column is left as it is instead of failing on [0].
                if not modes.empty:
                    dataframe[col] = self[col].fillna(modes.iloc[0])
            else:
                dataframe[col] = self[col].fillna(round(self[col].mean(), decimals))
        return dataframe
    def fill_rows_with_missing_values_max(self, cols = None):
        """
        Fill missing values with the maximum value of each column.

        Columns whose dtype name (as reported by ``Statistics.get_dtypes``) is
        "categorical", "bool" or "object" are filled with their most frequent
        value instead.

        Args:
            cols (list, optional): Column names to process. If None, all columns are
                processed.

        Returns:
            pandas.DataFrame: A copy of the DataFrame with missing values filled.

        Raises:
            ValueError: If any of the specified columns are not present in the DataFrame.
        """
        dataframe = self.copy()
        if cols is None:
            cols = self.columns
        else:
            missing_cols = set(cols) - set(self.columns)
            if missing_cols:
                raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        for col in cols:
            dtype = str(Statistics.get_dtypes(self, [col], False)[0])
            if dtype in ["categorical", "bool", "object"]:
                modes = self[col].mode()
                # mode() is empty for an all-null column; there is nothing to fill
                # with, so the column is left as it is instead of failing on [0].
                if not modes.empty:
                    dataframe[col] = self[col].fillna(modes.iloc[0])
            else:
                dataframe[col] = self[col].fillna(self[col].max())
        return dataframe
    def fill_rows_with_missing_values_min(self, cols=None):
        """
        Fill missing values with the minimum value of each column.

        Columns whose dtype name (as reported by ``Statistics.get_dtypes``) is
        "categorical", "bool" or "object" are filled with their least frequent
        value (the last entry of ``value_counts()``) instead.

        Args:
            cols (list, optional): Column names to process. If not provided, all
                columns are processed.

        Returns:
            pandas.DataFrame: A copy of the DataFrame with missing values filled.

        Raises:
            ValueError: If any of the specified columns are not present in the DataFrame.
        """
        dataframe = self.copy()
        if cols is None:
            cols = self.columns
        else:
            missing_cols = set(cols) - set(self.columns)
            if missing_cols:
                raise ValueError(f"The following columns are not present in the DataFrame: {missing_cols}")
        for col in cols:
            dtype = str(Statistics.get_dtypes(self, [col], False)[0])
            if dtype in ["categorical", "bool", "object"]:
                counts = self[col].value_counts()
                # value_counts() is empty for an all-null column; there is nothing
                # to fill with, so the column is left as it is instead of failing.
                if len(counts):
                    dataframe[col] = self[col].fillna(counts.index[-1])
            else:
                dataframe[col] = self[col].fillna(self[col].min())
        return dataframe

### **Next Steps** 

1 - Add more value insights

6 - Visualization Functions

# Send images

## Telegram

In [6]:
# The bot token is read from the TELEGRAM_BOT_TOKEN environment variable so that the
# credential is not stored in the notebook. The token that used to be hard-coded here
# should be treated as leaked and revoked.
base_url = f"https://api.telegram.org/bot{os.environ.get('TELEGRAM_BOT_TOKEN', '')}/sendPhoto"
def send_images_via_telegram(file_path, chat_id, caption="This is a caption"):
    """
    Send an image file to a Telegram chat through the bot ``sendPhoto`` endpoint.

    Args:
        file_path (str): Path of the image file to upload.
        chat_id: Identifier of the target chat, sent as the ``chat_id`` form field.
        caption (str, optional): Caption sent with the photo. Defaults to "This is a caption".

    Returns:
        None: The HTTP status code of the response is printed.
    """
    parameters = {
        "chat_id": chat_id,
        "caption": caption
    }
    # The context manager closes the file handle even when the request fails.
    with open(file_path, 'rb') as my_file:
        files = {
            "photo": my_file
        }
        resp = requests.post(base_url, data=parameters, files=files)
    print(resp.status_code)

## Slack


Agora vou mostrar as funções que me faltam criar

In [7]:
# The Slack token is a credential, so it is read from the SLACK_TOKEN
# environment variable instead of being stored in the notebook. Any token that
# was previously committed here should be revoked and replaced.
slack_token = os.environ.get("SLACK_TOKEN")
channel_id = "C05DAGDAPEX"
client = WebClient(token=slack_token)


def send_images_via_slack(file_path):
    """
    Upload a file to the Slack channel identified by ``channel_id``.

    Args:
        file_path (str): Path of the file to upload.

    Returns:
        None. The API response is printed on success; when the client raises
        ``SlackApiError`` the error reported in the response is printed instead.
    """
    try:
        response = client.files_upload(
            channels=channel_id,
            file=file_path,
        )
    except SlackApiError as e:
        print(f"Error uploading file: {e.response['error']}")
    else:
        print(response)

# Testes

In [134]:
# Load the datasets used by the test cells: two CSV files and one Parquet file,
# all read from the current working directory.
df = pd.read_csv("./Titanic.csv")
df1 = pd.read_csv("./Sales_Data.csv")
df2 = pd.read_parquet("./Titanic_Cleaned.parquet")

In [9]:
# Text to search and the character to look for.
my_string = "HellO, wOrld!"
character_to_check = "o"

# Both sides are lower-cased so the membership test ignores letter case;
# the matching message is then printed.
print(
    f"The character '{character_to_check}' is in the string."
    if character_to_check.lower() in my_string.lower()
    else f"The character '{character_to_check}' is not in the string."
)


The character 'o' is in the string.


In [10]:
import pandas as pd

# Test fixture: five rows, column names in mixed letter case, and one missing
# value in each of 'age', 'Gender' and 'SCORE'.
data = dict(
    name=['Alice', 'Bob', 'Charlie', 'David', 'Emma'],
    age=[25, 30, 35, 40, None],
    Gender=['female', 'male', None, 'male', 'female'],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
    SCORE=[85.5, None, 90.7, 75.2, 88.9],
)
df = pd.DataFrame(data)
df.head()

Unnamed: 0,name,age,Gender,City,SCORE
0,Alice,25.0,female,New York,85.5
1,Bob,30.0,male,Los Angeles,
2,Charlie,35.0,,Chicago,90.7
3,David,40.0,male,Houston,75.2
4,Emma,,female,Phoenix,88.9


In [50]:
# Display the first rows of df.
df.head()

Unnamed: 0,name,age,Gender,City,SCORE
0,Alice,25.0,female,New York,85.5
1,Bob,30.0,male,Los Angeles,
2,Charlie,35.0,,Chicago,90.7
3,David,40.0,male,Houston,75.2
4,Emma,,female,Phoenix,88.9


In [123]:
# Display the first rows of df1.
df1.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,150502.0,iPhone,1.0,700.0,02/18/19 01:35,"866 Spruce St, Portland, ME 04101"
1,150503.0,AA Batteries (4-pack),1.0,3.84,02/13/19 07:24,"18 13th St, San Francisco, CA 94016"
2,150504.0,27in 4K Gaming Monitor,1.0,389.99,02/18/19 09:46,"52 6th St, New York City, NY 10001"
3,150505.0,Lightning Charging Cable,1.0,14.95,02/02/19 16:47,"129 Cherry St, Atlanta, GA 30301"
4,150506.0,AA Batteries (4-pack),2.0,3.84,02/28/19 20:32,"548 Lincoln St, Seattle, WA 98101"


In [143]:
# Split the "Purchase Address" column of df1 into new "City" and "State"
# columns and preview the result.
# NOTE(review): in the recorded output "Purchase Address" itself ends up as
# "ME 04101" although delete_col=False, and "City" holds the street. This
# presumably happens because the address has three comma-separated parts while
# only two new columns are requested; verify split_rows_string.
# NOTE(review): nothing is capitalized here, so the name df_capitalize looks
# like a leftover from another test.
df_capitalize = Cleaning.split_rows_string(
    df1,
    col="Purchase Address",
    new_cols=["City", "State"],
    delete_col=False,
)
df_capitalize.head()

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,City,State
0,150502.0,iPhone,1.0,700.0,02/18/19 01:35,ME 04101,866 Spruce St,"Portland, ME 04101"
1,150503.0,AA Batteries (4-pack),1.0,3.84,02/13/19 07:24,CA 94016,18 13th St,"San Francisco, CA 94016"
2,150504.0,27in 4K Gaming Monitor,1.0,389.99,02/18/19 09:46,NY 10001,52 6th St,"New York City, NY 10001"
3,150505.0,Lightning Charging Cable,1.0,14.95,02/02/19 16:47,GA 30301,129 Cherry St,"Atlanta, GA 30301"
4,150506.0,AA Batteries (4-pack),2.0,3.84,02/28/19 20:32,WA 98101,548 Lincoln St,"Seattle, WA 98101"


In [14]:
# NOTE(review): `erro` is not defined in the visible code, so this cell raises
# NameError (see the recorded output). Presumably a deliberate stop point for
# "Run All"; confirm, or remove the cell.
erro

NameError: name 'erro' is not defined

In [None]:
# self.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sex       891 non-null    object 
 1   age       714 non-null    float64
 2   sibsp     891 non-null    int64  
 3   parch     891 non-null    int64  
 4   fare      891 non-null    float64
 5   embarked  889 non-null    object 
 6   class     891 non-null    object 
 7   who       891 non-null    object 
 8   alone     891 non-null    bool   
 9   survived  891 non-null    int64  
dtypes: bool(1), float64(2), int64(3), object(4)
memory usage: 63.6+ KB


In [None]:
# teste = File.read_file("./Titanic_Cleaned.parquet")
# teste.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   Gender               891 non-null    category
 1   Age                  714 non-null    float64 
 2   Siblings_on_Board    891 non-null    int8    
 3   Parents_on_Board     891 non-null    int8    
 4   Ticket_Price         891 non-null    float64 
 5   Port_of_Embarkation  889 non-null    category
 6   Class                891 non-null    category
 7   Adult/Child          891 non-null    category
 8   Alone                891 non-null    bool    
 9   Survived             891 non-null    int64   
dtypes: bool(1), category(4), float64(2), int64(1), int8(2)
memory usage: 27.6 KB


In [None]:
import pandas as pd

# Build an example DataFrame (five people, no missing values) for the
# find / find_replace / find_delete tests.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Emma'],
    Age=[25, 30, 35, 40, 45],
    Gender=['Female', 'Male', 'Male', 'Male', 'Female'],
    City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix'],
)
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Name,Age,Gender,City
0,Alice,25,Female,New York
1,Bob,30,Male,Los Angeles
2,Charlie,35,Male,Chicago
3,David,40,Male,Houston
4,Emma,45,Female,Phoenix


In [None]:
# Conditions for filtering
conditions = [
    df['Age'] <= 35,
    df['Gender'] == 'Male'
]
df_find = Statistics.find(df, conditions)
df_find.head()

Unnamed: 0,Name,Age,Gender,City
1,Bob,30,Male,Los Angeles
2,Charlie,35,Male,Chicago


In [None]:
# Apply find_replace with the same conditions and the ('Age', 'Unknown') pair.
# In the recorded output the matching rows show 'Unknown' in the Age column.
df_replaced = Statistics.find_replace(
    df,
    conditions,
    ('Age', 'Unknown'),
)
df_replaced.head()

Unnamed: 0,Name,Age,Gender,City
0,Alice,25,Female,New York
1,Bob,Unknown,Male,Los Angeles
2,Charlie,Unknown,Male,Chicago
3,David,40,Male,Houston
4,Emma,45,Female,Phoenix


In [None]:
# df_deleted = Statistics.find_delete(df, conditions)
# df_deleted.head()

Unnamed: 0,Name,Age,Gender,City
0,Alice,25,Female,New York
3,David,40,Male,Houston
4,Emma,45,Female,Phoenix


In [None]:
# Statistics.get_best_dtypes(self, output=True, convert=True)

The best dtype for sex is category
But consider changing it to bool, has you have 2 unique values so you can map the numbers to be True or False
The best dtype for age is float16
The best dtype for sibsp is int8
The best dtype for parch is int8
The best dtype for fare is float16
The best dtype for embarked is category
The best dtype for class is category
The best dtype for who is category
The best dtype for alone is bool
The best dtype for survived is int8
But consider changing it to bool, has you have 2 unique values so you can map the numbers to be True or False


Unnamed: 0,sex,age,sibsp,parch,fare,embarked,class,who,alone,survived
0,male,22.0,1,0,7.2500,S,Third,man,False,0
1,female,38.0,1,0,71.2833,C,First,woman,False,1
2,female,26.0,0,0,7.9250,S,Third,woman,True,1
3,female,35.0,1,0,53.1000,S,First,woman,False,1
4,male,35.0,0,0,8.0500,S,Third,man,True,0
...,...,...,...,...,...,...,...,...,...,...
886,male,27.0,0,0,13.0000,S,Second,man,True,0
887,female,19.0,0,0,30.0000,S,First,woman,True,1
888,female,,1,2,23.4500,S,Third,woman,False,0
889,male,26.0,0,0,30.0000,C,First,man,True,1


In [None]:
# Statistics.get_null_percentage(teste, get_dict=True, output=False, get_total=False)

{'Gender': '0.0%',
 'Age': '19.87%',
 'Siblings_on_Board': '0.0%',
 'Parents_on_Board': '0.0%',
 'Ticket_Price': '0.0%',
 'Port_of_Embarkation': '0.22%',
 'Class': '0.0%',
 'Adult/Child': '0.0%',
 'Alone': '0.0%',
 'Survived': '0.0%'}

In [None]:
# Metadata.read_netCDF_metadata(teste)

AttributeError: 'DataFrame' object has no attribute 'variables'

In [None]:
# teste.head()

In [None]:
# get_nulls_count(teste1)

In [None]:
# import pandas as pd

# # Creating a sample DataFrame
# data = {
#     'col1': ['apple, pie', 'banana! split', 'cherry? cake'],
#     'col2': ['ice-cream', 'chocolate? cake', 'strawberry! shortcake']
# }

# teste = pd.DataFrame(data)
# teste.head()


In [None]:
# self.head()

In [None]:
# self = pd.read_csv("./Titanic.csv")
# condition = self["sex"] == "female"
# conditions = [condition]
# teste = find_delete(self, conditions)
# teste["sex"].value_counts()

In [None]:
# self = pd.read_csv("./Titanic.csv")
# condition = self["sex"] == "female"
# conditions = [condition]
# replace = ["survived", 1]
# teste = replace.values()
# teste
# teste = find_replace(self, conditions, replace)
# teste4 = find(teste, conditions)
# teste4["survived"].value_counts()

In [None]:
# self = pd.read_csv("./Titanic.csv")
# teste2 = find(self, [self["sex"] == "female"])
# self.loc[teste2.index, "survived"] = 0
# teste3 = find(self, [self["sex"] == "female"])
# teste3["survived"].value_counts()

In [None]:
# condition = self["sex"] == "female"
# conditions = [condition]
# replace = [self["survived"], 1]
# teste = replace.values()
# teste
# teste = find_replace(self, conditions, replace)
# teste.head()
# teste1 = find(teste, conditions)
# teste1["survived"].value_counts()
# adw = find(teste, conditions)
# teste.head()
# df["female"].value_counts()

In [None]:
# teste.head()

In [None]:
# teste = remove_rows_character(teste, characters=["-", "?", "!", " ", ","], add_blankspace=True)
# teste.head()

In [None]:
# df1["Product"].dtypes

In [None]:
# get_nulls_count(df1)

In [None]:
# df1.info()

In [None]:
# get_dtype(df1, "Product")

In [None]:
# df1 = get_best_dtypes(df1, convert=True)

In [None]:
# df1.info()

In [None]:
# df1 = remove_rows_with_missing_values(df1)

In [None]:
# get_col_null_count(df1, ["Purchase Address"])

In [126]:
# Split "Purchase Address" into "Address", "City" and "State" columns, keeping
# the original column (delete_col=False), and display the result.
# NOTE(review): in the recorded output "City" still contains the state and ZIP
# (e.g. "Portland, ME 04101") and "State" includes the ZIP, so each new column
# appears to hold the remainder of the string rather than a single component;
# verify split_rows_string.
Cleaning.split_rows_string(df1, "Purchase Address", ["Address", "City", "State"], delete_col=False)

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Address,City,State
0,150502.0,iPhone,1.0,700.00,02/18/19 01:35,"866 Spruce St, Portland, ME 04101",866 Spruce St,"Portland, ME 04101",ME 04101
1,150503.0,AA Batteries (4-pack),1.0,3.84,02/13/19 07:24,"18 13th St, San Francisco, CA 94016",18 13th St,"San Francisco, CA 94016",CA 94016
2,150504.0,27in 4K Gaming Monitor,1.0,389.99,02/18/19 09:46,"52 6th St, New York City, NY 10001",52 6th St,"New York City, NY 10001",NY 10001
3,150505.0,Lightning Charging Cable,1.0,14.95,02/02/19 16:47,"129 Cherry St, Atlanta, GA 30301",129 Cherry St,"Atlanta, GA 30301",GA 30301
4,150506.0,AA Batteries (4-pack),2.0,3.84,02/28/19 20:32,"548 Lincoln St, Seattle, WA 98101",548 Lincoln St,"Seattle, WA 98101",WA 98101
...,...,...,...,...,...,...,...,...,...
372985,295660.0,AAA Batteries (4-pack),2.0,2.99,11/04/19 14:17,"574 4th St, Los Angeles, CA 90001",574 4th St,"Los Angeles, CA 90001",CA 90001
372986,295661.0,USB-C Charging Cable,1.0,11.95,11/23/19 07:22,"359 1st St, Austin, TX 73301",359 1st St,"Austin, TX 73301",TX 73301
372987,295662.0,Lightning Charging Cable,1.0,14.95,11/13/19 16:12,"900 10th St, Boston, MA 02215",900 10th St,"Boston, MA 02215",MA 02215
372988,295663.0,AAA Batteries (4-pack),1.0,2.99,11/17/19 17:08,"592 Sunset St, Boston, MA 02215",592 Sunset St,"Boston, MA 02215",MA 02215
