## Imports

In [1]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import json
import jsonschema
from jsonschema.exceptions import ValidationError
import os
from tqdm.notebook  import tqdm
import pathlib
import os.path
import polars as pl 
import xarray as xr
import matplotlib.pyplot as plt
import requests
import seaborn as sns
from slack_sdk import WebClient
from slack_sdk.errors import SlackApiError
import time
import os
import numpy as np 

In [2]:
df = pd.read_parquet("./aa.parquet")

In [3]:
# import numpy as np
# import pandas as pd
# from ydata_profiling import ProfileReport

In [4]:
# profile = ProfileReport(df, title="Profiling Report", html={'style':{'full_width': True}})

In [5]:
# profile

# netCDF

### Variable Metadata

### Read

In [6]:
def read_netCDF_metadata(self, variables=None, attributes=None):
    """
    Get metadata for variables.

    Args:
        variables (list): List of variable names. If None, metadata for all variables will be retrieved.
        attributes (list): List of attribute names. If None, all attributes will be retrieved.

    Returns:
        None
    """
    def read_variable_metadata(var_name, var):
        """
        Print metadata for a variable.

        Args:
            var_name (str): Name of the variable.
            var (Variable): Variable object.

        Returns:
            None
        """
        print(f"Variable: {var_name}")
        if not var.attrs:
            if var.values is not None:
                print(f"    Values: {var.values}")
            else:
                print("No values were found")
            print("    No attributes were found for this variable.")
        else:
            print(f"    Values: {var.values}")
            print("    Attributes:")
            for key, value in var.attrs.items():
                if attributes is None or key in attributes:
                    print(f"     {key}: {value}")

    if variables is None:
        variables = get_file_variables(self)
    for var_name in variables:
        try:
            coord_var = self.coords[var_name]
            read_variable_metadata(var_name, coord_var)
        except (KeyError, AttributeError) as e:
            print(f"Error occurred while retrieving metadata for variable {var_name}: {str(e)}")

### Insert

#### Input

In [7]:
def insert_netCDF_metadata_input(self, variables=None, attributes=None, new_file=False, filename="new_file.nc",):
    """
    This function prompts the user to input metadata for the specified variables in a netCDF file.
    
    Parameters:
    - filename (str): Name of the netCDF file.
    - variables (list): List of variable names. If None, all coordinate variables are used.
    - attributes (list): List of attribute names. If None, default attributes are used.
    - new_file (bool): If True, a new netCDF file is created. If False, the existing file is used.
    
    Raises:
    - KeyError: If a variable was not found.
    - FileExistsError: If the specified file already exists.
    - ValueError: If the filename is invalid.
    """
    
    # Define default attributes if not provided
    default_attributes = [
        "Units", "Long_Name", "Standard_Name/Short_Name", 
        "Valid_Min", "Valid_Max", "Missing_Value", 
        "Fill_Value", "Scale_Factor", "Add_Offset", 
        "Coordinates", "Axis", "Description"
    ]
    if attributes is None:
        attributes = default_attributes

    if variables is None:
        variables = get_file_variables(self)

    for coord_name in variables:
        try:
            for attribute in attributes:
                self[coord_name].attrs[attribute] = input(f"{coord_name}: {attribute} - Enter value: ")
        except KeyError as e:
            raise KeyError(f"Variable {coord_name} not found.") from e
    if new_file:
        export_to_file(self,filename)
    read_netCDF_metadata(self)

#### Dictionary

In [8]:
def insert_netCDF_metadata_dict(self, dictionary, variables=None, new_file=False, filename="new_file.nc"):
    """
    Insert metadata into a netCDF file using a dictionary.

    Parameters:
    - self: The netCDF object.
    - dictionary: A dictionary containing the metadata to be inserted.
    - filename: The name of the netCDF file to be created or modified.
    - variables: A list of variables to insert the metadata into. If None, all variables will be used.
    - new_file: If True, a new file will be created. If False, the metadata will be inserted into an existing file.

    Raises:
    - ValueError: If dictionary is None.
    - AttributeError: If dictionary is not a dictionary.
    - FileExistsError: If the specified file already exists.
    - ValueError: If the filename is invalid.

    Returns:
    - None
    """
    if dictionary is None:
        raise ValueError("Please provide a dictionary.")
    if variables is None:
        variables = get_file_variables(self)
    if isinstance(dictionary, dict):
        for var in variables:
            for key, value in dictionary.items():
                self[var].attrs[key] = value
    else:
        raise AttributeError(f"{dictionary} is not a dictionary.")
    if new_file:
        export_to_file(self,filename)
    read_netCDF_metadata(self)

#### Json

In [9]:
def insert_netCDF_metadata_json(self, json_file, new_file=False, filename="new_file.nc"):
    """
    Inserts metadata from a JSON file into a netCDF file.

    Args:
        self: The instance of the class that the function belongs to.
        json_file (str): The path to the JSON file containing the metadata.
        new_file (bool, optional): A boolean flag indicating whether a new netCDF file should be created. Defaults to False.
        filename (str, optional): The name of the new netCDF file. Defaults to "new_file.nc".

    Raises:
        FileNotFoundError: If the specified filename already exists.

    Returns:
        None: The function modifies the attributes of the netCDF file directly.
    """
    schema = {
        "type": "object",
        "patternProperties": {
            ".*": {
                "type": "object",
                "patternProperties": {
                    ".*": {
                        "type": "string",
                    }
                }
            },
            "additionalProperties": False
            }   
        }   
    try:
        with open(json_file, 'r') as file:
            metadata = json.load(file)
    except IOError:
        raise IOError("Error opening JSON file. Please check if the file exists or if there are any permission issues.")
    try:
        # Validate JSON against schema
        jsonschema.validate(instance=metadata, schema=schema)
    except ValidationError as e:
        raise ValidationError(str(e))
    for var, attributes in metadata.items():
        for attr, value in attributes.items():
            self[var].attrs[attr] = value    
    if new_file:
        export_to_file(self,filename)
    read_netCDF_metadata(self)

#### All

In [10]:
def insert_netCDF_metadata(self, via="input", **kwargs):
    """
    Insert metadata into the netCDF file.

    Parameters:
        via (str, optional): The method of providing metadata. Can be "dict", "json", or "input". Defaults to "input".
        **kwargs: Additional keyword arguments for the specific method.

    Raises:
        ValueError: If `via` is not a valid metadata input.
    """
    via_lower = via.lower()
    try:
        if via_lower == "dict":
            insert_netCDF_metadata_dict(self, **kwargs)
        elif via_lower == "json":
            insert_netCDF_metadata_json(self, **kwargs)
        elif via_lower == "input":
            insert_netCDF_metadata_input(self, **kwargs)
        else:
            raise ValueError(f"{via} is not a valid metadata input.")
    except Exception as e:
        raise ValueError(f"Error inserting netCDF metadata: {str(e)}")

## Global Metadata

### Read

In [11]:
def get_attrs(self):
    return self.attrs

def read_global_metadata(self, attributes=None):
    """
    Print the global metadata attributes of the dataset.

    Args:
        attributes (list): List of attribute names to print. If None, all attributes will be printed.
    """
    attrs = get_attrs(self)
    if not attrs:
        print("No Global Attributes were found.")
    else:
        if attributes is None:
            for attr_name, attr_value in attrs.items():
                print(attr_name, ":", attr_value)
        else:
            for attr_name, attr_value in attrs.items():
                if attr_name in attributes:
                    print(attr_name, ":", attr_value)

### Insert

#### Input

In [12]:
def insert_netCDF_global_metadata_input(self, attributes=None, new_file=False, filename="new_file.nc"):
    """
    Insert global netCDF metadata attributes.

    Args:
        attributes (list): List of attributes to insert. If None, default attributes will be used.

    Returns:
        None
    """
    default_attributes = [
        "Title", "Institution", "Source",
        "History", "References", "Conventions",
        "Creator_Author", "Project", "Description"
    ]
    if attributes is None:
        attributes = default_attributes
    try:
        if not isinstance(attributes, list):
            raise ValueError("attributes must be a list")
        for attribute in attributes:
            if not isinstance(attribute, str):
                raise ValueError("attributes must contain only strings")
            self.attrs[attribute] = input(f"{attribute} - Enter value: ")
    except ValueError as e:
        print(f"An error occurred: {e}")
    if new_file:
        export_to_file(self, filename)

#### Dictionary

In [13]:
def insert_netCDF_global_metadata_dict(self, dictionary, new_file=False, filename="new_file.nc"):
    """
    Inserts a dictionary of global netCDF metadata into a netCDF file.

    Args:
        self: The instance of the class that the function belongs to.
        dictionary (dict): The dictionary of global netCDF metadata to be inserted.
        new_file (bool, optional): A boolean flag indicating whether to create a new netCDF file or not. Default is True.
        filename (str, optional): The name of the new netCDF file to be created. Default is "new_file.nc".

    Raises:
        TypeError: If the dictionary parameter is not a valid dictionary.
        FileNotFoundError: If the filename is invalid.
        FileExistsError: If the specified filename already exists.

    Returns:
        None. The function doesn't return any value.
    """
    if not isinstance(dictionary, dict):
        raise TypeError(f"{dictionary} is not a dictionary.")
    
    for key, value in dictionary.items():
        self.attrs[key] = value
    if new_file:
        export_to_file(self, filename)

#### Json

In [14]:
def insert_netCDF_global_metadata_json(self, json_file, new_file=False, filename="new_file.nc"):
    """
    Inserts global metadata from a JSON file into a netCDF file.

    Args:
        self: The instance of the class calling the function.
        json_file (str): The path to the JSON file containing the metadata.
        new_file (bool, optional): Indicates whether a new netCDF file should be created. Default is False.
        filename (str, optional): Specifies the name of the new netCDF file. Default is "new_file.nc".

    Raises:
        FileNotFoundError: If there is an error opening the JSON file.
        json.JSONDecodeError: If there is an error decoding the JSON file.
        ValueError: If the filename is invalid.
        FileExistsError: If the filename already exists.
        ValidationError: If the JSON file does not match the specified schema.

    Returns:
        None
    """
    schema = {
        "type": "object",
        "patternProperties": {
            ".*": { "type": "string" }
        },
        "additionalProperties": False
    }

    try:
        with open(json_file, 'r') as file:
            metadata = json.load(file)
    except FileNotFoundError:
        raise FileNotFoundError("Error opening JSON file. Please check if the file exists or if there are any permission issues.")
    except json.JSONDecodeError:
        raise json.JSONDecodeError("Error decoding JSON file. Please check if the file contains valid JSON.")
    
    try:
        # Validate JSON against schema
        jsonschema.validate(instance=metadata, schema=schema)
    except ValidationError as e:
        raise ValidationError(str(e))
    if new_file:
        export_to_file(self, filename)

# Parquet

### Read

In [15]:
def read_parquet_metadata(self, attributes=None, cols=None):
    """
    Reads the metadata of a Parquet file and prints the attributes of each column.

    Args:
        attributes (list, optional): A list of attributes to filter the metadata. If not provided, all attributes will be printed.
        cols (list, optional): A list of column names to filter the columns. If not provided, metadata of all columns will be printed.

    Returns:
        None

    Example Usage:
        # Example 1: Read metadata of all columns
        read_parquet_metadata()

        # Example 2: Read metadata of specific columns
        read_parquet_metadata(cols=['column1', 'column2'])

        # Example 3: Read metadata of specific attributes
        read_parquet_metadata(attributes=['attribute1', 'attribute2'])

        # Example 4: Read metadata of specific columns and attributes
        read_parquet_metadata(cols=['column1', 'column2'], attributes=['attribute1', 'attribute2'])
    """
    if isinstance(self, pd.DataFrame):
        self = pa.Table.from_pandas(self)
    if cols is None:
        for i in range(self.num_columns):
            field = self.field(i)
            col = field.name
            print(col)
            if field.metadata is None:
                print("    No attributes were found for this column.")
            else:
                metadata = {key.decode('utf-8'): value.decode('utf-8') for key, value in field.metadata.items()}
                for key, value in metadata.items():
                    if attributes is None or key in attributes:
                        print(f"    {key}: {value}")
    else:
        for i in range(self.num_columns):
            field = self.field(i)
            col = field.name
            if col in cols:
                print(col)
                if field.metadata is None:
                    print("    No attributes were found for this column.")
                else:
                    metadata = {key.decode('utf-8'): value.decode('utf-8') for key, value in field.metadata.items()}
                    if attributes:
                        for attr in attributes:
                            if attr in metadata:
                                print(f"    {attr}: {metadata[attr]}")
                            else:
                                print(f"    The '{attr}' attribute was not found in this column's metadata.")
                    else:
                        for key, value in metadata.items():
                            print(f"    {key}: {value}") 
        # TODO: Check why the else statement is much bigger than the if statement

### Insert

#### Input

In [16]:
def insert_parquet_metadata_input(self, attributes=None, cols=None, new_file=False, filename="new_file.parquet"):
    """
    Inserts metadata into a Parquet file.

    Args:
        attributes (list, optional): A list of attribute names to be used as metadata keys. Default value is ['Description', 'Units', 'Data Source', 'Valid Range or Categories'].
        cols (list, optional): A list of column names for which metadata needs to be inserted. Default value is all the columns in the DataFrame.

    Returns:
        pyarrow.Table: A Parquet table with metadata inserted.
    """
    default_attributes = ['Description', 'Units', 'Data Source', 'Valid Range or Categories']
    if attributes is None:
        attributes = default_attributes
    if cols is None:
        cols = list(self.columns)  # Suggestion 1: Use list(self.columns) instead of self.columns.tolist()
    metadata = []
    columns = self.columns  # Suggestion 2: Store self.columns in a variable
    cols_set = set(cols)  # Suggestion 3: Convert cols to a set for faster lookup
    for col in columns:
        if col in cols_set:
            col_metadata = {}
            for attribute in attributes:
                data = input(f"{col}: {attribute} - Enter value: ")
                col_metadata[attribute] = data
            metadata.append(col_metadata)
        else:
            metadata.append(None)
    dtypes = self.dtypes  # Suggestion 4: Get all column data types at once
    dtypes = ["string" if dtype == "category" else str(dtype) for dtype in dtypes]
    cols_dtypes = zip(columns, dtypes, metadata)
    schema = [pa.field(col, pa.type_for_alias(dtype), metadata=meta) for col, dtype, meta in cols_dtypes]
    table_schema = pa.schema(schema)
    table = pa.Table.from_pandas(self, schema=table_schema)
    if new_file:
        export_to_file(table, filename)
    return table

#### Dictionary

In [17]:
def insert_parquet_metadata_dict(self, dictionary, cols=None, new_file=False, filename="new_file.parquet"):
    """
    Insert metadata into a netCDF file using a dictionary.

    Parameters:
    - self: The netCDF object.
    - dictionary: A dictionary containing the metadata to be inserted.
    - filename: The name of the netCDF file to be created or modified.
    - variables: A list of variables to insert the metadata into. If None, all variables will be used.
    - new_file: If True, a new file will be created. If False, the metadata will be inserted into an existing file.

    Raises:
    - ValueError: If dictionary is None.
    - AttributeError: If dictionary is not a dictionary.
    - FileExistsError: If the specified file already exists.
    - ValueError: If the filename is invalid.

    Returns:
    - None
    """
    if dictionary is None:
        raise ValueError("Please provide a dictionary.")
    if cols is None:
        cols = list(self.columns)
    columns = self.columns  # Suggestion 2: Store self.columns in a variable
    dtypes = self.dtypes  # Suggestion 4: Get all column data types at once
    dtypes = ["string" if dtype == "category" else str(dtype) for dtype in dtypes]
    metadata = []
    if isinstance(dictionary, dict):
        cols_set = set(cols)  # Suggestion 3: Convert cols to a set for faster lookup
        for col in columns:
            if col in cols_set:
                metadata.append(dictionary)
            else:
                metadata.append(None)
        cols_dtypes = zip(columns, dtypes, metadata)
        schema = [pa.field(col, pa.type_for_alias(dtype), metadata=meta) for col, dtype, meta in cols_dtypes]
        table_schema = pa.schema(schema)
        table = pa.Table.from_pandas(self, schema=table_schema)
        if new_file:
            export_to_file(table, filename)
        return table  
    else:
        raise AttributeError(f"{dictionary} is not a dictionary.")  

#### Json

In [18]:
def insert_parquet_metadata_json(self, json_file, new_file=False, filename="new_file.parquet"):
    """
    Inserts metadata from a JSON file into a netCDF file.

    Args:
        self: The instance of the class that the function belongs to.
        json_file (str): The path to the JSON file containing the metadata.
        new_file (bool, optional): A boolean flag indicating whether a new netCDF file should be created. Defaults to False.
        filename (str, optional): The name of the new netCDF file. Defaults to "new_file.nc".

    Raises:
        FileNotFoundError: If the specified filename already exists.

    Returns:
        None: The function modifies the attributes of the netCDF file directly.
    """
    schema = {
    "type": "object",
    "patternProperties": {
        ".*": {
            "type": "object",
            "patternProperties": {
                ".*": {
                    "type": "string",
                }
            }
        },
        "additionalProperties": False
        }   
    }   
    try:
        with open(json_file, 'r') as file:
            json_data = json.load(file)
    except IOError:
        raise IOError("Error opening JSON file. Please check if the file exists or if there are any permission issues.")
    try:
        # Validate JSON against schema
        jsonschema.validate(instance=json_data, schema=schema)
    except ValidationError as e:
        raise ValidationError(str(e))
    cols_dtypes = get_cols_dtypes(self)
    cols_dtypes = [[col, "string"] if dtype == "category" else [col, str(dtype)] for col, dtype in cols_dtypes]
    metadata = []
    for col in cols_dtypes:
        if col[0] in json_data:
            col_metadata = json_data[col[0]]
            metadata.append(col_metadata)
        else:
            metadata.append(None)
    cols_dtypes = zip(cols_dtypes, metadata)
    schema = [pa.field(col_dtype[0], pa.type_for_alias(col_dtype[1]), metadata=meta) for col_dtype, meta in cols_dtypes]
    table_schema = pa.schema(schema)
    table = pa.Table.from_pandas(self, schema=table_schema)
    if new_file:
        export_to_file(table, filename)
    return table

# Export to File

In [19]:
def get_file_extension(filename):
    suffix = pathlib.Path(filename).suffix
    return suffix

In [20]:
def export_to_file(self, filename):
    suffixs = [".nc", ".parquet"]
    if not os.path.isfile(filename):
        if get_file_extension(filename) in suffixs:
            if get_file_extension(filename) == ".nc":
                self.to_netcdf(filename)
            elif get_file_extension(filename) == ".parquet":
                pq.write_table(self, filename, compression=None)        
        else:
            raise ValueError(f"Invalid file extension. Please provide a valid filename. Valid file extesions {suffixs}.")
    else:
        raise FileExistsError(f"{filename} already exists. Please change it or delete it.")

# Col Statics

In [21]:
def get_file_variables(self):
    variables = list(self.variables.keys())
    return variables

In [22]:
def get_dtypes(self):
    dtypes = (str(dtype) for dtype in self.dtypes)
    return dtypes

In [23]:
def get_dtype(self, col):
    return self[col].dtypes

In [24]:
def get_cols(self):
    cols = list(self.columns)
    return cols

In [25]:
def get_cols_dtypes(self):
    cols = get_cols(self)
    dtypes = get_dtypes(self)
    cols_dtypes = list(zip(cols, dtypes))
    return cols_dtypes

In [26]:
def convert_python_type(min_value, max_value):
            if isinstance(min_value, (int, np.integer)) and isinstance(max_value, (int, np.integer)):
                return int(min_value), int(max_value)
            elif isinstance(min_value, (float, np.floating)) and isinstance(max_value, (float, np.floating)):
                return float(min_value), float(max_value)
            elif isinstance(min_value, (np.bool_, bool)) and isinstance(max_value, (np.bool_, bool)):
                return bool(min_value), bool(max_value)
            else:
                return min_value, max_value

In [27]:
def get_best_dtypes(self, cols=None, convert=False):
    if cols is None:
        cols = self.columns
    for col in cols:    
        try:
            if pd.api.types.is_numeric_dtype(self[col]):
                col_min = self[col].min()
                col_max = self[col].max()
                col_min, col_max = convert_python_type(col_min, col_max)
                if isinstance(col_min, bool) and isinstance(col_max, bool):
                    if convert:
                        self[col] = self[col].astype("bool")
                    else:
                        print(f"The best dtype for {col} is bool") 
                elif isinstance(col_min, int) and isinstance(col_max, int):
                    if col_min >= -128 and col_max <= 127:
                        if convert:
                            self[col] = self[col].astype("int8")
                        else:
                            print(f"The best dtype for {col} is int8")
                            if df[col].nunique(dropna=False) == 2:
                                print("But consider changing it to bool, has you have 2 unique values so you can map the numbers to be True or False")
                    elif col_min >= -32768 and col_max <= 32767:
                        if convert:
                            self[col] = self[col].astype("int16")
                        else:
                            print(f"The best dtype for {col} is int16")
                    elif col_min >= -2147483648 and col_max <= 2147483647:
                        if convert:
                            self[col] = self[col].astype("int32")
                        else:
                            print(f"The best dtype for {col} is int32")
                    else:
                        if convert:
                            self[col] = self[col].astype("int64")
                        else:
                            print(f"The best dtype for {col} is int64")
                elif isinstance(col_min, float) and isinstance(col_max, float):
                    if col_min >= np.finfo(np.float16).min and col_min <= np.finfo(np.float16).max:
                        if convert:
                            self[col] = self[col].astype("float16")
                        else:
                            print(f"The best dtype for {col} is float16")
                    elif col_max >= np.finfo(np.float32).min and col_max <= np.finfo(np.float32).max:
                        if convert:
                            self[col] = self[col].astype("float32")
                        else:
                            print(f"The best dtype for {col} is float32")
                    else:
                        if convert:
                            self[col] = self[col].astype("float64")
                        else:
                            print(f"The best dtype for {col} is float64")
            else:
                if convert:
                    self[col] = self[col].astype("category")
                else:
                    print(f"The best dtype for {col} is category")
        except Exception as e:
            print(f"Erro ao processar a coluna {col}: {e}")
    if convert:
        return self

In [28]:
from IPython.display import display
def loop_to_col_info(self, func, new_col_name, show_df, get_total=False, output=True):
    try:
        if callable(func) is True:
            if show_df:
                dataframe = []
            if get_total:
                total = 0
            for col in self.columns:
                value = func(self, col, False)
                if get_total:
                    total += value
                if output:
                    func(self, col)     
                if show_df:
                    col_info = [col, value]
                    dataframe.append(col_info)
            if show_df:
                collums = ["Col_Name", new_col_name]
                if get_total:
                    dataframe.append(["Total", total])
                dataframe = pd.DataFrame(dataframe, columns=collums)
                if get_total:
                    n_rows = len(self.columns) + 1
                    display(dataframe.head(n_rows))
                    return total
                else:
                    return dataframe
            if get_total:
                if output:   
                    print(f"Total: {total}")
                return total
    except ValueError:
        print(f"{func} has to be a function.")

In [29]:
def get_col_memory_usage(self, col, output=True, unit="kb"):
    memory_usage = self[col].memory_usage(deep=True)
    if unit == "kb":
        value = round(memory_usage / 1024, 2)
        if output:
            print(f"{value} kb")
        return value    
    elif unit == "mb":
        value = round(memory_usage / (1024**2), 2)
        if output:
            print(f"{value} mb")
        return value
    elif unit == "b":
        if output:
            print(f"{memory_usage} b")
        return memory_usage
    else:
        raise ValueError(f"{unit} not supported. Units supported is bytes, kilobytes and megabytes.")

In [30]:
def get_memory_usage(self, show_df=False, get_total=True, output=False):
    return loop_to_col_info(self, func=get_col_memory_usage, new_col_name="Memory_Used", show_df= show_df, get_total=get_total, output=output)  

In [31]:
def get_col_memory_usage_percentage(self, col, output=True, unit="kb"):
    total_usage = get_memory_usage(self)
    col_usage = get_col_memory_usage(self, col, False, unit)
    value = round((col_usage/total_usage) * 100, 2)
    return value

In [32]:
def get_memory_usage_percentage(self, show_df=False, get_total=True, output=False):
    return loop_to_col_info(self, func=get_col_memory_usage_percentage, new_col_name="Memory_Used", show_df= show_df, get_total=get_total, output=output)  

In [33]:
get_memory_usage(df)

29.28

In [34]:
get_memory_usage_percentage(df, True)

Unnamed: 0,Col_Name,Memory_Used
0,Gender,4.13
1,Age,24.21
2,Siblings_on_Board,3.42
3,Parents_on_Board,3.42
4,Ticket_Price,24.21
5,Port_of_Embarkation,4.44
6,Class,4.37
7,Adult/Child,4.17
8,Alone,3.42
9,Survived,24.21


100.0

In [35]:
callable(get_col_memory_usage_percentage)

True

In [36]:
def get_col_null_count(self, col, print_output=True):
    value = self[col].isnull().sum()
    if print_output:
        print(f"The number of null values in {col} is {value}")
    return value

In [37]:
def get_nulls_count(self, show_df=False, get_total=True):
    return loop_to_col_info(self, func=get_col_null_count, new_col_name="Null_Values", show_df= show_df, get_total=get_total)  

In [38]:
def get_col_null_percentage(self, col, print_output=True):
    value = self[col].isnull().sum()
    value = round((value/len(self[col])) * 100, 2)
    if print_output:
        print(f"The percentage of null values in {col} is {value} %")
    return value

In [39]:
def get_nulls_percentage(self, show_df=False):
    return loop_to_col_info(self, func=get_col_null_percentage, new_col_name="Percentage_of_Null_Values", show_df= show_df)  

In [40]:
def get_col_number_of_unique_values(self, col, print_output=True):
    value = self[col].nunique()
    if print_output:
        print(f"The number of unique values in {col} is {value}")
    return value    

In [41]:
def get_number_of_unique_values(self, show_df=False, get_total=True):
    return loop_to_col_info(self, func=get_col_number_of_unique_values, new_col_name="Unique_Values", show_df= show_df, get_total=get_total)  

In [42]:
get_number_of_unique_values(df)

The number of unique values in Gender is 2
The number of unique values in Age is 88
The number of unique values in Siblings_on_Board is 7
The number of unique values in Parents_on_Board is 7
The number of unique values in Ticket_Price is 236
The number of unique values in Port_of_Embarkation is 3
The number of unique values in Class is 3
The number of unique values in Adult/Child is 2
The number of unique values in Alone is 2
The number of unique values in Survived is 2
Total: 352


352

In [43]:
def get_col_max_value(self, col, output=True):
    if self[col].dtype != "category" and self[col].dtype != "bool":
        value = self[col].max()
        if output:
            print(f"The maximum value in {col} is {value}")
        return value
    else:
        value = self[col].value_counts().idxmax()
        if output:
            print(f"The most common value in {col} is {value}")
        return value

In [44]:
def get_max_values(self, show_df=False):
    return loop_to_col_info(self, func=get_col_max_value, new_col_name="Max_Values", show_df= show_df)

In [45]:
def get_col_max_value_count(self, col, output=True):
    if self[col].dtype != "category":
        value = self[col].max()
        value = self[self[col] == value][col].count()
        if output:
            print(f"The number of ocurrences of the max value in {col} is {value}")
        return value
    else:
        value = self[col].value_counts().max()
        if output:
            print(f"The number of ocurrences of the most common value in {col} is {value}")
        return value

In [46]:
def get_max_value_count(self, show_df=False):
    return loop_to_col_info(self, func=get_col_max_value_count, new_col_name="Max_Values_Count", show_df= show_df)

In [47]:
def get_col_max_value_percentage(self, col, output=True):
    if self[col].dtype != "category":
        value = self[col].max()
        value = self[self[col] == value][col].count()
        value = round((value/len(self[col])) * 100, 2)
        if output:
            print(f"The percentage of max value in {col} is {value} %")
            print("Tip: It's possible for the percentage of max values being lower than the percentage of min values. So don't take this function seriously if you are using it for numerical columns.")
        return value
    else:
        value = self[col].value_counts().max()
        value = round((value/len(self[col])) * 100, 2)
        if output:
            print(f"The percentage of most common value in {col} is {value} %")
        return value

In [48]:
def get_max_value_percentage(self, show_df=False):
    return loop_to_col_info(self, func=get_col_max_value_count, new_col_name="Max_Values_Percentage", show_df= show_df)

In [49]:
def get_col_min_value(self, col, output=True):
    if self[col].dtype != "category":
        value = self[col].min()
        if output:
            print(f"The minimum value in {col} is {value}")
        return value
    else:
        value = self[col].value_counts().idxmin()
        if output:
            print(f"The less common value in {col} is {value}")
        return value

In [50]:
def get_min_value(self, show_df=False):
    return loop_to_col_info(self, func=get_col_min_value, new_col_name="Min_Values", show_df= show_df)

In [51]:
def get_col_min_value_count(self, col, output=True):
    if self[col].dtype != "category":
        value = self[col].min()
        value = self[self[col] == value][col].count()
        if output:
            print(f"The number of ocurrences of the minimum value in {col} is {value}")
        return value
    else:
        value = self[col].value_counts().min()
        if output:
            print(f"The number of ocurrences of the less common value in {col} is {value}")
        return value

In [52]:
def get_min_value_count(self, show_df=False):
    return loop_to_col_info(self, func=get_col_min_value_count, new_col_name="Min_Values_Count", show_df= show_df)

In [53]:
def get_col_min_value_percentage(self, col, output=True):
    if self[col].dtype != "category":
        value = self[col].min()
        value = self[self[col] == value][col].count()
        value = round((value/len(self[col])) * 100, 2)
        if output:
            print(f"The percentage of min value in {col} is {value} %")
            print("Tip: It's possible for the percentage of max values being lower than the percentage of min values. So don't take this function seriously if you are using it for numerical columns.")
        return value
    else:
        value = self[col].value_counts().min()
        value = round((value/len(self[col])) * 100, 2)
        if output:
            print(f"The percentage of less common value in {col} is {value} %")
        return value

In [54]:
def get_min_value_percentage(self, show_df=False):
    return loop_to_col_info(self, func=get_col_min_value_count, new_col_name="Min_Values_Percentage", show_df= show_df)

In [55]:
def get_dataframe_mem_insight(self, transpose = False):
    dataframe = []
    for col in self.columns:
        col_info = [
            col,
            get_dtype(df, col),
            get_best_dtypes(df, col),
            get_col_memory_usage(df, col),
            get_col_null_count(df, col, False),
            f"{get_col_null_percentage(df, col, False)}%",
            get_col_number_of_unique_values(df, col, False)
        ]
        dataframe.append(col_info)

    column_names = [
        'Column',
        'Dtype',
        'Recommend_Dtype',
        'Memory',
        'Missing_Values',
        'Percentage_of_Missing_Values',
        'Distinct_Values'
    ]
    dataframe = pd.DataFrame(dataframe, columns=column_names)
    if transpose:
        dataframe = dataframe.transpose()
        dataframe.columns = dataframe.iloc[0]
        dataframe = dataframe[1:]
    return dataframe.head(len(self.columns))

In [56]:
def get_dataframe_values_insight(self, transpose = False):
    dataframe = []
    for col in self.columns:
        col_info = [
            col,
            get_dtype(df, col),
            get_col_number_of_unique_values(df, col, False),
            get_col_max_value(df, col, False),
            get_col_max_value_count(df, col, False),
            f"{get_col_max_value_percentage(df, col, False)}%",
            get_col_min_value(df, col, False),
            get_col_min_value_count(df, col, False),
            f"{get_col_min_value_percentage(df, col, False)}%",
            get_col_null_count(df, col, False),
            f"{get_col_null_percentage(df, col, False)}%"
        ]
        dataframe.append(col_info)

    column_names = [
        'Column',
        'Dtype',
        'Distinct_Values',
        'Most_Common/Max_Value',
        'Occurrences_of_Max_Value',
        'Percentages_of_Occurrences_of_Max_Value',
        'Less_Common/Min_Value',
        'Occurrences_of_Min_Value',
        'Percentage_of_Occurrences_of_Min_Value',
        'Missing_Values',
        'Percentage_of_Missing_Values'
    ]
    dataframe = pd.DataFrame(dataframe, columns=column_names)
    if transpose:
        dataframe = dataframe.transpose()
        dataframe.columns = dataframe.iloc[0]
        dataframe = dataframe[1:]
    return dataframe.head(len(self.columns))

In [57]:
get_dataframe_values_insight(df)

Unnamed: 0,Column,Dtype,Distinct_Values,Most_Common/Max_Value,Occurrences_of_Max_Value,Percentages_of_Occurrences_of_Max_Value,Less_Common/Min_Value,Occurrences_of_Min_Value,Percentage_of_Occurrences_of_Min_Value,Missing_Values,Percentage_of_Missing_Values
0,Gender,category,2,M,577,64.76%,F,314,35.24%,0,0.0%
1,Age,float64,88,80.0,1,0.11%,0.42,1,0.11%,177,19.87%
2,Siblings_on_Board,int8,7,8,7,0.79%,0,608,68.24%,0,0.0%
3,Parents_on_Board,int8,7,6,1,0.11%,0,678,76.09%,0,0.0%
4,Ticket_Price,float64,236,512.33,3,0.34%,0.0,15,1.68%,0,0.0%
5,Port_of_Embarkation,category,3,Southampton,644,72.28%,Queenstown,77,8.64%,2,0.22%
6,Class,category,3,Third,491,55.11%,Second,184,20.65%,0,0.0%
7,Adult/Child,category,2,Adult,808,90.68%,Child,83,9.32%,0,0.0%
8,Alone,bool,2,True,537,60.27%,False,354,39.73%,0,0.0%
9,Survived,int64,2,1,342,38.38%,0,549,61.62%,0,0.0%


### **Next Steps** 

1 - Add more values insight

2 - Get the percentage of memory of the dataframe that the col is using

3 - Fix get_best_dtypes

4 - Cleaning Functions

5 - Analysis Functions

6 - Visualization Functions

In [58]:
# import pandas as pd

# dataframe = []
# for col in df.columns:
#     col_info = [
#         col,
#         get_dtype(df, col),
#         get_best_dtype(df, col),
#         get_col_memory_usage(df, col),
#         get_col_null_count(df, col, False),
#         f"{get_col_null_percentage(df, col, False)}%",
#         get_col_number_of_unique_values(df, col, False),
#         get_col_max_value(df, col, False),
#         get_col_max_value_count(df, col, False),
#         f"{get_col_max_value_percentage(df, col, False)}%",
#         get_col_min_value(df, col, False),
#         get_col_min_value_count(df, col, False),
#         f"{get_col_min_value_percentage(df, col, False)}%"
#     ]
#     dataframe.append(col_info)

# column_names = [
#     'Attributes',
#     'Dtype',
#     'Recommend_Dtype',
#     'Memory',
#     'Missing_Values',
#     'Percentage_of_Missing_Values',
#     'Distinct_Values',
#     'Most_Common/Max_Value',
#     'Occurrences_of_Max_Value',
#     'Percentages_of_Occurrences_of_Max_Value',
#     'Less_Common/Min_Value',
#     'Occurrences_of_Min_Value',
#     'Percentage_of_Occurrences_of_Min_Value'
# ]
# # total = ["Total", get_memory_usage(self)]
# # dataframe.append(total)
# dataframe = pd.DataFrame(dataframe, columns=column_names).transpose()

# # Set the first row as column names
# dataframe.columns = dataframe.iloc[0]

# # Drop the first row (now it's redundant)
# dataframe = dataframe[1:]

# # Display the updated DataFrame
# dataframe.head(len(dataframe.columns))

# Cleaning

In [98]:
df = pd.read_csv("./Titanic.csv")
df1 = pd.read_csv("./Sales_Data.csv")
df2 = pd.read_parquet("./Titanic_Cleaned.parquet")

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sex       891 non-null    object 
 1   age       714 non-null    float64
 2   sibsp     891 non-null    int64  
 3   parch     891 non-null    int64  
 4   fare      891 non-null    float64
 5   embarked  889 non-null    object 
 6   class     891 non-null    object 
 7   who       891 non-null    object 
 8   alone     891 non-null    bool   
 9   survived  891 non-null    int64  
dtypes: bool(1), float64(2), int64(3), object(4)
memory usage: 63.6+ KB


In [61]:
df["sex"] = df["sex"].astype('category')

In [None]:
teste = get_best_dtypes(df)

The best dtype for sex is category
The best dtype for age is float16
The best dtype for sibsp is int8
The best dtype for parch is int8
The best dtype for fare is float16
The best dtype for embarked is category
The best dtype for class is category
The best dtype for who is category
The best dtype for alone is bool
The best dtype for survived is int8
But consider changing it to bool, has you have 2 unique values so you can map the numbers to be True or False


In [None]:
teste.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   sex       891 non-null    category
 1   age       714 non-null    float16 
 2   sibsp     891 non-null    int8    
 3   parch     891 non-null    int8    
 4   fare      891 non-null    float16 
 5   embarked  889 non-null    category
 6   class     891 non-null    category
 7   who       891 non-null    category
 8   alone     891 non-null    bool    
 9   survived  891 non-null    int8    
dtypes: bool(1), category(4), float16(2), int8(3)
memory usage: 11.1 KB


## Cols

In [None]:
def capitalize_cols_name(self, cols = None):
    if cols is None:
        cols = self.columns
    for col in tqdm(cols, desc='Renaming Collumns', unit='Columns'):
        new_col = col.capitalize()
        self = self.rename(columns={col: new_col})
    return self

In [None]:
def lower_cols_name(self, cols = None):
    if cols is None:
        cols = self.columns
    for col in tqdm(cols, desc='Renaming Collumns', unit='Columns'):
        new_col = col.lower()
        self = self.rename(columns={col: new_col})
    return self

In [None]:
def upper_cols_name(self, cols = None):
    if cols is None:
        cols = self.columns
    for col in tqdm(cols, desc='Renaming Collumns', unit='Columns'):
        new_col = col.upper()
        self = self.rename(columns={col: new_col})
    return self

In [None]:
def remove_cols_blankspaces(self, cols = None):
    if cols is None:
        cols = self.columns
    for column in tqdm(cols, desc='Removing BlankSpaces', unit='BlankSpaces'):
        if ' ' in column:
            new_column = column.replace(' ', '')
            self = self.rename(columns={column: new_column})
        else:
            continue
    return self

In [None]:
def remove_cols_underscores(self, cols = None):
    if cols is None:
        cols = self.columns
    for column in tqdm(cols, desc='Removing Underscores', unit='Underscores'):
        if '_' in column:
            new_column = column.replace('_', '')
            self = self.rename(columns={column: new_column})
        else:
            continue
    return self

In [None]:
def remove_cols_character(self, cols = None, characters=['_'], add_blankspace=False):
    if cols is None:
        cols = self.columns
    for col in tqdm(cols, desc="", unit=""):
        for character in characters:
            if character in col:
                if not add_blankspace:
                    new_column = col.replace(character, '')
                else:
                    new_column = col.replace(character, ' ')
                self = self.rename(columns={col: new_column})
            else:
                continue
    return self

## Rows

In [None]:
def round_rows_value(self, col, decimals=2):
    dtype = get_dtype(self, col)
    if dtype not in ["categorical", "bool", "object"]:
        for index, row in tqdm(self.iterrows(), desc= "Rounding Rows Value", unit="Rows"):
            self.loc[index, col] = round(row[col], decimals) 
    else:
        raise ValueError(f"{col}'s dtype is not a numerical.")
    return self

In [None]:
def remove_rows_character(self, cols = None, characters=[','], add_blankspace=False):
    if cols is None:
        cols = self.columns
    for col in tqdm(cols, desc="", unit=""):
        dtype = get_dtype(self, col) 
        if dtype in ["categorical", "bool", "object"]:
            for character in characters:
                if not add_blankspace:
                    self[col] = self[col].apply(lambda x: x.replace(character, '') if isinstance(x, str) and character in x else x)
                else:
                    self[col] = self[col].apply(lambda x: x.replace(character, ' ') if isinstance(x, str) and character in x else x)
    return self

In [None]:
def capitalize_rows_string(self, cols = None):
    if cols is None:
        cols = self.columns
    for col in tqdm(cols, desc="", unit=""):
        dtype = get_dtype(self, col) 
        if dtype in ["categorical", "bool", "object"]:
            self[col] = self[col].apply(lambda x: x.capitalize() if isinstance(x, str) else x)
    return self

In [None]:
def lower_rows_string(self, cols = None):
    if cols is None:
        cols = self.columns
    for col in tqdm(cols, desc="", unit=""):
        dtype = get_dtype(self, col) 
        if dtype in ["categorical", "bool", "object"]:
            self[col] = self[col].apply(lambda x: x.lower() if isinstance(x, str) else x)
    return self

In [None]:
def upper_rows_string(self, cols = None):
    if cols is None:
        cols = self.columns
    for col in tqdm(cols, desc="", unit=""):
        dtype = get_dtype(self, col) 
        if dtype in ["categorical", "bool", "object"]:
            self[col] = self[col].apply(lambda x: x.upper() if isinstance(x, str) else x)
    return self

In [None]:
def remove_rows_with_missing_values(self, cols = None):
    if cols is None:
        self = self.dropna(axis=0)
    else:
        self = self.dropna(subset=cols)
    return self

In [None]:

def interpolate_rows_with_missing_values(self, cols = None):
    if cols is None:
        self = self.interpolate()
    else:
        for col in tqdm(cols, desc="", unit=""):
            dtype = get_dtype(self, col) 
            if dtype in ["categorical", "bool", "object"]:
                raise ValueError(f"{col} does not have numerical values. Please use mode to replace the missing values.")
            else:
                self = self[col].interpolate()
    return self

In [None]:
def foward_fill_rows_with_missing_values(self, cols = None):
    if cols is None:
        self = self.ffill()
    else:
        self = self.ffill(subset=cols)
    return self

In [None]:
def split_rows_string(self, col, new_cols, separator=",", delete_col=True, save_remain=True):
    split_result = self[col].str.split(separator, expand=True)
    split_result = split_result.fillna('')
    for i, new_col in enumerate(new_cols):
        if i == 0:
            self[new_col] = split_result[i]
        else:
            if save_remain:
                self[new_col] = split_result.loc[:, i:].apply(lambda x: separator.join(x), axis=1)
    if delete_col:
        self = self.drop([col], axis=1)
    return self

In [None]:
def backward_fill_rows_with_missing_values(self, cols = None):
    if cols is None:
        self = self.bfill()
    else:
        self = self.bfill(subset=cols)
    return self

In [None]:
def fill_rows_with_missing_values_mean(self, cols = None, decimals=2):
    if cols is None:
       self = self.fill(self.mean())
    else:
        for col in tqdm(cols, desc="", unit=""):
            dtype = get_dtype(self, col) 
            if dtype in ["categorical", "bool", "object"]:
                self = self[col].fillna(self[col].mode())
            else:
                self = self[col].fillna(round(self[col].mean(), decimals))
    return self

In [None]:
teste1 = remove_rows_with_missing_values(df, ["age"])

In [None]:
get_nulls_count(teste1)

The number of null values in sex is 0
The number of null values in age is 0
The number of null values in sibsp is 0
The number of null values in parch is 0
The number of null values in fare is 0
The number of null values in embarked is 2
The number of null values in class is 0
The number of null values in who is 0
The number of null values in alone is 0
The number of null values in survived is 0
Total: 2


2

In [None]:
import pandas as pd

# Creating a sample DataFrame
data = {
    'col1': ['apple, pie', 'banana! split', 'cherry? cake'],
    'col2': ['ice-cream', 'chocolate? cake', 'strawberry! shortcake']
}

teste = pd.DataFrame(data)
teste.head()


Unnamed: 0,col1,col2
0,"apple, pie",ice-cream
1,banana! split,chocolate? cake
2,cherry? cake,strawberry! shortcake


In [67]:
df.head()

Unnamed: 0,sex,age,sibsp,parch,fare,embarked,class,who,alone,survived
0,male,22.0,1,0,7.25,S,Third,man,False,0
1,female,38.0,1,0,71.2833,C,First,woman,False,1
2,female,26.0,0,0,7.925,S,Third,woman,True,1
3,female,35.0,1,0,53.1,S,First,woman,False,1
4,male,35.0,0,0,8.05,S,Third,man,True,0


In [90]:
def find(self, conditions, AND=True, OR=False):
    combined_condition = conditions[0]
    if AND:
        for condition in conditions[1:]:
            combined_condition = combined_condition & condition
    elif OR:
        for condition in conditions[1:]:
            combined_condition = combined_condition | condition
    elif OR and AND:
        raise ValueError("Both AND and OR cannot be True simultaneously.")
    else:
        raise ValueError("Either AND or OR must be True.")
    return self[combined_condition]

In [101]:
condition = df["sex"] == "female"
condition4 = df["who"] == "child"
condition2 = df["survived"] > 0
conditions = [condition, condition2, condition4]
adw = find(df, conditions)
adw.head()

Unnamed: 0,sex,age,sibsp,parch,fare,embarked,class,who,alone,survived
9,female,14.0,1,0,30.0708,C,Second,child,False,1
10,female,4.0,1,1,16.7,S,Third,child,False,1
22,female,15.0,0,0,8.0292,Q,Third,child,True,1
39,female,14.0,1,0,11.2417,C,Third,child,False,1
43,female,3.0,1,2,41.5792,C,Second,child,False,1


In [None]:
teste = remove_rows_character(teste, characters=["-", "?", "!", " ", ","], add_blankspace=True)
teste.head()

  0%|          | 0/2 [00:00<?, ?/s]

Unnamed: 0,col1,col2
0,apple pie,ice cream
1,banana split,chocolate cake
2,cherry cake,strawberry shortcake


In [None]:
df1["Product"].dtypes

dtype('O')

In [None]:
get_nulls_count(df1)

The number of null values in Order ID is 1
The number of null values in Product is 1
The number of null values in Quantity Ordered is 1
The number of null values in Price Each is 1
The number of null values in Order Date is 1
The number of null values in Purchase Address is 1
Total: 6


6

In [66]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372990 entries, 0 to 372989
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Order ID          371900 non-null  float64
 1   Product           371900 non-null  object 
 2   Quantity Ordered  371900 non-null  float64
 3   Price Each        371900 non-null  float64
 4   Order Date        371900 non-null  object 
 5   Purchase Address  371900 non-null  object 
dtypes: float64(3), object(3)
memory usage: 17.1+ MB


In [None]:
get_dtype(df1, "Product")

dtype('O')

In [63]:
df1 = get_best_dtypes(df1, convert=True)

In [64]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 372990 entries, 0 to 372989
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   Order ID          371900 non-null  float32 
 1   Product           371900 non-null  category
 2   Quantity Ordered  371900 non-null  float16 
 3   Price Each        371900 non-null  float16 
 4   Order Date        371900 non-null  category
 5   Purchase Address  371900 non-null  category
dtypes: category(3), float16(2), float32(1)
memory usage: 16.3 MB


In [None]:
df1 = remove_rows_with_missing_values(df1)

In [None]:
get_col_null_count(df1, ["Purchase Address"])

The number of null values in ['Purchase Address'] is Purchase Address    0
dtype: int64


Purchase Address    0
dtype: int64

In [None]:
split_rows_string(df1, "Purchase Address", ["Address", "City", "State"], delete_col=False)

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address,Address,City,State
0,150502.0,iPhone,1.0,700.00,02/18/19 01:35,"866 Spruce St, Portland, ME 04101",866 Spruce St,"Portland, ME 04101",ME 04101
1,150503.0,AA Batteries (4-pack),1.0,3.84,02/13/19 07:24,"18 13th St, San Francisco, CA 94016",18 13th St,"San Francisco, CA 94016",CA 94016
2,150504.0,27in 4K Gaming Monitor,1.0,389.99,02/18/19 09:46,"52 6th St, New York City, NY 10001",52 6th St,"New York City, NY 10001",NY 10001
3,150505.0,Lightning Charging Cable,1.0,14.95,02/02/19 16:47,"129 Cherry St, Atlanta, GA 30301",129 Cherry St,"Atlanta, GA 30301",GA 30301
4,150506.0,AA Batteries (4-pack),2.0,3.84,02/28/19 20:32,"548 Lincoln St, Seattle, WA 98101",548 Lincoln St,"Seattle, WA 98101",WA 98101
...,...,...,...,...,...,...,...,...,...
372985,295660.0,AAA Batteries (4-pack),2.0,2.99,11/04/19 14:17,"574 4th St, Los Angeles, CA 90001",574 4th St,"Los Angeles, CA 90001",CA 90001
372986,295661.0,USB-C Charging Cable,1.0,11.95,11/23/19 07:22,"359 1st St, Austin, TX 73301",359 1st St,"Austin, TX 73301",TX 73301
372987,295662.0,Lightning Charging Cable,1.0,14.95,11/13/19 16:12,"900 10th St, Boston, MA 02215",900 10th St,"Boston, MA 02215",MA 02215
372988,295663.0,AAA Batteries (4-pack),1.0,2.99,11/17/19 17:08,"592 Sunset St, Boston, MA 02215",592 Sunset St,"Boston, MA 02215",MA 02215


# Send images

## Telegram

In [None]:
base_url = "https://api.telegram.org/bot6148622889:AAFHdvQ_CxImlx1VEXE_vYhg4_2NFXk1OyU/sendPhoto"
def send_images_via_telegram(file_path, chat_id, caption="This is a caption"):
    my_file = open(file_path, 'rb')
    parameters = {
    "chat_id" : chat_id,
    "caption" : caption
    }
    files = {   
    "photo" : my_file
    }
    resp = requests.post(base_url, data=parameters, files=files)
    print(resp.status_code)

## Slack


In [None]:
slack_token = "xoxp-5452682117826-5446024818310-5452588829571-92e60adc3ecd07a736b6faea910b8831"
channel_id = "C05DAGDAPEX"
client = WebClient(token=slack_token)
def send_images_via_slack(file_path):
    try:
        response = client.files_upload(
                channels=channel_id,
                file=file_path
                )
        print(response)
    except SlackApiError as e:
        print(f"Error uploading file: {e.response['error']}")