# Clean 

In [12]:
from typing import Any, Optional, Union

import pandas as pd
import polars as pl

def calculate_missing_percentages(
    df: Union[pl.DataFrame, pd.DataFrame],
    missing_value: Optional[float] = None,
) -> pd.DataFrame:
    """
    Calculate the percentage of missing values in each column of a DataFrame.

    A cell counts as missing when it is null/None, when it is NaN, or when it
    equals ``missing_value`` (if one is given). Both backends apply the same
    definition, so the same data yields the same percentages whether it is held
    in Polars or in pandas.

    Args:
        df (pl.DataFrame or pd.DataFrame): The input DataFrame which might contain missing values.
        missing_value (float, optional): A custom value to treat as missing in addition to
                                         null/NaN. If None, only null/NaN cells are counted.

    Returns:
        pd.DataFrame: A single-row DataFrame (index 0) whose columns mirror the input columns
                      and whose values are the percentage (0-100) of missing cells.

    Raises:
        ValueError: If an unsupported DataFrame type is provided (neither Polars nor Pandas).
    """
    if isinstance(df, pl.DataFrame):
        total_rows = df.height

        percentage_exprs = []
        for name, dtype in zip(df.columns, df.dtypes):
            is_missing = pl.col(name).is_null()

            # Polars keeps NaN distinct from null, so float columns need an explicit
            # NaN check to match what pandas' isna() reports for the same data.
            if dtype in (pl.Float32, pl.Float64):
                is_missing = is_missing | pl.col(name).is_nan()

            if missing_value is not None:
                is_missing = is_missing | (pl.col(name) == missing_value)

            percentage_exprs.append((is_missing.sum() / total_rows * 100).alias(name))

        return df.select(percentage_exprs).to_pandas()

    if isinstance(df, pd.DataFrame):
        is_missing = df.isna()
        if missing_value is not None:
            # OR the two masks so that each cell is counted at most once.
            is_missing = is_missing | df.eq(missing_value)

        # Series -> one-row frame, so the shape matches the Polars branch.
        return (is_missing.sum() / len(df) * 100).to_frame().T

    raise ValueError("Unsupported DataFrame type. Please provide either a Polars or Pandas DataFrame.")


# Toy input shared by both examples: 'A' has one gap, 'B' has two, 'C' has none.
data = {
    "A": [1, 2, None, 4],
    "B": [None, 1, 2, None],
    "C": [5, 6, 7, 8],
}

# Example 1: the same data held in a Polars DataFrame
polars_df = pl.DataFrame(data)
polars_missing_percentages = calculate_missing_percentages(polars_df)
print("Missing Percentages (Polars DataFrame):", polars_missing_percentages, sep="\n")

# Example 2: the same data held in a Pandas DataFrame
pandas_df = pd.DataFrame(data)
pandas_missing_percentages = calculate_missing_percentages(pandas_df)
print("\nMissing Percentages (Pandas DataFrame):", pandas_missing_percentages, sep="\n")


Missing Percentages (Polars DataFrame):
      A     B    C
0  25.0  50.0  0.0

Missing Percentages (Pandas DataFrame):
      A     B    C
0  25.0  50.0  0.0


In [15]:
def remove_specific_values(
    df: pl.DataFrame,  # Input Polars DataFrame
    column_name: str,  # Column name to check
    value: Any,  # Specific value to remove (can be int, float, str, None, etc.)
) -> pl.DataFrame:
    """
    Remove rows from a Polars DataFrame based on a specific value in a given column.

    Only rows whose ``column_name`` cell equals ``value`` are dropped. Rows where the
    cell is null are kept, unless ``value`` itself is None, in which case the null
    rows are exactly the ones removed.

    Args:
        df (pl.DataFrame): The input Polars DataFrame.
        column_name (str): The name of the column where the specific value is checked.
        value (Any): The value that will trigger row removal if found in the specified column.
                     This can be an int, float, string, None, or other types.

    Returns:
        pl.DataFrame: A new DataFrame with rows containing the specified value removed.

    Raises:
        KeyError: If the column does not exist in the DataFrame.

    Note:
        No type check is performed here; if ``value`` cannot be compared with the
        column's dtype, whatever error Polars raises for that comparison propagates.
    """
    if column_name not in df.columns:
        raise KeyError(f"Column '{column_name}' not found in the DataFrame.")

    # A plain `!=` evaluates to null on null cells, and `filter` discards rows whose
    # predicate is null, so those rows would vanish as well. `ne_missing` compares
    # null as an ordinary value, so the predicate is always True or False.
    return df.filter(pl.col(column_name).ne_missing(value))


# Convert dataset 

In [14]:
import xarray as xr
import numpy as np
import polars as pl
import pandas as pd

def xr_to_polars(
    dataset: xr.Dataset,
    variables: Optional[list] = None,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    time_dim_candidates: Optional[list] = None,
) -> pl.DataFrame:
    """
    Convert an xarray Dataset to a Polars DataFrame in a universal manner.

    This function handles datasets with various time dimension names, allows
    selection of specific variables, and supports date range filtering.

    Args:
        dataset (xr.Dataset): The input xarray dataset.
        variables (list, optional): List of variable names to include. A single name given as a
            string is treated as a one-element list. If None, include all variables.
        start_date (str, optional): The start date for data selection (e.g., '1981-09-01').
        end_date (str, optional): The end date for data selection (e.g., '2024-10-17').
            If omitted while ``start_date`` is given, the range ends at the last time step
            for which the first time-dependent variable has at least one non-NaN value.
        time_dim_candidates (list, optional): Possible time dimension names, tried in order.
            Defaults to ['time', 'TIME', 'Time', 'datetime', 'DATE'].

    Returns:
        pl.DataFrame: The converted Polars DataFrame, with one column per dimension
        coordinate and one per data variable.

    Raises:
        ValueError: If no valid time dimension is found or if specified variables are missing.
    """
    # Build the default per call rather than sharing one mutable list in the signature.
    if time_dim_candidates is None:
        time_dim_candidates = ['time', 'TIME', 'Time', 'datetime', 'DATE']

    # Identify the time dimension (first candidate present in the dataset wins)
    time_dim = next((dim for dim in time_dim_candidates if dim in dataset.dims), None)
    if time_dim is None:
        raise ValueError(f"No recognized time dimension found. Tried: {time_dim_candidates}")

    # Select specific variables if provided
    if variables is not None:
        # A bare string would otherwise be iterated character by character below.
        variables = [variables] if isinstance(variables, str) else list(variables)
        missing_vars = [var for var in variables if var not in dataset.data_vars]
        if missing_vars:
            raise ValueError(f"Variables not found in dataset: {missing_vars}")
        dataset = dataset[variables]

    # Apply date filtering if specified
    if start_date or end_date:
        if not end_date:
            # Derive the end of the range from the data itself. Take the reference
            # variable from the (possibly subset) dataset instead of `variables[0]`,
            # which fails when `variables` is None or empty.
            reference_var = next(
                (name for name in dataset.data_vars if time_dim in dataset[name].dims),
                None,
            )
            if reference_var is not None:
                valid_times = dataset[reference_var].dropna(dim=time_dim, how='all')[time_dim]
                # If every time step is all-NaN there is no "last valid date";
                # leave end_date unset so the slice stays open-ended.
                if valid_times.size:
                    end_date = pd.to_datetime(valid_times.max().values).strftime('%Y-%m-%d')
        dataset = dataset.sel({time_dim: slice(start_date, end_date)})

    # xarray -> pandas (dimensions become ordinary columns) -> Polars
    df_pandas = dataset.to_dataframe().reset_index()
    return pl.from_pandas(df_pandas)

# Example usage: a small synthetic dataset with three time steps starting
# 2023-01-01 on a 2x2 lat/lon grid, filled with random values.
data = xr.Dataset(
    data_vars={
        "temperature": (("time", "lat", "lon"), 15 + 8 * np.random.randn(3, 2, 2)),
        "precipitation": (("time", "lat", "lon"), 10 * np.random.rand(3, 2, 2)),
    },
    coords={
        "time": pd.date_range("2023-01-01", periods=3),
        "lat": [10, 20],
        "lon": [30, 40],
    },
)

# Selection: keep only the temperature variable, over the full three-day window
variables, start_date, end_date = ["temperature"], "2023-01-01", "2023-01-03"

# Run the conversion and show the resulting Polars DataFrame
df_polars = xr_to_polars(data, variables, start_date, end_date)
print(df_polars)



shape: (12, 4)
┌─────────────────────┬─────┬─────┬─────────────┐
│ time                ┆ lat ┆ lon ┆ temperature │
│ ---                 ┆ --- ┆ --- ┆ ---         │
│ datetime[ns]        ┆ i64 ┆ i64 ┆ f64         │
╞═════════════════════╪═════╪═════╪═════════════╡
│ 2023-01-01 00:00:00 ┆ 10  ┆ 30  ┆ 8.58694     │
│ 2023-01-01 00:00:00 ┆ 10  ┆ 40  ┆ 16.899987   │
│ 2023-01-01 00:00:00 ┆ 20  ┆ 30  ┆ 31.264098   │
│ 2023-01-01 00:00:00 ┆ 20  ┆ 40  ┆ 25.477382   │
│ 2023-01-02 00:00:00 ┆ 10  ┆ 30  ┆ 10.841174   │
│ …                   ┆ …   ┆ …   ┆ …           │
│ 2023-01-02 00:00:00 ┆ 20  ┆ 40  ┆ 26.906074   │
│ 2023-01-03 00:00:00 ┆ 10  ┆ 30  ┆ 22.603768   │
│ 2023-01-03 00:00:00 ┆ 10  ┆ 40  ┆ -2.146431   │
│ 2023-01-03 00:00:00 ┆ 20  ┆ 30  ┆ 26.722581   │
│ 2023-01-03 00:00:00 ┆ 20  ┆ 40  ┆ 17.598968   │
└─────────────────────┴─────┴─────┴─────────────┘


# Convert variable type

In [None]:
import polars as pl
from typing import Union, Any

def convert_column_to_type(
    df: pl.DataFrame,
    column_name: str,
    dtype: Union[type, str],
) -> pl.DataFrame:
    """
    Convert a specified column in a Polars DataFrame to a desired type.

    Args:
    df (pl.DataFrame): The input Polars DataFrame.
    column_name (str): The name of the column to convert.
    dtype (Union[type, str]): The desired type for the column. Either a type object that
        is passed to ``cast`` as-is, or the name of a Polars data type exposed on the
        ``polars`` module (e.g. ``"Int64"``, ``"Float32"``).

    Returns:
    pl.DataFrame: A new DataFrame with the specified column converted to the desired type.

    Raises:
    ValueError: If the specified column doesn't exist in the DataFrame.
    TypeError: If ``dtype`` is a string that does not name a Polars data type.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    if isinstance(dtype, str):
        resolved = getattr(pl, dtype, None)
        # The polars namespace also holds functions and classes such as `col` or
        # `DataFrame`; accept only attributes that really are data types
        # (either a DataType class or an instance of one).
        names_a_dtype = isinstance(resolved, pl.DataType) or (
            isinstance(resolved, type) and issubclass(resolved, pl.DataType)
        )
        if not names_a_dtype:
            raise TypeError(f"Invalid dtype '{dtype}' provided.")
        dtype = resolved

    # `cast` keeps the column name, so the original column is replaced in place.
    return df.with_columns(pl.col(column_name).cast(dtype))

# Merge 

In [1]:
# xarrays 
# pandas 
# polars 