In [None]:
def find_instances(df):
    """
    Print a report of every column in a dataframe that holds mixed data types.

    Each value is labelled with the name of its Python type. Missing scalars
    (None, NaN, NaT, pd.NA) are all labelled 'NoneType', so a NaN is never
    reported as a genuine 'float' entry. Columns whose values all share one
    label are skipped.

    Args:
        df (pd.DataFrame): The dataframe to inspect.

    Returns:
        None. The report is written to standard output.
    """
    def type_label(value):
        # Only scalars can be "missing"; pd.isna on a list or array returns an
        # array, so containers are labelled by their own type instead.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return 'NoneType'
        return type(value).__name__

    for column in df.columns:
        # Label every value once per column and reuse the labels below.
        column_types = df[column].apply(type_label)
        type_counts = column_types.value_counts()

        # Only columns with more than one label are reported.
        if len(type_counts) <= 1:
            continue

        print(f"Data types in column '{column}':")
        total_nulls = df[column].isnull().sum()
        print(f"Number of null values in column '{column}': {total_nulls}")
        print(type_counts)

        # One example value and one null count per label.
        for t in type_counts.index:
            values_of_type = df[column][column_types == t]
            if t == 'NoneType':
                # Every entry under this label is missing by construction.
                example = 'None'
                null_count = values_of_type.size
            else:
                example = values_of_type.iloc[0]
                null_count = values_of_type.isnull().sum()

            print(f"Example of type {t}: {example}")
            print(f"Number of null values for type {t} in column '{column}': {null_count}")

        print()

In [None]:
def merge_city_names(*dicts):
    """
    Merge several city -> names mappings into one, dropping duplicate names.

    Names keep the order in which they are first seen across the inputs, so
    the result is the same on every run.

    Args:
        *dicts (dict): Mappings from a city key to an iterable of hashable names.

    Returns:
        dict: Each city mapped to a list of its unique names, in first-seen
        order. An empty dict when no mappings are given.
    """
    merged = {}
    for mapping in dicts:
        for city, names in mapping.items():
            # Dict keys act as an insertion-ordered set: duplicates collapse
            # while the first-seen position is kept.
            merged.setdefault(city, {}).update(dict.fromkeys(names))
    return {city: list(unique) for city, unique in merged.items()}

In [None]:
def merge_multiple_dfs_on_apn(DS, *dfs):
    """
    Inner-join DS with each extra DataFrame on 'assessor_identification_number'.

    The 'ain' column of DS is first renamed to
    'assessor_identification_number', which is the key used for every merge.
    The merges are applied one after another in the order given. Rows of the
    renamed DS whose key is absent from the final merge result are returned
    separately.

    Args:
        DS (pd.DataFrame): The starting DataFrame, carrying an 'ain' column.
        *dfs (pd.DataFrame): DataFrames to merge in, each carrying an
            'assessor_identification_number' column.

    Returns:
        pd.DataFrame: The result of all inner merges.
        pd.DataFrame: Rows of the renamed DS that did not survive the merges.
    """
    key = 'assessor_identification_number'
    base = DS.rename(columns={'ain': key})

    # Independent snapshot of the renamed input, used to report lost rows.
    snapshot = base.copy()

    merged_df = base
    for other in dfs:
        merged_df = merged_df.merge(other, on=key, how='inner')

    # A row was dropped when its key no longer appears after merging.
    survived = snapshot[key].isin(merged_df[key])
    dropped_rows_df = snapshot[~survived]

    return merged_df, dropped_rows_df

In [None]:
def convert_to_datetime(df, datetime_columns):
    """
    Converts specified YYYYMMDD columns to datetime, handling zero and NaN values.

    Values are parsed from their text form with the format '%Y%m%d'. Zero and
    NaN-like entries are mapped to the placeholder date 1970-01-01; any other
    value that does not parse becomes NaT. The columns are overwritten on the
    DataFrame that was passed in.

    Args:
        df (pd.DataFrame): The DataFrame to process (modified in place).
        datetime_columns (list): List of columns to convert to datetime.

    Returns:
        pd.DataFrame: The same DataFrame with the datetime columns processed.
    """
    for column in datetime_columns:
        # A float column stringifies as '20200115.0', which '%Y%m%d' rejects;
        # strip the trailing '.0' so such dates parse instead of becoming NaT.
        as_text = df[column].astype(str).str.replace(r'\.0+$', '', regex=True)
        # Zero / missing markers are swapped for the placeholder date.
        as_text = as_text.replace(['0', 'nan', 'NaT', np.nan, 'NaN'], '19700101')
        df[column] = pd.to_datetime(as_text, format='%Y%m%d', errors='coerce')
    return df

In [None]:
def clean_and_convert_strings(df, columns):
    """
    Normalise the given columns of a dataframe to strings.

    For each listed column that exists, every run of 'ÿ' inside a value is
    substituted with 'Unknown', null values are filled with 'Unknown', and the
    column is then cast to str. A listed column that is missing from the
    dataframe only triggers a printed warning. The columns are overwritten on
    the dataframe that was passed in.

    Args:
        df (pd.DataFrame): The dataframe to process (modified in place).
        columns (list of str): Names of the columns to treat as string columns.

    Returns:
        pd.DataFrame: The same dataframe with the processed columns.
    """
    for name in columns:
        if name not in df.columns:
            print(f"Warning: Column '{name}' not found in DataFrame.")
            continue

        # Regex replacement first, then fill whatever is still null.
        without_marker = df[name].replace(to_replace=r'ÿ+', value='Unknown', regex=True)
        cleaned = without_marker.fillna('Unknown')
        df[name] = cleaned.astype(str)
    return df

def convert_to_datetime(df, datetime_columns):
    """
    Converts specified YYYYMMDD columns to datetime, handling zero and NaN values.

    Values are parsed from their text form with the format '%Y%m%d' and
    normalized to midnight. Zero and NaN-like entries, as well as any value
    that fails to parse, end up as the placeholder date 1970-01-01. The
    columns are overwritten on the DataFrame that was passed in.

    Args:
        df (pd.DataFrame): The DataFrame to process (modified in place).
        datetime_columns (list): List of columns to convert to datetime.

    Returns:
        pd.DataFrame: The same DataFrame with the datetime columns processed.
    """
    placeholder_date = pd.Timestamp('1970-01-01')

    for column in datetime_columns:
        # A float column stringifies as '20200115.0', which '%Y%m%d' rejects;
        # strip the trailing '.0' so real dates are not lost to the
        # placeholder below.
        as_text = df[column].astype(str).str.replace(r'\.0+$', '', regex=True)
        # Replace invalid entries with a placeholder date
        as_text = as_text.replace(['0', 'nan', 'NaT', np.nan, 'NaN'], '19700101')
        parsed = pd.to_datetime(
            as_text, format='%Y%m%d', errors='coerce'
        ).dt.normalize()  # Normalize to remove the time part

        # Replace NaT values with the placeholder date. Filling per column
        # also keeps an empty datetime_columns list a harmless no-op.
        df[column] = parsed.fillna(placeholder_date)

    return df

def convert_to_int(df, columns):
    """
    Cast the given columns of a dataframe to int64.

    Each listed column that exists is passed through pd.to_numeric with
    errors='coerce', so values that cannot be read as numbers become NaN;
    those NaNs are filled with 0 and the column is cast to int64. A listed
    column that is missing from the dataframe only triggers a printed warning.
    The columns are overwritten on the dataframe that was passed in.

    Args:
        df (pd.DataFrame): The dataframe to process (modified in place).
        columns (list of str): Names of the columns to treat as integer columns.

    Returns:
        pd.DataFrame: The same dataframe with the processed columns.
    """
    for name in columns:
        if name not in df.columns:
            print(f"Warning: Column '{name}' not found in DataFrame.")
            continue

        # Anything non-numeric turns into NaN here and into 0 on the next line.
        numeric = pd.to_numeric(df[name], errors='coerce')
        df[name] = numeric.fillna(0).astype('int64')

    return df

def convert_to_float(df, columns):
    """
    Cast the given columns of a dataframe to float64.

    Each listed column that exists is passed through pd.to_numeric with
    errors='coerce', so values that cannot be read as numbers become NaN;
    those NaNs are filled with 0 and the column is cast to float64. A listed
    column that is missing from the dataframe only triggers a printed warning.
    The columns are overwritten on the dataframe that was passed in.

    Args:
        df (pd.DataFrame): The dataframe to process (modified in place).
        columns (list of str): Names of the columns to treat as float columns.

    Returns:
        pd.DataFrame: The same dataframe with the processed columns.
    """
    for name in columns:
        if name not in df.columns:
            print(f"Warning: Column '{name}' not found in DataFrame.")
            continue

        # Anything non-numeric turns into NaN here and into 0 on the next line.
        numeric = pd.to_numeric(df[name], errors='coerce')
        df[name] = numeric.fillna(0).astype('float64')

    return df

def detect_column_types_from_dict(df, column_types_dict):
    """
    Build a column -> data type mapping for the columns named in a dictionary.

    Columns that are absent from the DataFrame are skipped with a printed
    warning. For the others, the expected type is recorded directly when the
    column dtype already equals it; otherwise the Python types of the values
    are inspected and one of 'object', 'int64', 'float64' or 'datetime64' is
    recorded when it matches the expectation, with the expected type as the
    fallback.

    NOTE(review): every branch records either the expected type or the string
    literal it was just compared equal to, so for existing columns the result
    mirrors column_types_dict.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        column_types_dict (dict): A dictionary mapping column names to their
            desired data types.

    Returns:
        dict: A dictionary mapping the column names found in the DataFrame to
        the recorded data types.
    """
    resolved = {}
    for name, expected_type in column_types_dict.items():
        if name not in df.columns:
            print(f"Warning: Column '{name}' not found in DataFrame.")
            continue

        # Fast path: the column dtype already matches the expectation.
        if pd.api.types.is_dtype_equal(df[name], expected_type):
            resolved[name] = expected_type
            continue

        # Mixed type detection based on the Python type of each value.
        observed = set(df[name].apply(type))
        has_text = str in observed or bytes in observed
        has_number = int in observed or float in observed

        if expected_type == 'object' and has_text:
            outcome = 'object'
        elif expected_type == 'int64' and has_number:
            outcome = 'int64'
        elif expected_type == 'float64' and has_number:
            outcome = 'float64'
        elif expected_type in ['datetime64'] and pd.api.types.is_datetime64_any_dtype(df[name]):
            outcome = 'datetime64'
        else:
            outcome = expected_type
        resolved[name] = outcome
    return resolved

def apply_column_conversions(df, column_types):
    """
    Convert DataFrame columns according to a column -> target type mapping.

    Columns are grouped by target type and converted group by group:
    'object' columns are cast to str, 'int64' columns go through
    pd.to_numeric (coercing failures) with NaN filled by 0 before the int64
    cast, 'float64' columns go through pd.to_numeric (coercing failures) and
    keep NaN, and 'datetime64' columns are handed to convert_to_datetime.
    Columns mapped to any other type are left untouched.

    Args:
        df (pd.DataFrame): The DataFrame to process.
        column_types (dict): A dictionary mapping column names to their
            desired data types.

    Returns:
        pd.DataFrame: The DataFrame with the columns converted to the
        specified data types.
    """
    string_columns, int_columns, float_columns, datetime_columns = [], [], [], []

    # Classify columns by the target data type; the first matching label wins.
    groups = (
        ('object', string_columns),
        ('int64', int_columns),
        ('float64', float_columns),
        ('datetime64', datetime_columns),
    )
    for column, dtype in column_types.items():
        for label, bucket in groups:
            if dtype == label:
                bucket.append(column)
                break

    # Apply conversions, one group at a time.
    if string_columns:
        df = df.astype(dict.fromkeys(string_columns, 'str'))
    if int_columns:
        as_numbers = df[int_columns].apply(pd.to_numeric, errors='coerce')
        df[int_columns] = as_numbers.fillna(0).astype('int64')
    if float_columns:
        as_numbers = df[float_columns].apply(pd.to_numeric, errors='coerce')
        df[float_columns] = as_numbers.astype('float64')
    if datetime_columns:
        df = convert_to_datetime(df, datetime_columns)

    return df

def clean_column_names(df):
    """
    Normalise the column names of a DataFrame.

    Dashes are removed, the names are lowercased, and spaces are replaced
    with underscores. The column labels are overwritten on the DataFrame that
    was passed in.

    Args:
        df (pd.DataFrame): The DataFrame to process (modified in place).

    Returns:
        pd.DataFrame: The same DataFrame with cleaned column names.
    """
    names = df.columns.str.replace('-', '', regex=False)
    names = names.str.lower()
    names = names.str.replace(' ', '_')
    # No dash survives the first step, so this pass leaves the names as they are.
    names = names.str.replace('-', '_')
    df.columns = names
    return df