---
title: Data Manipulation
author: Juma Shafara
date: "10-31-2024"
---

In [None]:
#| default_exp manipulate

In [None]:
#| hide
#| export

import pandas as pd

def set_row_as_header(df:pd.DataFrame=None, row_num:int=None) -> pd.DataFrame:
    """
    Return a new DataFrame whose column names are taken from one of its rows.

    The chosen row and every row before it are dropped; the remaining rows
    keep their original index labels. Column names are converted to strings.
    The input DataFrame is not modified.

    Parameters:
    df (pandas.DataFrame): Input dataframe
    row_num (int): Position of the row to use as header (0-based; negative
        values count from the end, as with ``DataFrame.iloc``)

    Returns:
    pandas.DataFrame: The rows after ``row_num`` with the new string column names

    Raises:
    ValueError: If ``df`` or ``row_num`` is not provided
    IndexError: If ``row_num`` is out of bounds (raised by ``DataFrame.iloc``)
    """

    if row_num is None or df is None:
        raise ValueError("df and row_num must be provided")

    # Positional lookup; this also validates that row_num is within bounds
    new_header = df.iloc[row_num]

    # Translate a negative position into the equivalent forward position so
    # that "everything after the header row" is computed correctly
    # (e.g. -1 + 1 == 0 would otherwise keep the whole frame).
    first_data_row = row_num + 1 if row_num >= 0 else len(df) + row_num + 1

    # Copy so assigning the columns never touches (or warns about) the caller's data
    result = df.iloc[first_data_row:].copy()
    result.columns = new_header

    # set header as strings
    result.columns = result.columns.astype(str)

    return result

In [None]:
#| hide
#| export

def create_dec_dummies(df):
    """
    Replace every column whose name ends with '_dec' by dummy (count) columns.

    Each '_dec' column is expected to hold list-like cells. The lists are
    exploded, one dummy column is created per distinct value (named
    ``<column>_<value>``), and the dummies are summed back to one row per
    original index label. The original '_dec' columns are dropped.

    Parameters:
    df (pandas.DataFrame): Input dataframe

    Returns:
    pandas.DataFrame: New dataframe with the '_dec' columns replaced by their
        dummy columns. If there are no '_dec' columns, a copy of ``df``.

    Notes:
    Rows are regrouped by index label, so this assumes ``df`` has a unique index.
    """
    # Find columns that end with '_dec' (non-string labels cannot match)
    dec_columns = [
        col for col in df.columns
        if isinstance(col, str) and col.endswith('_dec')
    ]

    # Nothing to expand: pd.concat would fail on an empty list
    if not dec_columns:
        return df.copy()

    # Build the dummies one column at a time. Exploding several columns and
    # concatenating them side by side fails when their lists differ in length,
    # because the resulting duplicated indexes no longer line up.
    dummy_frames = [
        pd.get_dummies(df[col].explode(), prefix=col)
        .groupby(level=0)
        .sum()
        .reindex(df.index)  # restore the caller's row order after grouping
        for col in dec_columns
    ]

    # Original non-'_dec' columns first, then the dummies in column order
    return pd.concat([df.drop(columns=dec_columns), *dummy_frames], axis=1)

## Set specific row as header

To set a specific row as the header (column names), use the `set_row_as_header` function from `rtvpy.manipulate`, as shown below.

In [None]:
import pandas as pd
from rtvpy.manipulate import set_row_as_header

# Load the CSV file into a DataFrame
dataset = pd.read_csv('2022_data_selected.csv')
# Use the row at position 0 as the column names and keep only the rows after it
dataset = set_row_as_header(df=dataset, row_num=0)

# Display the resulting column labels
dataset.columns

Index(['1.0', '6.0', '55.0', '1.0', '0.3265457238059978', '0', '0', '0', '0',
       '1', '1', '0', '2.0', '0', '1.0', '1', '0', '97', '0', '0.0', '-99',
       '0', '1', 'Struggling'],
      dtype='object', name=0)

In [None]:
#| hide
#| export 
import pandas as pd

def create_region_district_mapping():
    """
    Build the district -> region lookup table.

    Districts are listed per region and then flattened into a single
    dictionary keyed by district name. Keys are lowercase so they can be
    matched against lowercased input; the GAC / Standard variants are listed
    explicitly alongside the plain district name.

    Returns:
    dict: Mapping of lowercase district name to region name
    """
    districts_by_region = {
        'South_West': (
            'mitooma',
            'rubanda',
            'kanungu',
            'rukungiri',
            'rubirizi',
            'rukiga',
        ),
        # NOTE(review): kagadi has underscore variants ('kagadi_gac',
        # 'kagadi_standard') but kyenjojo does not — confirm whether intended.
        'Mid_West': (
            'kagadi',
            'kagadi - gac',
            'kagadi - standard',
            'kagadi_gac',
            'kagadi_standard',
            'kyenjojo',
            'kyenjojo - gac',
            'kyenjojo - standard',
            'kibaale',
            'kiryandongo',
        ),
        'Eastern': (
            'kaliro',
            'luuka',
        ),
    }

    # Invert the grouping: one entry per district pointing at its region
    return {
        district: region
        for region, districts in districts_by_region.items()
        for district in districts
    }

In [None]:
#| hide
#| export 


def standardize_district_names(df, district_col='pre_district'):
    """
    Return a copy of the dataframe with normalized district names

    Leading/trailing whitespace is removed from the district column and the
    text is lowercased. The input dataframe is left unchanged.

    Parameters:
    df (pandas.DataFrame): Input dataframe
    district_col (str): Name of the district column

    Returns:
    pandas.DataFrame: Copy of the input with the district column standardized
    """
    standardized = df.copy()

    # Trim first, then lowercase, using pandas' vectorized string methods
    trimmed = standardized[district_col].str.strip()
    standardized[district_col] = trimmed.str.lower()

    return standardized

In [None]:
#| hide
#| export 


def populate_region_column(df, district_col='pre_district', region_col='region'):
    """
    Fill a region column by looking up each row's district

    The district column is standardized (stripped and lowercased) on a copy of
    the input, then mapped through the district-region mapping. Districts
    without a mapping entry get a missing value in the region column and are
    reported in a printed warning.

    Parameters:
    df (pandas.DataFrame): Input dataframe
    district_col (str): Name of the district column
    region_col (str): Name of the region column to be populated

    Returns:
    pandas.DataFrame: Copy of the input with standardized district names and
        the populated region column
    """
    district_to_region = create_region_district_mapping()

    # Work on a standardized copy so lookups are case/whitespace insensitive
    result = standardize_district_names(df, district_col)
    result[region_col] = result[district_col].map(district_to_region)

    # Report every distinct district that has no region assigned
    missing_region = result[region_col].isna()
    unmatched = result.loc[missing_region, district_col].unique()
    if len(unmatched) > 0:
        print(f"Warning: The following districts were not found in the mapping: {unmatched}")

    return result

In [None]:
#| hide
#| export 

import pandas as pd
import numpy as np
import ast

def create_dummies_from_list(df, column_name, prefix=None):
    """
    Create dummy variables from a column containing list-like strings,
    splitting into individual values.

    Each cell is parsed once. A cell may be a string representation of a
    list/tuple/set (e.g. "['a', 'b']") or an actual list/tuple/set. Missing
    cells, cells that cannot be parsed, and cells that do not parse to a
    list-like value contribute no values (all of their dummies are 0).

    Parameters:
    -----------
    df : pandas DataFrame
    column_name : str
        Name of the column containing list-like strings
    prefix : str, optional
        Prefix for the dummy columns. Defaults to the lowercased column name.

    Returns:
    --------
    pandas DataFrame
        One 0/1 column per distinct value, named ``<prefix>_<cleaned value>``,
        indexed like ``df``. Columns are ordered by sorted value so the result
        is deterministic. Values that clean to the same column name are
        combined (1 if any of them is present).
    """
    # Set prefix
    if prefix is None:
        prefix = column_name.lower()

    list_like = (list, tuple, set, frozenset)

    def parse_cell(cell):
        """Return the set of stripped string values held by one cell."""
        if isinstance(cell, list_like):
            items = cell
        elif isinstance(cell, str):
            try:
                # Safely evaluate string representation of list
                items = ast.literal_eval(cell)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return frozenset()
            # A parsed scalar (e.g. a plain string or number) is not a list
            if not isinstance(items, list_like):
                return frozenset()
        else:
            # NaN / None / any other scalar
            return frozenset()
        return frozenset(str(item).strip() for item in items)

    # Parse every row exactly once and reuse the result for all dummy columns
    parsed_rows = [parse_cell(cell) for cell in df[column_name]]

    # Sorted so the column order does not depend on set iteration order
    all_values = sorted(set().union(*parsed_rows))

    # '/' and ' ' become '_'; ',', '(' and ')' are removed
    cleanup_table = str.maketrans({'/': '_', ' ': '_', ',': None, '(': None, ')': None})

    # Create dummy columns
    dummies = {}
    for value in all_values:
        # Clean column name
        col_name = f"{prefix}_{value.lower().translate(cleanup_table)}"

        flags = [1 if value in row_values else 0 for row_values in parsed_rows]

        # Different raw values can clean to the same name; merge instead of overwrite
        if col_name in dummies:
            flags = [old | new for old, new in zip(dummies[col_name], flags)]
        dummies[col_name] = flags

    return pd.DataFrame(dummies, index=df.index)

In [None]:
#| hide
# Run nbdev's export step, which writes the cells marked `#| export` to the library module
import nbdev; nbdev.nbdev_export()

In [None]:
#| hide