In [1]:
#Read in Packages
import pandas as pd
import re

#Read in Data
#Both sheets live in the same workbook: "raw_data" holds the survey export and
#"columns" lists (in its 'Columns' column) the headers we want to keep.
SOURCE_WORKBOOK = "Chronic Disease 5X15.xlsx"
_5_x_15_df = pd.read_excel(SOURCE_WORKBOOK, sheet_name="raw_data")
request_columns_df = pd.read_excel(SOURCE_WORKBOOK, sheet_name="columns")

#Keep only the requested columns, in the order they are listed:
column_list = request_columns_df['Columns'].to_list()
_5_x_15_df = _5_x_15_df.loc[:, column_list]

#Bookkeeping for the sheet-description tab: an empty frame with the final headers,
#plus a list that collects one {'Excel Sheet', 'Description'} record per output sheet.
excel_sheets_list = []
excel_sheets_info = pd.DataFrame(columns=['Excel Sheet', 'Description'])

#Dictionary of every dataframe that will be written to the Excel file (sheet name -> frame).
#The descriptions entry is inserted first so that it is the first sheet in the workbook.
excel_dfs = {"Excel Sheets Descriptions": excel_sheets_info}

#Show every row and column when a dataframe is displayed in the notebook:
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

Data Cleaning: Filter Rows and Columns

In [2]:
#Apply filters:
#Keep completed responses only, then discard the bookkeeping columns that are no longer needed:
completed_mask = _5_x_15_df["Finished"] == True
_5_x_15_df = _5_x_15_df[completed_mask].drop(columns=["Progress", "Finished"])

#Strip boilerplate wording from the column names (literal, non-regex replacements, applied in order):
header_cleanups = [
    (" Select all that apply.", ""),
    ("Does your health department perform any of the actions below regarding ", ""),
    ("interventions?", "interventions"),
]
cleaned_headers = _5_x_15_df.columns
for fragment, replacement in header_cleanups:
    cleaned_headers = cleaned_headers.str.replace(fragment, replacement, regex=False)
_5_x_15_df.columns = cleaned_headers

#Rename to match old data
legacy_column_names = {
    'What District do you work in?':
        "District",
    'What is your role at the health department?':
        "Role at County Health Department (CHD)",
    'What level of knowledge do you have around 5x15 interventions':
        'Knowledge around 5x15 interventions',
    'Select the area of interventions used at your CHD, to the best of your knowledge.':
        "Area of inteventions used at CHD (To the best of the employee's knowledge).",
    'Tobacco intervention steps. Select any of the below steps that your health department performs.':
        "Tobacco intervention steps",
}
_5_x_15_df = _5_x_15_df.rename(columns=legacy_column_names)

#One diabetes answer option contains a comma, which would later be mistaken for the
#separator between selected options. Re-word it with parentheses before any parsing happens:
diabetes_column = 'Diabetes 5x15 interventions'
_5_x_15_df[diabetes_column] = _5_x_15_df[diabetes_column].str.replace(
    "program, like Conversation Maps.",
    "program (like Conversation Maps).",
    regex=False)

Create our main dataframe that will link to all the other pivot dataframes:

In [3]:
#Create a 'Unique_ID' column for Tableau visuals (OSDH_1, OSDH_2, ... in row order):
_5_x_15_df['Unique_ID'] = [f'OSDH_{i}' for i in range(1, len(_5_x_15_df) + 1)]

#A response counts as 'District' when its District answer is text containing the word
#'District'; anything else (including a missing answer) is treated as 'Central Office'.
#Mapping over the single column avoids a row-wise apply and also works on an empty frame.
_5_x_15_df['Central Office or District'] = _5_x_15_df['District'].map(
    lambda district: 'District'
    if isinstance(district, str) and 'District' in district
    else 'Central Office')

#Define County Health Department column list (the question is exported as ten numbered columns):
CHD_column_list = [f'What Health Department do you work with?-{slot}' for slot in range(1, 11)]

#Remove " CHD" from the string values in each column.
#A per-value replace is used instead of the .str accessor because a slot nobody filled in
#is loaded as an all-NaN numeric column, on which .str raises AttributeError.
for column in CHD_column_list:
    _5_x_15_df[column] = _5_x_15_df[column].map(
        lambda value: value.replace(" CHD", "") if isinstance(value, str) else value)

#Combine values across the specified columns into one comma-separated string, excluding null values
_5_x_15_df['County Health Department (CHD)'] = [
    ', '.join(str(value) for value in chd_values if pd.notna(value))
    for chd_values in _5_x_15_df[CHD_column_list].itertuples(index=False, name=None)
]

#Single-value survey items plus identifiers; the multi-select items are pivoted separately.
_5_x_15_non_pivot_df = _5_x_15_df[['Unique_ID',
                                   'Central Office or District',
                                   'District',
                                   'Have you heard of the 5x15 Interventions for Chronic Disease before today?',
                                   'Role at County Health Department (CHD)',
                                   'Knowledge around 5x15 interventions']].copy()


#Register the meta data sheet; it is added right after the descriptions sheet,
#so it appears ahead of every matrix sheet in the workbook:
excel_dfs["meta_data"] = _5_x_15_non_pivot_df
excel_sheets_list.append({'Excel Sheet': "meta_data",
                          'Description': 'Contains the meta data along with the responses to single value survey items.'})

In [4]:
#Keep only the responses classified as "District"; .copy() gives an independent frame:
is_district_response = _5_x_15_df['Central Office or District'].eq("District")
_5_x_15_district_df = _5_x_15_df.loc[is_district_response].copy()


def create_matrix(df, column_name, id_column, excel_sheet, desc):
    """Pivot a multi-select survey column into a one-column-per-option indicator matrix.

    Each cell of ``df[column_name]`` is expected to hold the selected options as one
    comma-separated string. Commas inside parentheses are treated as part of the option
    text, not as separators. The resulting matrix has one column per distinct option
    (sorted, so the column order is the same on every run) plus ``id_column``.

    The matrix is stored in the module-level ``excel_dfs`` under ``excel_sheet`` and a
    ``{'Excel Sheet', 'Description'}`` record is appended to the module-level
    ``excel_sheets_list``. The input ``df`` is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source responses; must contain ``column_name`` and ``id_column``.
    column_name : str
        Column holding the comma-separated selections.
    id_column : str
        Column copied into the matrix so rows can be linked back to the responses.
    excel_sheet : str
        Key used in ``excel_dfs``. ``"CHD_mx"`` produces True/False cells, every other
        sheet produces "Yes"/"No" cells. ``"area_interv_mx"`` keeps all rows; every
        other sheet drops rows in which no option was selected.
    desc : str
        Description recorded for the sheet.

    Returns
    -------
    None
    """

    def split_options(cell):
        # Missing or non-text cells count as "nothing selected".
        if not isinstance(cell, str):
            return []
        # Replace commas outside parentheses with semicolons, then split on those.
        normalised = re.sub(r',(?![^(]*\))', ';', cell)
        # Strip every token: the separator is usually ", ", and an unstripped " B"
        # would never match the stripped option name "B" during the membership test.
        return [token.strip() for token in normalised.split(';') if token.strip()]

    # Work on a derived Series so the caller's column keeps its original strings.
    selections = df[column_name].map(split_options)

    # Sorted rather than raw set order, which can differ between interpreter runs.
    options = sorted({option for picked in selections for option in picked})

    # The CHD matrix uses booleans; every other matrix uses Yes/No labels.
    if excel_sheet == "CHD_mx":
        selected, not_selected = True, False
    else:
        selected, not_selected = "Yes", "No"

    transformed_df = pd.DataFrame(
        {option: [selected if option in picked else not_selected for picked in selections]
         for option in options},
        index=df.index)

    # Add the unique ID column back
    transformed_df[id_column] = df[id_column]

    # Remove rows where no option was selected. The comparison uses this matrix's own
    # "not selected" marker, so it works for both the boolean and the Yes/No variant.
    if excel_sheet != "area_interv_mx":
        nothing_selected = (transformed_df
                            .drop(columns=[id_column])
                            .eq(not_selected)
                            .all(axis=1))
        transformed_df = transformed_df.loc[~nothing_selected]

    # Register the matrix and its description for the Excel export.
    excel_dfs[excel_sheet] = transformed_df
    excel_sheets_list.append({'Excel Sheet': excel_sheet, 'Description': desc})

#Which county health departments each district response came from:
create_matrix(_5_x_15_district_df,
              column_name="County Health Department (CHD)",
              id_column="Unique_ID",
              excel_sheet="CHD_mx",
              desc="True False Matrix for CHDs. Tells us which counties a response came from.")

#Which areas of intervention each respondent reports for their CHD.
#NOTE: the column name (including its spelling) must match the renamed survey column exactly.
create_matrix(_5_x_15_district_df,
              column_name="Area of inteventions used at CHD (To the best of the employee's knowledge).",
              id_column="Unique_ID",
              excel_sheet="area_interv_mx",
              desc=("True False Matrix for area of interventions. " +
                    "Tells us which areas of intervention are used at their CHD to the best of their knowledge."))

#Parallel lists: the i-th survey item holds the actions/steps for the i-th area of intervention.
matrix_dict = {
    "Survey Item":
        ["Heart Disease 5x15 interventions",
         "Healthy Brain 5x15 interventions",
         "Tobacco intervention steps",
         "Take Charge (breast and cervical cancer) interventions",
         "Diabetes 5x15 interventions"],

    "Area of Intervention":
        ["Heart Disease",
         "Healthy Brain",
         "Tobacco",
         "Cancer (Cervical, Breast)",
         "Diabetes"]}

# Loop through the dictionary and call create_matrix for each (survey item, area) pair.
# Each matrix is stored on a sheet named "<area>_mx".
for survey_item, area in zip(matrix_dict["Survey Item"], matrix_dict["Area of Intervention"]):
    create_matrix(
        _5_x_15_district_df,
        column_name=survey_item,
        id_column="Unique_ID",
        excel_sheet=(area + "_mx"),
        # The two sentences are joined with a space so they do not run together in the workbook.
        desc=("True False Matrix for actions or steps being used for " + area + ". " +
              "If a respondent said 'yes' to " + area + " being used in their CHD, this tells us which actions/steps they are aware of being used."))

In [5]:
#Build the sheet-description table from the collected records.
#Assigning to the existing key keeps this sheet in its original (first) position, and
#rebuilding from the list (instead of concatenating onto the previous frame) means that
#re-running this cell does not duplicate the rows.
excel_dfs["Excel Sheets Descriptions"] = pd.DataFrame(
    excel_sheets_list, columns=['Excel Sheet', 'Description'])

#Extra characters added to the header length when sizing a column.
HEADER_PADDING = 10

#The context manager saves and closes the workbook even if writing a sheet fails.
with pd.ExcelWriter("5X15 Results Cleaned.xlsx", engine='xlsxwriter') as writer:

    # loop through `dict` of dataframes (sheet name -> frame)
    for sheet, sheet_df in excel_dfs.items():
        sheet_df.to_excel(writer, sheet_name=sheet, index=False)
        worksheet = writer.sheets[sheet]

        #Saves Dataframe with filters turned on (header row through last data row);
        #skipped for a frame without columns, where the last column index would be -1.
        n_rows, n_cols = sheet_df.shape
        if n_cols:
            worksheet.autofilter(0, 0, n_rows, n_cols - 1)

        #Adjusts sizes of columns for easier reading: wide enough for the longest value,
        #and never narrower than the header plus padding.
        for col_idx, column in enumerate(sheet_df.columns):
            #default=0 covers sheets with no rows, where a pandas .max() would return NaN.
            longest_value = max((len(str(value)) for value in sheet_df.iloc[:, col_idx]), default=0)
            column_length = max(longest_value, len(str(column)) + HEADER_PADDING)
            worksheet.set_column(col_idx, col_idx, column_length)