#### Purpose of the notebook
To restrict the Georgia BRFSS dataset to a theoretically  subset of questions relevant to the analysis of diabetes prevalence, preventive healthcare access, and socioeconomic inequality 

### Impor the required Libaries

In [1]:
# Core data handling
import pandas as pd
import numpy as np

# Reproducibility
np.random.seed(42)

# Display settings
pd.set_option("display.max_columns", None)
pd.set_option("display.float_format", "{:.3f}".format)

#### Load cleaned Gergia Dataset

In [2]:
# Load Georgia-only scaffold dataset
df = pd.read_csv('brfss_scaffold_clean.csv')
df.head(2)

Unnamed: 0,Year,Locationabbr,Locationdesc,Topic,Question,Break_Out_Category,Break_Out,Sample_Size,Data_value
0,2024,GA,Georgia,Depression,Ever told you that you have a form of depression?,Overall,Overall,1297,18.5
1,2024,GA,Georgia,Depression,Ever told you that you have a form of depression?,Overall,Overall,5923,81.6


In [3]:
# Key columns
df.columns

Index(['Year', 'Locationabbr', 'Locationdesc', 'Topic', 'Question',
       'Break_Out_Category', 'Break_Out', 'Sample_Size', 'Data_value'],
      dtype='object')

#### Identify Diabates related question

In [4]:
# Identify diabetes-related questions
df['Question'].unique()

array(['Ever told you that you have a form of depression?',
       'What is your age?',
       'Are you blind or do you have serious difficulty seeing, even when wearing glasses?',
       'Ever told you have COPD?', 'Ever told you have kidney disease?',
       'Ever told you had any other types of cancer?',
       'Ever told you had skin cancer?',
       'About how long has it been since you last visited a doctor for a routine checkup?',
       'Ever told you had angina or coronary heart disease?',
       'Ever told you had a heart attack (myocardial infarction)?',
       'Ever told you had a stroke?', 'Adults who reported being deaf',
       'Do you have serious difficulty concentrating, remembering, or making decisions?',
       'Have you ever been told by a doctor that you have diabetes?',
       'Do you have difficulty doing errands alone?',
       'Do you have difficulty dressing or bathing?',
       'Do you have serious difficulty walking or climbing stairs?',
       'Adults who 

#### Define the Relevant questions

- Outcome variable (Diabetes)

In [5]:
diabetes_question = [
    "Have you ever been told by a doctor that you have diabetes?"
]

- Preventive Healthcare Access (Treatment Variables) 

In [6]:
#Preventive questions
prevention_questions = [
    "About how long has it been since you last visited a doctor for a routine checkup?",
    "Do you have any kind of health care coverage?",
    "Adults aged 18-64 who have any kind of health care coverage (variable calculated from one or more BRFSS questions)"
]

- Socioeconomic Status (Inequality Stratifiers)

In [7]:
# Social Economic questions
ses_questions = [
    "What is your annual household income?",
    "What is the highest grade or year of school you completed?",
    "What is your race/ethnicity?",
    "Sex of respondent"
]


- Risk Factors for Machine Learning and Interpretation

In [8]:
#Risk Factor questions
risk_factor_questions = [
    "Weight classification by Body Mass Index (BMI) (variable calculated from one or more BRFSS questions)",
    "Adults who are current smokers (variable calculated from one or more BRFSS questions)",
    "During the past month, did you participate in any physical activities? (variable calculated from one or more BRFSS questions)",
    "Adults who have been told they have high blood pressure (variable calculated from one or more BRFSS questions)",
    "Adults who have had their blood cholesterol checked and have been told it was high (variable calculated from one or more BRFSS questions)"
]


#### Creating Individual data frames

In [9]:
# Outcome Group (Diabetes)
df_diabetes = df[df['Question'].isin(diabetes_question)].copy()
df_diabetes["Question_Role"] = "Outcome (Diabetes)"

#  Prevention Group (Healthcare Access)
df_prevention = df[df["Question"].isin(prevention_questions)].copy()
df_prevention["Question_Role"] = "Prevention (Treatment)"

# Socioeconomic Status (SES)
df_ses = df[df["Question"].isin(ses_questions)].copy()
df_ses["Question_Role"] = "Socioeconomic Status"

# Risk Factors (for ML)
df_risk = df[df["Question"].isin(risk_factor_questions)].copy()
df_risk["Question_Role"] = "Risk Factor"

#### Merging the data frames

In [10]:
# Combine all individual dataframes into one master analysis set
df_relevant = pd.concat([df_diabetes, df_prevention, df_ses, df_risk], ignore_index=True)

# Important: Convert data values to numbers now while they are in one place
df_relevant["Data_value"] = pd.to_numeric(df_relevant["Data_value"], errors='coerce')

# Drop rows that don't have a numerical value (cannot be used for analysis)
df_relevant = df_relevant.dropna(subset=["Data_value"])

print(f"Dataset successfully merged. Total usable rows: {df_relevant.shape[0]:,}")

Dataset successfully merged. Total usable rows: 19,594


#### Verify the groups

In [11]:
# Check that all roles exist and have data
role_check = df_relevant.groupby("Question_Role").agg(
    Record_Count=('Data_value', 'count'),
    Average_Value=('Data_value', 'mean')
).round(2)

display(role_check)

Unnamed: 0_level_0,Record_Count,Average_Value
Question_Role,Unnamed: 1_level_1,Unnamed: 2_level_1
Outcome (Diabetes),1605,40.27
Prevention (Treatment),4020,40.31
Risk Factor,6069,43.48
Socioeconomic Status,7900,26.86


#### Check the distrivution of newly assigned roles

In [12]:
# Check the distribution of your newly assigned roles
print(df_relevant["Question_Role"].value_counts())

Question_Role
Socioeconomic Status      7900
Risk Factor               6069
Prevention (Treatment)    4020
Outcome (Diabetes)        1605
Name: count, dtype: int64


#### Validation

In [13]:
# Run this to see the exact strings used in the dataset
print(df_relevant['Question'].unique()[:20])

['Have you ever been told by a doctor that you have diabetes?'
 'About how long has it been since you last visited a doctor for a routine checkup?'
 'Adults aged 18-64 who have any kind of health care coverage (variable calculated from one or more BRFSS questions)'
 'Do you have any kind of health care coverage?'
 'What is the highest grade or year of school you completed?'
 'What is your annual household income?' 'What is your race/ethnicity?'
 'Sex of respondent'
 'Weight classification by Body Mass Index (BMI) (variable calculated from one or more BRFSS questions)'
 'Adults who are current smokers (variable calculated from one or more BRFSS questions)'
 'During the past month, did you participate in any physical activities? (variable calculated from one or more BRFSS questions)'
 'Adults who have had their blood cholesterol checked and have been told it was high (variable calculated from one or more BRFSS questions)'
 'Adults who have been told they have high blood pressure (variabl

In [14]:
# Check how many rows were found for each specific category
# This ensures no question was "lost" due to a typo
question_counts = df_relevant['Question'].value_counts()

print(f"Total rows extracted: {df_relevant.shape[0]:,}")
print("\nRows found per question:")
print(question_counts)

Total rows extracted: 19,594

Rows found per question:
Question
What is your annual household income?                                                                                                        2425
What is your race/ethnicity?                                                                                                                 2202
Weight classification by Body Mass Index (BMI) (variable calculated from one or more BRFSS questions)                                        2167
What is the highest grade or year of school you completed?                                                                                   2034
About how long has it been since you last visited a doctor for a routine checkup?                                                            1941
Have you ever been told by a doctor that you have diabetes?                                                                                  1605
During the past month, did you participate in any physical a

#### Renaming the Variable Names

In [15]:
# Mapping Dictionary: Long Question -> Short Variable Name
rename_map = {
    # Outcome
    "Have you ever been told by a doctor that you have diabetes?": "diabetes_status",
    
    # Preventive Healthcare
    "About how long has it been since you last visited a doctor for a routine checkup?": "last_checkup",
    "Do you have any kind of health care coverage?": "health_insurance",
    "Adults aged 18-64 who have any kind of health care coverage (variable calculated from one or more BRFSS questions)": "insurance_18_64",
    
    # Socioeconomic Status
    "What is your annual household income?": "income_level",
    "What is the highest grade or year of school you completed?": "education_level",
    "What is your race/ethnicity?": "race_ethnicity",
    "Sex of respondent": "gender",
    
    # Risk Factors
    "Weight classification by Body Mass Index (BMI) (variable calculated from one or more BRFSS questions)": "bmi_category",
    "Adults who are current smokers (variable calculated from one or more BRFSS questions)": "smoking_status",
    "During the past month, did you participate in any physical activities? (variable calculated from one or more BRFSS questions)": "physical_activity",
    "Adults who have been told they have high blood pressure (variable calculated from one or more BRFSS questions)": "high_blood_pressure",
    "Adults who have had their blood cholesterol checked and have been told it was high (variable calculated from one or more BRFSS questions)": "high_cholesterol"
}

# Apply the mapping to a new column
df_relevant["Variable_Name"] = df_relevant["Question"].map(rename_map)

In [16]:
df_relevant.head(3)

Unnamed: 0,Year,Locationabbr,Locationdesc,Topic,Question,Break_Out_Category,Break_Out,Sample_Size,Data_value,Question_Role,Variable_Name
0,2024,GA,Georgia,Diabetes,Have you ever been told by a doctor that you h...,Overall,Overall,1242,12.8,Outcome (Diabetes),diabetes_status
1,2024,GA,Georgia,Diabetes,Have you ever been told by a doctor that you h...,Overall,Overall,46,0.6,Outcome (Diabetes),diabetes_status
2,2024,GA,Georgia,Diabetes,Have you ever been told by a doctor that you h...,Overall,Overall,5779,84.1,Outcome (Diabetes),diabetes_status


#### Verifying the New schema

In [17]:
# Create a clean lookup table of your new variable names
schema_check = df_relevant[["Question_Role", "Variable_Name", "Question"]].drop_duplicates()
display(schema_check)

Unnamed: 0,Question_Role,Variable_Name,Question
0,Outcome (Diabetes),diabetes_status,Have you ever been told by a doctor that you h...
1605,Prevention (Treatment),last_checkup,About how long has it been since you last visi...
1699,Prevention (Treatment),insurance_18_64,Adults aged 18-64 who have any kind of health ...
2663,Prevention (Treatment),health_insurance,Do you have any kind of health care coverage?
5625,Socioeconomic Status,education_level,What is the highest grade or year of school yo...
5708,Socioeconomic Status,income_level,What is your annual household income?
5824,Socioeconomic Status,race_ethnicity,What is your race/ethnicity?
5913,Socioeconomic Status,gender,Sex of respondent
13525,Risk Factor,bmi_category,Weight classification by Body Mass Index (BMI)...
13613,Risk Factor,smoking_status,Adults who are current smokers (variable calcu...


#### Final Numeric check

In [18]:
df_relevant["Data_value"] = pd.to_numeric(df_relevant["Data_value"], errors='coerce')
df_relevant = df_relevant.dropna(subset=["Data_value"])
df_relevant.shape

(19594, 11)

In [19]:
# Validation: Check if the filter returned rows
if diabetes_df.empty:
    print("Warning: No records found for the specified diabetes question. Check for exact string matches.")
else:
    print(f"Successfully isolated {len(diabetes_df):,} records for diabetes analysis.")

NameError: name 'diabetes_df' is not defined

In [None]:
# Inspect the response categories for the selected question
print("Available response categories:")
print(diabetes_df['Break_Out'].unique())

In [None]:
# Grouping prevention and access questions for thematic analysis
prevention_questions = [
    "About how long has it been since you last visited a doctor for a routine checkup?",
    "Do you have any kind of health care coverage?",
    "Adults aged 18-64 who have any kind of health care coverage (variable calculated from one or more BRFSS questions)"
]

# Filtering the dataset
prevention_df =df[df['Question'].isin(prevention_questions)].copy()

In [None]:
# Create a mapping for cleaner visualizations later
question_map = {
    "About how long has it been since you last visited a doctor for a routine checkup?": "Last_Checkup",
    "Do you have any kind of health care coverage?": "Has_Coverage_Raw",
    "Adults aged 18-64 who have any kind of health care coverage (variable calculated from one or more BRFSS questions)": "Has_Coverage_Calculated"
}

# Apply a short-name column if pivoting the data later
prevention_df['Question_Short'] = prevention_df['Question'].map(question_map)

In [None]:
# Create a clean label for every question in your relevant list
clean_label_map = {
    # Outcome
    "Have you ever been told by a doctor that you have diabetes?": "diabetes_status",
    
    # Risk Factors
    "Weight classification by Body Mass Index (BMI) (variable calculated from one or more BRFSS questions)": "bmi_category",
    "Adults who are current smokers (variable calculated from one or more BRFSS questions)": "smoking_status",
    "Adults who have been told they have high blood pressure (variable calculated from one or more BRFSS questions)": "high_bp",
    
    # SES / Prevention
    "What is your annual household income?": "income_level",
    "Do you have any kind of health care coverage?": "healthcare_access",
    "During the past month, did you participate in any physical activities? (variable calculated from one or more BRFSS questions)": "physical_activity"
}

# Apply the clean labels to a new column
df_relevant["Clean_Question"] = df_relevant["Question"].map(clean_label_map).fillna("other_metric")

#### Save the dataset for Analysis

In [20]:
#Save as CSV 
df_relevant.to_csv("brfss_diabetes_cleaned.csv", index=False)

print("Success! Dataset saved for the next session.")

Success! Dataset saved for the next session.
