# Cleaning the Data

In [1]:
import pandas as pd

In [2]:
# Load the raw U.S. Chronic Disease Indicators CSV into a DataFrame.
# The path is relative, so it resolves against the notebook's working directory.
file_path = '../Resources/U.S._Chronic_Disease_Indicators.csv'
data = pd.read_csv(file_path)

In [3]:
# Preview the first rows of the freshly loaded data.
data.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,TopicID,QuestionID,ResponseID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2019,2019,AR,Arkansas,BRFSS,Diabetes,Diabetes among adults,,%,Crude Prevalence,...,DIA,DIA01,,CRDPREV,SEX,SEXM,,,,
1,2019,2019,ID,Idaho,BRFSS,Diabetes,Diabetes among adults,,%,Crude Prevalence,...,DIA,DIA01,,CRDPREV,SEX,SEXM,,,,
2,2019,2019,IN,Indiana,YRBSS,Sleep,Short sleep duration among high school students,,%,Crude Prevalence,...,SLEP,SLP02,,CRDPREV,GRADE,GRD12,,,,
3,2019,2019,IA,Iowa,NVSS,Asthma,"Asthma mortality among all people, underlying ...",,Number,Number,...,AST,AST01,,NMBR,OVERALL,OVR,,,,
4,2019,2019,IA,Iowa,BRFSS,Asthma,Current asthma among adults,,%,Crude Prevalence,...,AST,AST02,,CRDPREV,AGE,AGE1844,,,,


In [4]:
# List each column's dtype and non-null count to spot empty or sparse columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309215 entries, 0 to 309214
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   YearStart                  309215 non-null  int64  
 1   YearEnd                    309215 non-null  int64  
 2   LocationAbbr               309215 non-null  object 
 3   LocationDesc               309215 non-null  object 
 4   DataSource                 309215 non-null  object 
 5   Topic                      309215 non-null  object 
 6   Question                   309215 non-null  object 
 7   Response                   0 non-null       float64
 8   DataValueUnit              309215 non-null  object 
 9   DataValueType              309215 non-null  object 
 10  DataValue                  209196 non-null  float64
 11  DataValueAlt               209196 non-null  float64
 12  DataValueFootnoteSymbol    101716 non-null  object 
 13  DataValueFootnote          10

In [5]:
# Columns to discard before cleaning. Some are completely empty in this export
# (e.g. 'Response' has 0 non-null values); others, such as 'YearStart' and the
# footnote columns, do contain data and are removed by choice.
drop_candidates = [
    'Response',
    'StratificationCategory2',
    'Stratification2',
    'StratificationCategory3',
    'Stratification3',
    'ResponseID',
    'StratificationCategoryID2',
    'StratificationID2',
    'StratificationCategoryID3',
    'StratificationID3',
    'YearStart',
    'DataValueFootnoteSymbol',
    'DataValueFootnote',
    'StratificationID1',
    'StratificationCategoryID1',
]

# Keep only the names actually present, so the drop never raises when a
# candidate column is absent from the loaded file.
columns_to_drop = [name for name in drop_candidates if name in data.columns]

# Remove the surviving candidates from the working frame.
data = data.drop(columns=columns_to_drop)

print("Dropped unnecessary columns successfully!")


Dropped unnecessary columns successfully!


In [6]:
# Replace missing 'Geolocation' entries with the placeholder 'Unknown'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: that chained in-place form
# emits a FutureWarning in pandas 2.x and leaves the DataFrame unchanged
# when Copy-on-Write is enabled.
data['Geolocation'] = data['Geolocation'].fillna('Unknown')

In [7]:
# Split "POINT (<first> <second>)" strings into two columns: the first number
# becomes 'Longitude' and the second 'Latitude'. Values that do not match the
# pattern (such as the 'Unknown' placeholder) yield missing entries in both.
# The extracted values are kept as text, not converted to numbers.
point_pattern = r'POINT \(([-\d.]+) ([-\d.]+)\)'
coordinates = data['Geolocation'].str.extract(point_pattern)
data[['Longitude', 'Latitude']] = coordinates


In [8]:
# Drop rows that are exact duplicates across every remaining column
# (the first occurrence of each is kept) and rebind `data` to the result.
data = data.drop_duplicates()

In [9]:
# Give the remaining columns shorter names; one rename call covers all four.
data = data.rename(
    columns={
        'YearEnd': 'Year',
        'LocationDesc': 'Location',
        'LocationAbbr': 'LocationID',
        'DataValue': 'Value',
    }
)

In [10]:
# Stratification categories that each get their own column.
StratificationCategory1 = ['Sex', 'Age', 'Race/Ethnicity', 'Grade', 'Overall']

# Row-level category name and the label recorded for that category.
row_category = data['StratificationCategory1']
row_label = data['Stratification1']

# Spread the single (category, label) pair into one column per category:
# a row keeps its label in the column of its own category and gets the
# placeholder 'Unknown' in every other category column (and wherever the
# label itself is missing).
for category in StratificationCategory1:
    belongs_to_category = row_category.eq(category)
    data[category] = row_label.where(belongs_to_category).fillna('Unknown')

# The original pair of columns is now redundant.
data = data.drop(columns=['StratificationCategory1', 'Stratification1'])


In [11]:
# Remove the 'Overall' column from the working frame.
data = data.drop(columns='Overall')

In [12]:
# Distinct measurement units present in the data (computed once and reused).
unique_units = data['DataValueUnit'].unique()
print(unique_units)

# Add one column per unit. A row is populated only in the column of its own
# unit and is left missing in the others:
#   * 'Number'  -> the raw 'Value'
#   * any other -> the text "<Value> <unit>"
# This uses whole-column boolean masks instead of DataFrame.apply(axis=1),
# which would call a Python function once per row for every unit.
# Note: non-matching rows hold NaN (rather than None) as the missing marker.
value_as_text = data['Value'].map(str)
for unit in unique_units:
    has_unit = data['DataValueUnit'] == unit
    if unit == 'Number':
        data[unit] = data['Value'].where(has_unit)
    else:
        data[unit] = (value_as_text + f" {unit}").where(has_unit)

# View the updated DataFrame (rich display instead of print).
data.head()

['%' 'Number' 'Years' 'per 100,000' 'cases per 100,000' 'cases per 1,000'
 'cases per 1,000,000' 'gallons']
   Year LocationID  Location DataSource     Topic  \
0  2019         AR  Arkansas      BRFSS  Diabetes   
1  2019         ID     Idaho      BRFSS  Diabetes   
2  2019         IN   Indiana      YRBSS     Sleep   
3  2019         IA      Iowa       NVSS    Asthma   
4  2019         IA      Iowa      BRFSS    Asthma   

                                            Question DataValueUnit  \
0                              Diabetes among adults             %   
1                              Diabetes among adults             %   
2    Short sleep duration among high school students             %   
3  Asthma mortality among all people, underlying ...        Number   
4                        Current asthma among adults             %   

      DataValueType  Value  DataValueAlt  ...  Race/Ethnicity     Grade  \
0  Crude Prevalence   13.6          13.6  ...         Unknown   Unknown   
1 

In [13]:
# Final column layout, grouped by theme. Selecting by this list also discards
# every column that is not named here.
new_column_order = [
    # when / who
    'Year', 'Sex', 'Age', 'Grade', 'Race/Ethnicity',
    # source
    'DataSource',
    # where
    'Location', 'LocationID', 'Geolocation', 'Longitude', 'Latitude',
    # what was measured
    'Topic', 'TopicID', 'Question', 'QuestionID',
    # measurement
    'Value', 'DataValueUnit', 'DataValueType', 'DataValueAlt', 'DataValueTypeID',
    'LowConfidenceLimit', 'HighConfidenceLimit',
]

# Build the reordered frame; a missing column raises KeyError.
data_reordered = data.loc[:, new_column_order]

In [14]:
# Order rows by data source (ascending) and renumber the index from zero.
data = data_reordered.sort_values('DataSource', ignore_index=True)

In [15]:
# Preview the first rows of the cleaned, reordered and sorted data.
data.head()

Unnamed: 0,Year,Sex,Age,Grade,Race/Ethnicity,DataSource,Location,LocationID,LocationID.1,Geolocation,...,TopicID,Question,QuestionID,Value,DataValueUnit,DataValueType,DataValueAlt,DataValueTypeID,LowConfidenceLimit,HighConfidenceLimit
0,2019,Unknown,Unknown,Unknown,Hispanic,ACS,California,CA,6,POINT (-120.99999953799971 37.63864012300047),...,SDOH,High school completion among adults aged 18-24,SDH02,87.7,%,Crude Prevalence,87.7,CRDPREV,87.0,88.4
1,2019,Male,Unknown,Unknown,Unknown,ACS,New York,NY,36,POINT (-75.54397042699964 42.82700103200045),...,SDOH,Unemployment rate among people 16 years and ol...,SDH08,4.7,%,Crude Prevalence,4.7,CRDPREV,4.4,5.0
2,2021,Unknown,Unknown,Unknown,"Black, non-Hispanic",ACS,Georgia,GA,13,POINT (-83.62758034599966 32.83968109300048),...,SDOH,Living below 150% of the poverty threshold amo...,SDH01,30.8,%,Crude Prevalence,30.8,CRDPREV,29.3,32.4
3,2019,Unknown,Unknown,Unknown,Unknown,ACS,Utah,UT,49,POINT (-111.58713063499971 39.360700171000474),...,SDOH,Unemployment rate among people 16 years and ol...,SDH08,3.0,%,Crude Prevalence,3.0,CRDPREV,2.7,3.4
4,2021,Unknown,Unknown,Unknown,"Black, non-Hispanic",ACS,Nevada,NV,32,POINT (-117.07184056399967 39.493240390000494),...,SDOH,High school completion among adults aged 18-24,SDH02,82.1,%,Crude Prevalence,82.1,CRDPREV,74.1,87.9


In [16]:
# Save the cleaned dataset as a CSV file next to the raw input.
# index=False keeps the DataFrame's row index out of the written file.
data.to_csv('../Resources/final_cleaned_dataset.csv', index=False)
