# Cleaning the Data

In [1]:
import pandas as pd

In [2]:
# Load the dataset
# The path is relative, so it resolves against the notebook's working directory.
file_path = '../Resources/U.S._Chronic_Disease_Indicators.csv'
# Read the whole CSV into a DataFrame using pandas' default parsing options.
data = pd.read_csv(file_path)

In [3]:
# Preview the first rows of the raw frame to sanity-check the load
data.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,TopicID,QuestionID,ResponseID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2019,2019,AR,Arkansas,BRFSS,Diabetes,Diabetes among adults,,%,Crude Prevalence,...,DIA,DIA01,,CRDPREV,SEX,SEXM,,,,
1,2019,2019,ID,Idaho,BRFSS,Diabetes,Diabetes among adults,,%,Crude Prevalence,...,DIA,DIA01,,CRDPREV,SEX,SEXM,,,,
2,2019,2019,IN,Indiana,YRBSS,Sleep,Short sleep duration among high school students,,%,Crude Prevalence,...,SLEP,SLP02,,CRDPREV,GRADE,GRD12,,,,
3,2019,2019,IA,Iowa,NVSS,Asthma,"Asthma mortality among all people, underlying ...",,Number,Number,...,AST,AST01,,NMBR,OVERALL,OVR,,,,
4,2019,2019,IA,Iowa,BRFSS,Asthma,Current asthma among adults,,%,Crude Prevalence,...,AST,AST02,,CRDPREV,AGE,AGE1844,,,,


In [4]:
# Summarize column names, non-null counts, and dtypes to spot empty or sparse columns
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309215 entries, 0 to 309214
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   YearStart                  309215 non-null  int64  
 1   YearEnd                    309215 non-null  int64  
 2   LocationAbbr               309215 non-null  object 
 3   LocationDesc               309215 non-null  object 
 4   DataSource                 309215 non-null  object 
 5   Topic                      309215 non-null  object 
 6   Question                   309215 non-null  object 
 7   Response                   0 non-null       float64
 8   DataValueUnit              309215 non-null  object 
 9   DataValueType              309215 non-null  object 
 10  DataValue                  209196 non-null  float64
 11  DataValueAlt               209196 non-null  float64
 12  DataValueFootnoteSymbol    101716 non-null  object 
 13  DataValueFootnote          10

In [5]:
# Columns that the cleaned dataset does not keep.
# NOTE(review): this list was originally described as "columns with 0 non-null
# values", but it also names fully populated columns (e.g. YearEnd,
# LocationAbbr, DataValueUnit per data.info()), so it is a broader prune of
# unused fields — confirm that dropping them is intended.
columns_to_drop = [
    'Response',
    'StratificationCategory2',
    'Stratification2',
    'StratificationCategory3',
    'Stratification3',
    'ResponseID',
    'StratificationCategoryID2',
    'StratificationID2',
    'StratificationCategoryID3',
    'StratificationID3',
    'YearEnd',
    'LocationAbbr',
    'DataValueUnit',
    'DataValueAlt',
    'DataValueFootnoteSymbol',
    'DataValueFootnote',
    'StratificationID1',
    'StratificationCategoryID1',
]

# Narrow the list to names actually present in the frame so the drop below
# cannot raise KeyError (this also makes the cell safe to re-run).
columns_to_drop = [name for name in columns_to_drop if name in data.columns]

# Remove the surviving names from the frame in place
data.drop(columns=columns_to_drop, inplace=True)

print("Dropped unnecessary columns successfully!")


Dropped unnecessary columns successfully!


In [6]:
# Fill missing 'Geolocation' with 'Unknown'.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on data['Geolocation']: that form mutates an
# intermediate Series (chained assignment), which emits a FutureWarning in
# pandas 2.x and leaves `data` unchanged once Copy-on-Write is in effect.
data['Geolocation'] = data['Geolocation'].fillna('Unknown')

In [7]:
# Drop duplicate rows
# With no arguments, rows are compared across all remaining columns and the
# first occurrence of each duplicate group is kept. The original index labels
# are retained (not renumbered) in the rebound frame.
data = data.drop_duplicates()

In [8]:
# Rename columns: give the year and location fields shorter names.
# Both renames are applied in a single call with one mapping.
data.rename(columns={'YearStart': 'Year', 'LocationDesc': 'State'}, inplace=True)

In [9]:
# Stratification categories that each become their own column
StratificationCategory1 = ['Sex', 'Age', 'Race/Ethnicity', 'Grade', 'Overall']

# Grab the two source columns once; they are removed after the loop
category_labels = data['StratificationCategory1']
stratum_values = data['Stratification1']

# Spread the single (category, value) pair into one column per category:
# a row keeps its Stratification1 value in the column matching its category
# and gets 'Unknown' in every other category column.
for category in StratificationCategory1:
    matches_category = category_labels == category
    data[category] = stratum_values.where(matches_category).fillna('Unknown')

# The original pair of columns is now redundant
data.drop(columns=['StratificationCategory1', 'Stratification1'], inplace=True)


In [10]:
# Remove the ID helper columns and the 'Overall' column from the frame.
# No existence filter is applied here, so every listed column must be present
# (re-running this cell on an already-cleaned frame raises KeyError).
data.drop(
    columns=['TopicID', 'QuestionID', 'Overall', 'LocationID', 'DataValueTypeID'],
    inplace=True,
)

In [11]:
# Preview the first rows of the frame in its current (cleaned) state
data.head()

Unnamed: 0,Year,State,DataSource,Topic,Question,DataValueType,DataValue,LowConfidenceLimit,HighConfidenceLimit,Geolocation,Sex,Age,Race/Ethnicity,Grade
0,2019,Arkansas,BRFSS,Diabetes,Diabetes among adults,Crude Prevalence,13.6,12.1,15.4,POINT (-92.27449074299966 34.74865012400045),Male,Unknown,Unknown,Unknown
1,2019,Idaho,BRFSS,Diabetes,Diabetes among adults,Crude Prevalence,10.6,9.1,12.2,POINT (-114.3637300419997 43.682630005000476),Male,Unknown,Unknown,Unknown
2,2019,Indiana,YRBSS,Sleep,Short sleep duration among high school students,Crude Prevalence,,,,POINT (-86.14996019399968 39.766910452000445),Unknown,Unknown,Unknown,Grade 12
3,2019,Iowa,NVSS,Asthma,"Asthma mortality among all people, underlying ...",Number,54.0,,,POINT (-93.81649055599968 42.46940091300047),Unknown,Unknown,Unknown,Unknown
4,2019,Iowa,BRFSS,Asthma,Current asthma among adults,Crude Prevalence,10.3,9.1,11.7,POINT (-93.81649055599968 42.46940091300047),Unknown,Age 18-44,Unknown,Unknown


In [12]:
# Save the dataset as a CSV file
data.to_csv('../Resources/VM_cleaned_dataset.csv', index=False)
