# Cleaning the Data

In [43]:
import pandas as pd

In [44]:
# Load the dataset
file_path = r"C:\Users\info\Desktop\project3team11\Resources\U.S._Chronic_Disease_Indicators.csv"
data = pd.read_csv(file_path)

In [45]:
data.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,TopicID,QuestionID,ResponseID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
0,2019,2019,AR,Arkansas,BRFSS,Diabetes,Diabetes among adults,,%,Crude Prevalence,...,DIA,DIA01,,CRDPREV,SEX,SEXM,,,,
1,2019,2019,ID,Idaho,BRFSS,Diabetes,Diabetes among adults,,%,Crude Prevalence,...,DIA,DIA01,,CRDPREV,SEX,SEXM,,,,
2,2019,2019,IN,Indiana,YRBSS,Sleep,Short sleep duration among high school students,,%,Crude Prevalence,...,SLEP,SLP02,,CRDPREV,GRADE,GRD12,,,,
3,2019,2019,IA,Iowa,NVSS,Asthma,"Asthma mortality among all people, underlying ...",,Number,Number,...,AST,AST01,,NMBR,OVERALL,OVR,,,,
4,2019,2019,IA,Iowa,BRFSS,Asthma,Current asthma among adults,,%,Crude Prevalence,...,AST,AST02,,CRDPREV,AGE,AGE1844,,,,


In [46]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309215 entries, 0 to 309214
Data columns (total 34 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   YearStart                  309215 non-null  int64  
 1   YearEnd                    309215 non-null  int64  
 2   LocationAbbr               309215 non-null  object 
 3   LocationDesc               309215 non-null  object 
 4   DataSource                 309215 non-null  object 
 5   Topic                      309215 non-null  object 
 6   Question                   309215 non-null  object 
 7   Response                   0 non-null       float64
 8   DataValueUnit              309215 non-null  object 
 9   DataValueType              309215 non-null  object 
 10  DataValue                  209196 non-null  float64
 11  DataValueAlt               209196 non-null  float64
 12  DataValueFootnoteSymbol    101716 non-null  object 
 13  DataValueFootnote          10

In [47]:
# Drop columns with 0 non-null values
# Drop only the columns that exist in the dataset
columns_to_drop = [
    'Response', 'StratificationCategory2', 'Stratification2', 'StratificationCategory3', 'Stratification3', 
    'ResponseID', 'StratificationCategoryID2', 'StratificationID2','StratificationCategoryID3', 
    'StratificationID3', 'YearStart', 
    'DataValueFootnoteSymbol', 'DataValueFootnote', 'StratificationID1', 'StratificationCategoryID1'
    
]

# Filter columns_to_drop to include only existing columns
columns_to_drop = [col for col in columns_to_drop if col in data.columns]

# Drop the filtered list of columns
data.drop(columns=columns_to_drop, inplace=True)

print("Dropped unnecessary columns successfully!")


Dropped unnecessary columns successfully!


In [48]:
# Fill missing 'Geolocation' with 'Unknown'
data['Geolocation'].fillna('Unknown', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Geolocation'].fillna('Unknown', inplace=True)


In [49]:
# Extract longitude and latitude into separate columns
data[['Longitude', 'Latitude']] = data['Geolocation'].str.extract(r'POINT \(([-\d.]+) ([-\d.]+)\)')
#Rename Columns
data.rename(columns={'YearEnd': 'Year'}, inplace=True)
data.rename(columns={'LocationDesc': 'Location'}, inplace=True)
data.rename(columns={'LocationAbbr': 'LocationID'}, inplace=True)
data.rename(columns={'DataValue': 'Value'}, inplace=True)

In [50]:
# Drop duplicate rows
data = data.drop_duplicates()

In [51]:
# List of stratification categories
StratificationCategory1 = ['Sex', 'Age', 'Race/Ethnicity', 'Grade', 'Overall']

# Pivot the data: Create separate columns for each StratificationCategory
for category in StratificationCategory1:
    # Assign values from Stratification1 where StratificationCategory1 matches the current category
    data[category] = data['Stratification1'].where(data['StratificationCategory1'] == category)

# Drop the original columns if no longer needed
data.drop(columns=['StratificationCategory1', 'Stratification1'], inplace=True)

# Fill missing values in the new columns with 'Unknown' (optional)
data[StratificationCategory1] = data[StratificationCategory1].fillna('Unknown')

In [52]:
#Drop 'Overall' Column
data.drop(columns=[ 'Overall','DataValueAlt'], inplace=True)

In [53]:
# Find the unique values in the DataValueUnit column
unique_units = data['DataValueUnit'].unique()

# Print the unique units
print(unique_units)

# Add a new column for each unique unit type, excluding "Number" in the formatted cells
unique_units = data['DataValueUnit'].unique()

# Create new columns
for unit in unique_units:
    data[unit] = data.apply(
        lambda row: f"{row['Value']} {unit}" if row['DataValueUnit'] == unit and unit != 'Number' else row['Value'] if row['DataValueUnit'] == unit else None,
        axis=1
    )
# View the updated DataFrame
print(data.head())

['%' 'Number' 'Years' 'per 100,000' 'cases per 100,000' 'cases per 1,000'
 'cases per 1,000,000' 'gallons']
   Year LocationID  Location DataSource     Topic  \
0  2019         AR  Arkansas      BRFSS  Diabetes   
1  2019         ID     Idaho      BRFSS  Diabetes   
2  2019         IN   Indiana      YRBSS     Sleep   
3  2019         IA      Iowa       NVSS    Asthma   
4  2019         IA      Iowa      BRFSS    Asthma   

                                            Question DataValueUnit  \
0                              Diabetes among adults             %   
1                              Diabetes among adults             %   
2    Short sleep duration among high school students             %   
3  Asthma mortality among all people, underlying ...        Number   
4                        Current asthma among adults             %   

      DataValueType  Value  LowConfidenceLimit  ...  Race/Ethnicity     Grade  \
0  Crude Prevalence   13.6                12.1  ...         Unknown   U

In [54]:
print(data.columns)

Index(['Year', 'LocationID', 'Location', 'DataSource', 'Topic', 'Question',
       'DataValueUnit', 'DataValueType', 'Value', 'LowConfidenceLimit',
       'HighConfidenceLimit', 'Geolocation', 'LocationID', 'TopicID',
       'QuestionID', 'DataValueTypeID', 'Longitude', 'Latitude', 'Sex', 'Age',
       'Race/Ethnicity', 'Grade', '%', 'Number', 'Years', 'per 100,000',
       'cases per 100,000', 'cases per 1,000', 'cases per 1,000,000',
       'gallons'],
      dtype='object')


In [55]:
# Define the new column order
new_column_order = ['Year','Sex', 'Age', 'Grade', 'Race/Ethnicity', 'DataSource',
                    'Location','LocationID','Geolocation', 'Longitude', 'Latitude', 
                    'Topic', 'TopicID', 'Question', 'QuestionID', 'Value', 'DataValueUnit',
                    'DataValueType', '%', 'Number', 'Years', 'per 100,000', 'cases per 100,000', 'cases per 1,000', 'cases per 1,000,000', 'gallons', 'DataValueTypeID', 'LowConfidenceLimit', 'HighConfidenceLimit' ]
# Reorder the DataFrame
data_reordered = data[new_column_order]

# Columns sorted by data source type
data = data_reordered.sort_values(by=['DataSource'], ascending=[True])
data.reset_index(drop=True, inplace=True)

In [56]:
# Review the final data set
print(data)

        Year      Sex      Age     Grade  \
0       2019  Unknown  Unknown   Unknown   
1       2019     Male  Unknown   Unknown   
2       2021  Unknown  Unknown   Unknown   
3       2019  Unknown  Unknown   Unknown   
4       2021  Unknown  Unknown   Unknown   
...      ...      ...      ...       ...   
309210  2021  Unknown  Unknown   Unknown   
309211  2021  Unknown  Unknown   Unknown   
309212  2021  Unknown  Unknown  Grade 12   
309213  2019     Male  Unknown   Unknown   
309214  2019  Unknown  Unknown   Unknown   

                                        Race/Ethnicity DataSource  \
0                                             Hispanic        ACS   
1                                              Unknown        ACS   
2                                  Black, non-Hispanic        ACS   
3                                              Unknown        ACS   
4                                  Black, non-Hispanic        ACS   
...                                                ...   

In [59]:
# Save the dataset as a CSV file
file_path = r'C:\Users\info\Desktop\project3team11\Resources\updated_final_cleaned_dataset.csv'
data.to_csv(file_path, index=False)  
