In [None]:
#Data Cleaning and Processing
#Environment Setup (For Mac Users)

#Required Libraries
#pandas numpy matplotlib seaborn scipy

In [2]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
from google.colab import files

# Load the raw Behavioral Risk Factor Surveillance System export into `df`.
# The location is held in a variable so it is easy to spot and change.
raw_csv_path = '/content/Nutrition__Physical_Activity__and_Obesity_-_Behavioral_Risk_Factor_Surveillance_System.csv'
df = pd.read_csv(raw_csv_path)


# data cleaning
# Focusing on Obesity / Weight Status and Education over a 12-year span, only in the states, with age groups
# limiting data - too much data
# removing duplicates
# removing outliers
# Identifying missing values
# remove that value
# Renaming columns
# Removing white space, converting to lowercase for string type columns





# Cleaning pipeline: raw `df` -> tidy `new_df`, saved as cleaned_data.csv.

# Keep only rows whose Topic mentions weight status or obesity
# (case-insensitive; rows with a missing Topic are excluded by na=False).
# NOTE(review): if this Topic covers overweight questions as well as obesity
# questions, the values renamed to 'Obesity_Rate' below mix both measures --
# confirm against the dataset and filter on the question column if needed.
new_df = df[df['Topic'].str.contains('Weight|Obesity', case=False, na=False)]

# Keep only rows that carry an education level (drops NaN and blank strings).
new_df = new_df[new_df['Education'].notna() & (new_df['Education'].str.strip() != "")]

# Keep the year, location, measured value and education columns.
# The explicit copy makes new_df an independent frame, so the column
# assignments further down never write into a slice of `df`
# (this is what triggers pandas' SettingWithCopyWarning).
new_df = new_df[['YearEnd', 'LocationDesc', 'Data_Value', 'Education']].copy()

# Rename to analysis-friendly column names. Only the four columns selected
# above exist at this point, so only those are mapped.
new_df = new_df.rename(columns={
    'YearEnd': 'Year',
    'LocationDesc': 'State',
    'Education': 'Education_Level',
    'Data_Value': 'Obesity_Rate',
})

# Normalize text columns (trim whitespace, lowercase) BEFORE de-duplicating,
# so rows that differ only in casing or padding are recognized as duplicates.
for col in new_df.select_dtypes(include=['object']).columns:
    new_df[col] = new_df[col].str.strip().str.lower()

# Remove exact duplicate rows (now compared on the normalized values).
new_df = new_df.drop_duplicates()

# Remove outliers with the 1.5 * IQR rule on Obesity_Rate.
# n / n2 are the first and third quartiles.
n = new_df['Obesity_Rate'].quantile(0.25)
n2 = new_df['Obesity_Rate'].quantile(0.75)
IQR = n2 - n
lower = n - 1.5 * IQR
upper = n2 + 1.5 * IQR
# Comparisons against NaN are False, so rows with a missing Obesity_Rate are
# dropped by this filter as well.
new_df = new_df[(new_df['Obesity_Rate'] >= lower) & (new_df['Obesity_Rate'] <= upper)]

# Persist the cleaned data next to the notebook (row index is not written).
cleaned_file_path = "cleaned_data.csv"
new_df.to_csv(cleaned_file_path, index=False)

print(f"Cleaned data saved to: {cleaned_file_path}")


Cleaned data saved to: cleaned_data.csv


In [3]:
#EDA

# First, explore the structure of the cleaned dataset: column names, dtypes
# and non-null counts. DataFrame.info() prints its report itself and returns
# None, so it is called directly; wrapping it in print() would append a stray
# "None" line to the output.
new_df.info()

# View the first rows of the data
print(new_df.head())


<class 'pandas.core.frame.DataFrame'>
Index: 5344 entries, 14 to 104257
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Year             5344 non-null   int64  
 1   State            5344 non-null   object 
 2   Obesity_Rate     5344 non-null   float64
 3   Education_Level  5344 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 208.8+ KB
None
     Year   State  Obesity_Rate                   Education_Level
14   2011  alaska          25.8  some college or technical school
91   2011  alaska          28.8             less than high school
95   2011  alaska          33.1              high school graduate
96   2011  alaska          38.0              high school graduate
120  2011  alaska          35.3             less than high school


In [4]:
# Per-state summary statistics of Obesity_Rate: mean, median and mode,
# computed across every row of the cleaned data for each state.


def _first_mode(values):
    """Return the first (smallest) mode of `values`, or None if there is none.

    Series.mode() returns all modes in sorted order, so ties are resolved in
    favor of the smallest value. It is computed once per group here.
    """
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else None


# Named aggregation sets the output column names directly, instead of renaming
# the label pandas auto-generates for an anonymous lambda.
mean_med_mode_each_state = new_df.groupby('State')['Obesity_Rate'].agg(
    mean='mean',
    median='median',
    mode=_first_mode,
)

# Save the per-state table to CSV (the State index is written as the first column).
mean_med_mode_each_state.to_csv("state_obesity_statistics.csv")

# Show the first few rows
print(mean_med_mode_each_state.head())



                 mean  median  mode
State                              
alabama     34.772115   34.40  34.4
alaska      33.097030   33.40  30.7
arizona     33.174000   34.05  32.8
arkansas    34.499029   34.20  33.2
california  33.721978   34.70  35.6
