<a href="https://colab.research.google.com/github/AUSTIN-OMONDI/Austoo/blob/main/Preprocessing_Daily_Food_Nutrition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Load the Data

In [1]:
import pandas as pd

# Read the raw CSV into a DataFrame
csv_path = 'daily_food_nutrition_dataset.csv'
df = pd.read_csv(csv_path)

## 2. Inspect the Data

In [4]:
# Preview the first five rows (the default for head) to check columns and sample values
df.head()

         Date  User_ID       Food_Item Category  Calories (kcal)  Protein (g)  \
0  2024-09-11      496            Eggs     Meat              173         42.4   
1  2024-12-17      201           Apple   Fruits               66         39.2   
2  2024-06-09      776  Chicken Breast     Meat              226         27.1   
3  2024-08-27      112          Banana   Fruits              116         43.4   
4  2024-07-28      622          Banana   Fruits              500         33.9   

   Carbohydrates (g)  Fat (g)  Fiber (g)  Sugars (g)  Sodium (mg)  \
0               83.7      1.5        1.5        12.7          752   
1               13.8      3.2        2.6        12.2          680   
2               79.1     25.8        3.2        44.7          295   
3               47.1     16.1        6.5        44.1          307   
4               75.8     47.0        7.8        19.4          358   

   Cholesterol (mg)  Meal_Type  Water_Intake (ml)  
0               125      Lunch                

In [5]:
# Print column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Date               10000 non-null  object 
 1   User_ID            10000 non-null  int64  
 2   Food_Item          10000 non-null  object 
 3   Category           10000 non-null  object 
 4   Calories (kcal)    10000 non-null  int64  
 5   Protein (g)        10000 non-null  float64
 6   Carbohydrates (g)  10000 non-null  float64
 7   Fat (g)            10000 non-null  float64
 8   Fiber (g)          10000 non-null  float64
 9   Sugars (g)         10000 non-null  float64
 10  Sodium (mg)        10000 non-null  int64  
 11  Cholesterol (mg)   10000 non-null  int64  
 12  Meal_Type          10000 non-null  object 
 13  Water_Intake (ml)  10000 non-null  int64  
dtypes: float64(5), int64(5), object(4)
memory usage: 1.1+ MB


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,User_ID,Calories (kcal),Protein (g),Carbohydrates (g),Fat (g),Fiber (g),Sugars (g),Sodium (mg),Cholesterol (mg),Water_Intake (ml)
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,498.7063,327.6939,25.52305,52.56855,25.43735,4.98694,25.05257,497.969,151.8866,554.3536
std,289.123477,158.194716,14.131993,27.387152,14.14532,2.864984,14.480605,287.988001,87.360643,260.977642
min,1.0,50.0,1.0,5.0,1.0,0.0,0.0,0.0,0.0,100.0
25%,245.0,190.0,13.2,28.8,13.3,2.5,12.5,249.75,76.0,327.0
50%,492.0,328.0,25.5,52.8,25.3,5.0,25.0,495.0,153.0,555.5
75%,748.0,464.0,37.7,76.4,37.6,7.5,37.7,749.0,228.0,783.0
max,1000.0,600.0,50.0,100.0,50.0,10.0,50.0,1000.0,300.0,1000.0


## 3. Handle Missing Values

In [7]:
# Count the missing values in each column
missing_per_column = df.isnull().sum()
missing_per_column

Unnamed: 0,0
Date,0
User_ID,0
Food_Item,0
Category,0
Calories (kcal),0
Protein (g),0
Carbohydrates (g),0
Fat (g),0
Fiber (g),0
Sugars (g),0


## 4. Handle Duplicates

In [8]:
# Flag rows that repeat an earlier row across all columns, then count them
duplicate_rows = df.duplicated()
duplicate_rows.sum()

0

## 5. Convert Data Types

In [9]:
# Convert 'Category' to categorical type
df['Category'] = df['Category'].astype('category')

# 'Date' is read from the CSV as plain strings (dtype object); parse it into
# datetime64 so date-based operations work. Unparseable values raise here
# rather than passing through silently.
df['Date'] = pd.to_datetime(df['Date'])

## 6. Encode Categorical Variables

In [10]:
# One-hot encode 'Category'; drop_first=True omits the indicator column for the first level
categorical_cols = ['Category']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

## 7. Normalize or Scale Numerical Values

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Columns to standardize.
# NOTE(review): 'Calories (kcal)' is used as the target in step 9, so the
# target is standardized as well — confirm this is intended.
numerical_cols = ['Calories (kcal)', 'Protein (g)', 'Carbohydrates (g)', 'Fat (g)', 'Fiber (g)', 'Sugars (g)', 'Sodium (mg)', 'Cholesterol (mg)', 'Water_Intake (ml)']

# Fit the scaler on the training rows only, so that test-set statistics do not
# leak into the scaling parameters. test_size and random_state must stay equal
# to the values used by the split in step 9: for the same number of rows and
# the same seed, train_test_split selects the same rows.
train_rows, _ = train_test_split(list(range(len(df))), test_size=0.2, random_state=42)
scaler = StandardScaler()
scaler.fit(df.iloc[train_rows][numerical_cols])

# Apply the train-fitted mean and standard deviation to every row
df[numerical_cols] = scaler.transform(df[numerical_cols])

## 8. Feature Engineering

In [12]:
# Example: Create a new feature 'Total_Nutrients' as the sum of Protein, Carbs, and Fat.
# The numeric columns were standardized in step 7, so adding them directly
# would sum z-scores rather than grams. Undo the scaling first to recover the
# original values, then add the three macronutrient columns.
raw_numeric = pd.DataFrame(
    scaler.inverse_transform(df[numerical_cols]),
    columns=numerical_cols,
    index=df.index,
)
macro_cols = ['Protein (g)', 'Carbohydrates (g)', 'Fat (g)']
# The new feature is in grams and is not standardized.
df['Total_Nutrients'] = raw_numeric[macro_cols].sum(axis=1)

## 9. Split the Data

In [13]:
from sklearn.model_selection import train_test_split

# Separate the target column from the remaining columns, then hold out 20% of
# the rows for testing (fixed seed so the split is reproducible)
target_col = 'Calories (kcal)'
y = df[target_col]                  # Target variable
X = df.drop(columns=[target_col])   # Features: every other column of df
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 10. Save the Preprocessed Data

In [14]:
# Write the preprocessed frame to disk so it can be reloaded later without
# repeating the preprocessing steps; index=False leaves the row index out of the file.
output_path = 'preprocessed_daily_food_nutrition_dataset.csv'
df.to_csv(output_path, index=False)