In [3]:
import pandas as pd

# Maps each source column in the raw food CSV to the readable name used in the
# output. Keeping the rename as an explicit mapping (instead of assigning a
# positional list to ``.columns``) ties every output name to its source column,
# so reordering or inserting an entry cannot silently mislabel the data.
COLUMN_RENAMES = {
    'Description': 'Description',
    'Data.Carbohydrate': 'Carbohydrate (g)',
    'Data.Protein': 'Protein (g)',
    'Data.Fat.Total Lipid': 'Total Fat (g)',
    'Data.Kilocalories': 'Kilocalories',
    'Data.Fiber': 'Fiber (g)',
    'Data.Sugar Total': 'Sugar Total (g)',
    'Data.Major Minerals.Calcium': 'Calcium (mg)',
    'Data.Major Minerals.Iron': 'Iron (mg)',
    'Data.Vitamins.Vitamin C': 'Vitamin C (mg)',
    'Data.Vitamins.Vitamin E': 'Vitamin E (mg)',
    'Data.Major Minerals.Sodium': 'Sodium (mg)',
    'Data.Cholesterol': 'Cholesterol (mg)',
}

# Source columns to pull from the raw data, in output order.
columns_to_extract = list(COLUMN_RENAMES)

# Output columns that hold numeric values (everything except the text label).
numeric_columns = [
    name for name in COLUMN_RENAMES.values() if name != 'Description'
]


def build_filtered_food_data(df):
    """Return a cleaned nutrient table extracted from the raw food data.

    Selects the columns listed in ``COLUMN_RENAMES``, renames them to their
    readable names, converts every nutrient column to a numeric dtype
    (non-numeric values become NaN) and then replaces missing nutrient values
    with 0.

    Only the nutrient columns are zero-filled. A missing ``Description`` is
    left as missing rather than being turned into the number 0, which would
    be a meaningless label in a text column.

    Args:
        df: DataFrame containing at least the source columns named in
            ``COLUMN_RENAMES``. It is not modified.

    Returns:
        A new DataFrame with the renamed columns in ``COLUMN_RENAMES`` order.

    Raises:
        KeyError: If any expected source column is absent from ``df``.
    """
    missing = [name for name in columns_to_extract if name not in df.columns]
    if missing:
        raise KeyError(f"Input data is missing expected columns: {missing}")

    # Column selection and rename both return new objects, so the caller's
    # frame is never mutated.
    filtered = df[columns_to_extract].rename(columns=COLUMN_RENAMES)

    for col in numeric_columns:
        # errors='coerce' turns unparseable entries into NaN; those, together
        # with values that were already missing, are then filled with 0 for
        # simplicity (mean/median imputation would be an alternative).
        filtered[col] = pd.to_numeric(filtered[col], errors='coerce').fillna(0)

    return filtered


# The guard is true when this runs as a notebook cell or a script, so the
# pipeline still executes there; it only prevents the file I/O from firing
# when the code is imported as a module.
if __name__ == "__main__":
    # Reading the CSV file
    df = pd.read_csv('food.csv/food.csv')

    # Selecting, renaming and cleaning the nutrient columns
    filtered_df = build_filtered_food_data(df)

    # Saving the processed dataframe to a new CSV file
    filtered_df.to_csv('filtered_food_data.csv', index=False)

    # Displaying the first few rows of the processed data
    print(filtered_df.head())

                Description  Carbohydrate (g)  Protein (g)  Total Fat (g)  \
0          BUTTER,WITH SALT              0.06         0.85          81.11   
1  BUTTER,WHIPPED,WITH SALT              0.06         0.85          81.11   
2      BUTTER OIL,ANHYDROUS              0.00         0.28          99.48   
3               CHEESE,BLUE              2.34        21.40          28.74   
4              CHEESE,BRICK              2.79        23.24          29.68   

   Kilocalories  Fiber (g)  Sugar Total (g)  Calcium (mg)  Iron (mg)  \
0           717        0.0             0.06            24       0.02   
1           717        0.0             0.06            24       0.16   
2           876        0.0             0.00             4       0.00   
3           353        0.0             0.50           528       0.31   
4           371        0.0             0.51           674       0.43   

   Vitamin C (mg)  Vitamin E (mg)  Sodium (mg)  Cholesterol (mg)  
0             0.0            2.32    