Feature engineering

In [2]:
import pandas as pd

# Load filtered dataset.
# NOTE(review): the recorded cell output echoes this read_csv line the way a
# Python warning does -- presumably pandas' mixed-type DtypeWarning (confirm
# against the real message). low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk.
df = pd.read_csv("filtered_food_data.csv", low_memory=False)

# ========== 1. Feature: Count of 'bad' ingredients ==========
# Substrings searched for in the ingredient text.
bad_ingredients = [
    'sugar', 'fructose', 'glucose', 'syrup', 'palm oil',
    'maltodextrin', 'artificial', 'color', 'sweetener',
]


def count_bad_ingredients(text):
    """Return the total number of 'bad' ingredient mentions in ``text``.

    The input is converted with ``str()`` and lower-cased, then the
    non-overlapping occurrences of every entry of ``bad_ingredients`` are
    counted as plain substrings and the per-keyword counts are added up.
    """
    lowered = str(text).lower()
    total = 0
    for keyword in bad_ingredients:
        total += lowered.count(keyword)
    return total

def _count_comma_separated_items(text):
    """Return how many non-empty comma-separated items ``text`` contains.

    Missing values (``None``/NaN) and blank strings yield 0. Tokens that are
    empty after stripping whitespace (e.g. from a trailing comma) are ignored.
    """
    if pd.isna(text):
        return 0
    return sum(1 for item in str(text).split(',') if item.strip())


df['bad_ingredient_count'] = df['ingredients_text'].apply(count_bad_ingredients)

# ========== 2. Feature: Length of ingredients list ==========
# A plain len(str(x).split(',')) reports 1 for a missing ingredient text
# (str(NaN) == 'nan'); the helper reports 0, matching the additive count below.
df['ingredients_length'] = df['ingredients_text'].apply(_count_comma_separated_items)

# ========== 3. Feature: Count of additives ==========
df['additive_count'] = df['additives_tags'].apply(_count_comma_separated_items)

# ========== 4. Feature: Nutritional categories ==========
# You can use binning or create a manual health label for classification

def score_to_label(score):
    """Map a numeric score to a coarse health label.

    Args:
        score: Numeric score; lower values map to the healthier labels.

    Returns:
        'Healthy' for scores <= 0, 'Moderate' for scores in (0, 10],
        'Unhealthy' for scores above 10, and None when the score is missing
        (None or NaN).
    """
    # Every comparison against NaN is False, so without this guard a missing
    # score would fall through to the last branch and be labelled 'Unhealthy'.
    if pd.isna(score):
        return None
    if score <= 0:
        return 'Healthy'
    if score <= 10:
        return 'Moderate'
    return 'Unhealthy'

# Derive the classification target by labelling every nutriscore_score value.
nutriscore_values = df['nutriscore_score']
df['health_label'] = nutriscore_values.apply(score_to_label)

# Optional: show how many rows fall into each label.
label_distribution = df['health_label'].value_counts()
print(label_distribution)

# ========== 5. Final feature set ==========
# Model inputs: the *_100g columns followed by the engineered count columns.
features = [
    'energy_100g',
    'fat_100g',
    'sugars_100g',
    'salt_100g',
    'fiber_100g',
    'proteins_100g',
    'bad_ingredient_count',
    'additive_count',
    'ingredients_length',
]

# Target columns for the regression and the classification task.
target_regression, target_classification = 'nutriscore_score', 'health_label'

# Save feature-engineered dataset (every column of df, without the index).
df.to_csv("engineered_food_data.csv", index=False)

# Preview the first rows of the feature and target columns.
preview_columns = [*features, target_regression, target_classification]
df[preview_columns].head()


  df = pd.read_csv("filtered_food_data.csv")


health_label
Unhealthy    108713
Moderate      70764
Healthy       49587
Name: count, dtype: int64


Unnamed: 0,energy_100g,fat_100g,sugars_100g,salt_100g,fiber_100g,proteins_100g,bad_ingredient_count,additive_count,ingredients_length,nutriscore_score,health_label
0,2243.0,28.57,14.29,0.0,3.6,3.57,2,0,4,14.0,Unhealthy
1,1941.0,17.86,17.86,0.635,7.1,17.86,1,0,13,0.0,Healthy
2,2540.0,57.14,3.57,1.22428,7.1,17.86,0,0,5,12.0,Unhealthy
3,1833.0,18.75,15.62,0.1397,9.4,14.06,0,1,31,7.0,Moderate
4,2230.0,36.67,3.33,1.60782,6.7,16.67,1,0,25,12.0,Unhealthy
