In [1]:
# =========================
# Hücre 1 – Importlar
# =========================

import os
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, top_k_accuracy_score

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier


In [2]:
# =========================
# Hücre 2 – Veriyi oku
# =========================

csv_path = "../data/Food_and_Nutrition new.csv"   # kendi yoluna göre güncelle
df = pd.read_csv(csv_path)

print("Satır sayısı:", len(df))
df.head()


Satır sayısı: 1698


Unnamed: 0,Gender,Activity Level,Dietary Preference,Breakfast Suggestion,Lunch Suggestion,Dinner Suggestion,Snack Suggestion,Disease,Ages,Height,Weight,Daily Calorie Target,Protein,Sugar,Sodium,Calories,Carbohydrates,Fiber,Fat
0,Male,Moderately Active,Omnivore,Oatmeal with berries and nuts,Grilled chicken salad with mixed greens,Salmon with roasted vegetables,Greek yogurt with fruit,Weight Gain,25,180,80,2000,120,125.0,24.0,2020,250,30.0,60
1,Female,Lightly Active,Vegetarian,Tofu scramble with veggies,Lentil soup with whole wheat bread,Vegetable stir-fry with brown rice,Apple with almond butter,"Weight Gain, Hypertension, Heart Disease",32,165,65,1600,80,100.0,16.0,1480,200,24.0,40
2,Male,Sedentary,Vegan,Tofu and veggie breakfast burrito,Black bean burger on a whole wheat bun,Lentil and vegetable curry,Trail mix,Weight Gain,48,175,95,2200,100,150.0,20.0,2185,300,36.0,65
3,Female,Very Active,Omnivore,Greek yogurt with granola and fruit,Chicken and vegetable stir-fry,Turkey chili with brown rice,Banana with peanut butter,Weight Gain,55,160,70,2500,140,175.0,28.0,2680,350,42.0,80
4,Male,Sedentary,Vegetarian,Scrambled eggs with whole wheat toast and avocado,Quinoa salad with chickpeas and vegetables,Vegetarian chili with cornbread,Fruit and nut mix,Weight Gain,62,170,85,2000,80,125.0,16.0,1815,250,30.0,55


In [3]:
# =========================
# Hücre 3 – Öneri kolonlarını temizle + rare class merge
# =========================

meal_cols = [
    "Breakfast Suggestion",
    "Lunch Suggestion",
    "Dinner Suggestion",
    "Snack Suggestion"
]

def clean_text(s: pd.Series) -> pd.Series:
    return (
        s.astype(str)
         .str.lower()
         .str.strip()
         .str.replace(r"\s+", " ", regex=True)
    )

# 1) Text temizliği
for col in meal_cols:
    df[col] = clean_text(df[col])

print("Temizlik sonrası sınıf sayıları:")
for col in meal_cols:
    print(col, "→", df[col].nunique(), "benzersiz yemek")


# 2) Rare class merge (şimdilik min_count=10, istersen sonra 5'e düşürürüz)
def merge_rare_classes(df, col, min_count=10):
    vc = df[col].value_counts()
    rare_classes = vc[vc < min_count].index
    new_label = "other_" + col.replace(" ", "_").lower()
    df[col] = df[col].replace(rare_classes, new_label)
    return df

for col in meal_cols:
    df = merge_rare_classes(df, col, min_count=10)

print("\nRare class merge sonrası sınıf sayıları:")
for col in meal_cols:
    print(col, "→", df[col].nunique(), "sınıf")


Temizlik sonrası sınıf sayıları:
Breakfast Suggestion → 115 benzersiz yemek
Lunch Suggestion → 187 benzersiz yemek
Dinner Suggestion → 174 benzersiz yemek
Snack Suggestion → 105 benzersiz yemek

Rare class merge sonrası sınıf sayıları:
Breakfast Suggestion → 18 sınıf
Lunch Suggestion → 18 sınıf
Dinner Suggestion → 22 sınıf
Snack Suggestion → 21 sınıf


In [4]:
# =========================
# Hücre 4 – Feature Engineering
# =========================

# 1) BMI
df["BMI"] = df["Weight"] / ((df["Height"] / 100.0) ** 2)

# 2) BMI kategorisi
def bmi_cat(bmi):
    if bmi < 18.5:
        return "under"
    elif bmi < 25:
        return "normal"
    elif bmi < 30:
        return "over"
    else:
        return "obese"

df["BMI_Category"] = df["BMI"].apply(bmi_cat)

# 3) Yaş grubu
def age_group(age):
    if age < 25:
        return "young"
    elif age < 40:
        return "adult"
    elif age < 60:
        return "mid"
    else:
        return "senior"

df["Age_Group"] = df["Ages"].apply(age_group)

# 4) Activity Level → sayısal skala
activity_map = {
    "Sedentary": 1,
    "Lightly Active": 2,
    "Moderate": 3,
    "Active": 4,
    "Very Active": 5
}
df["ActivityNum"] = df["Activity Level"].map(activity_map)

# Eksik kalanlara (map edilemeyen) ortalama değer verelim
df["ActivityNum"] = df["ActivityNum"].fillna(df["ActivityNum"].median())

# 5) Makro oranları (kcal'e göre normalize)
df["protein_ratio"] = df["Protein"] / (df["Daily Calorie Target"] + 1e-6)
df["fat_ratio"]     = df["Fat"]     / (df["Daily Calorie Target"] + 1e-6)
df["sugar_ratio"]   = df["Sugar"]   / (df["Daily Calorie Target"] + 1e-6)
df["sodium_ratio"]  = df["Sodium"]  / (df["Daily Calorie Target"] + 1e-6)

# 6) Bazı etkileşim feature'ları
df["prot_act"] = df["Protein"] * df["ActivityNum"]
df["kcal_act"] = df["Daily Calorie Target"] * df["ActivityNum"]
df["bmi_age"]  = df["BMI"] * df["Ages"]
df["pf_ratio"] = df["Protein"] / (df["Fat"] + 1e-3)

print("Yeni kolonlar eklendi. Örnek satırlar:")
df[[
    "Ages", "Height", "Weight", "Daily Calorie Target",
    "BMI", "BMI_Category", "Age_Group", "ActivityNum",
    "protein_ratio", "fat_ratio", "sugar_ratio", "sodium_ratio",
    "prot_act", "kcal_act", "bmi_age", "pf_ratio"
]].head()


Yeni kolonlar eklendi. Örnek satırlar:


Unnamed: 0,Ages,Height,Weight,Daily Calorie Target,BMI,BMI_Category,Age_Group,ActivityNum,protein_ratio,fat_ratio,sugar_ratio,sodium_ratio,prot_act,kcal_act,bmi_age,pf_ratio
0,25,180,80,2000,24.691358,normal,adult,2.0,0.06,0.03,0.0625,0.012,240.0,4000.0,617.283951,1.999967
1,32,165,65,1600,23.875115,normal,adult,2.0,0.05,0.025,0.0625,0.01,160.0,3200.0,764.003673,1.99995
2,48,175,95,2200,31.020408,obese,mid,1.0,0.045455,0.029545,0.068182,0.009091,100.0,2200.0,1488.979592,1.538438
3,55,160,70,2500,27.34375,over,mid,5.0,0.056,0.032,0.07,0.0112,700.0,12500.0,1503.90625,1.749978
4,62,170,85,2000,29.411765,over,senior,1.0,0.04,0.0275,0.0625,0.008,80.0,2000.0,1823.529412,1.454519


In [5]:
# =========================
# Hücre 5 – Feature listeleri (GÜNCEL)
# =========================

# Modele vereceğimiz tüm feature'lar:
feature_cols = [
    # Orijinal kategorikler
    "Gender",
    "Activity Level",
    "Dietary Preference",
    "Disease",

    # Orijinal sayısallar
    "Ages",
    "Height",
    "Weight",
    "Daily Calorie Target",
    "Protein",
    "Fat",
    "Sugar",
    "Sodium",

    # FE sayısallar
    "BMI",
    "ActivityNum",
    "protein_ratio",
    "fat_ratio",
    "sugar_ratio",
    "sodium_ratio",
    "prot_act",
    "kcal_act",
    "bmi_age",
    "pf_ratio",

    # FE kategorikler
    "BMI_Category",
    "Age_Group",
]

cat_cols = [
    "Gender",
    "Activity Level",
    "Dietary Preference",
    "Disease",
    "BMI_Category",
    "Age_Group",
]

num_cols = [
    "Ages",
    "Height",
    "Weight",
    "Daily Calorie Target",
    "Protein",
    "Fat",
    "Sugar",
    "Sodium",
    "BMI",
    "ActivityNum",
    "protein_ratio",
    "fat_ratio",
    "sugar_ratio",
    "sodium_ratio",
    "prot_act",
    "kcal_act",
    "bmi_age",
    "pf_ratio",
]

print("Toplam feature sayısı:", len(feature_cols))
print("Kategorik kolonlar:", cat_cols)
print("Sayısal kolonlar:", num_cols)


Toplam feature sayısı: 24
Kategorik kolonlar: ['Gender', 'Activity Level', 'Dietary Preference', 'Disease', 'BMI_Category', 'Age_Group']
Sayısal kolonlar: ['Ages', 'Height', 'Weight', 'Daily Calorie Target', 'Protein', 'Fat', 'Sugar', 'Sodium', 'BMI', 'ActivityNum', 'protein_ratio', 'fat_ratio', 'sugar_ratio', 'sodium_ratio', 'prot_act', 'kcal_act', 'bmi_age', 'pf_ratio']


In [6]:
# =========================
# Hücre 6 – RF ile tek öğün modeli eğen fonksiyon (FE + tuned)
# =========================

def train_meal_model_rf(
    df: pd.DataFrame,
    target_col: str,
    feature_cols,
    cat_cols,
    num_cols,
    min_count: int = 1,
    random_state: int = 42,
    model_name: str | None = None,
):
    """
    df:             veri seti
    target_col:     tahmin edilecek kolon
    feature_cols:   input özellik listesi
    cat_cols:       kategorik kolonlar
    num_cols:       sayısal kolonlar
    min_count:      bundan az görülen sınıflar atılır (ek güvenlik)
    """

    # 1) Rare class filtreleme (merge zaten yaptık, bu ek güvenlik)
    y_all = df[target_col]
    class_counts = y_all.value_counts()
    valid_classes = class_counts[class_counts >= min_count].index

    df_t = df[df[target_col].isin(valid_classes)].copy()
    y = df_t[target_col]
    X = df_t[feature_cols]

    print(f"\n===== {target_col} – RF (FE) modeli =====")
    print("Toplam örnek:", len(df_t))
    print("Sınıf sayısı:", y.nunique())

    # 2) Train–Valid–Test bölme (60/20/20, stratified)
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=random_state,
        stratify=y
    )

    X_train, X_val, y_train, y_val = train_test_split(
        X_trainval, y_trainval,
        test_size=0.25,
        random_state=random_state,
        stratify=y_trainval
    )

    print("Train:", X_train.shape, " Valid:", X_val.shape, " Test:", X_test.shape)

    # 3) Preprocess (OneHot + Scaling)
    preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", StandardScaler(), num_cols),
        ]
    )

    # 4) Tuned RF parametreleri (önceden RandomizedSearch ile bulduk + class_weight)
    rf_params = dict(
        n_estimators=400,
        max_depth=16,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features="sqrt",
        class_weight="balanced_subsample",
        random_state=random_state,
        n_jobs=-1,
    )

    rf_clf = RandomForestClassifier(**rf_params)

    model = Pipeline(steps=[
        ("preprocess", preprocess),
        ("clf", rf_clf),
    ])

    # 5) Train set ile eğit
    model.fit(X_train, y_train)

    # 6) Validation performansı
    y_val_pred = model.predict(X_val)
    val_acc = accuracy_score(y_val, y_val_pred)

    y_val_proba = model.predict_proba(X_val)
    val_labels = model.named_steps["clf"].classes_

    val_top3 = top_k_accuracy_score(
        y_val,
        y_val_proba,
        k=3,
        labels=val_labels
    )

    print(f"Validation Accuracy  : {val_acc:.4f}")
    print(f"Validation Top-3 Acc : {val_top3:.4f}")

    # 7) Train + Valid birleşimi ile final modeli eğit
    X_final_train = pd.concat([X_train, X_val], axis=0)
    y_final_train = pd.concat([y_train, y_val], axis=0)

    final_model = Pipeline(steps=[
        ("preprocess", preprocess),
        ("clf", RandomForestClassifier(**rf_params)),
    ])

    final_model.fit(X_final_train, y_final_train)

    # 8) Test performansı
    y_test_pred = final_model.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)

    y_test_proba = final_model.predict_proba(X_test)
    test_labels = final_model.named_steps["clf"].classes_

    test_top3 = top_k_accuracy_score(
        y_test,
        y_test_proba,
        k=3,
        labels=test_labels
    )

    print(f"TEST Accuracy        : {test_acc:.4f}")
    print(f"TEST Top-3 Accuracy  : {test_top3:.4f}")

    # 9) Modeli kaydet
    if model_name is not None:
        os.makedirs("models_rf_fe", exist_ok=True)
        path = os.path.join("models_rf_fe", f"{model_name}.pkl")
        joblib.dump(final_model, path)
        print(f"Model kaydedildi → {path}")

    return {
        "model": final_model,
        "test_acc": test_acc,
        "test_top3": test_top3,
        "n_samples": len(df_t),
        "n_classes": y.nunique(),
    }


In [7]:
# =========================
# Hücre 7 – Dört öğün için RF (FE) modeli eğit
# =========================

results_rf_fe = {}

results_rf_fe["breakfast"] = train_meal_model_rf(
    df=df,
    target_col="Breakfast Suggestion",
    feature_cols=feature_cols,
    cat_cols=cat_cols,
    num_cols=num_cols,
    min_count=1,
    random_state=42,
    model_name="rf_fe_breakfast"
)

results_rf_fe["lunch"] = train_meal_model_rf(
    df=df,
    target_col="Lunch Suggestion",
    feature_cols=feature_cols,
    cat_cols=cat_cols,
    num_cols=num_cols,
    min_count=1,
    random_state=42,
    model_name="rf_fe_lunch"
)

results_rf_fe["dinner"] = train_meal_model_rf(
    df=df,
    target_col="Dinner Suggestion",
    feature_cols=feature_cols,
    cat_cols=cat_cols,
    num_cols=num_cols,
    min_count=1,
    random_state=42,
    model_name="rf_fe_dinner"
)

results_rf_fe["snack"] = train_meal_model_rf(
    df=df,
    target_col="Snack Suggestion",
    feature_cols=feature_cols,
    cat_cols=cat_cols,
    num_cols=num_cols,
    min_count=1,
    random_state=42,
    model_name="rf_fe_snack"
)

results_rf_fe



===== Breakfast Suggestion – RF (FE) modeli =====
Toplam örnek: 1698
Sınıf sayısı: 18
Train: (1018, 24)  Valid: (340, 24)  Test: (340, 24)
Validation Accuracy  : 0.2765
Validation Top-3 Acc : 0.6412
TEST Accuracy        : 0.2588
TEST Top-3 Accuracy  : 0.6735
Model kaydedildi → models_rf_fe/rf_fe_breakfast.pkl

===== Lunch Suggestion – RF (FE) modeli =====
Toplam örnek: 1698
Sınıf sayısı: 18
Train: (1018, 24)  Valid: (340, 24)  Test: (340, 24)
Validation Accuracy  : 0.2882
Validation Top-3 Acc : 0.6529
TEST Accuracy        : 0.2824
TEST Top-3 Accuracy  : 0.6529
Model kaydedildi → models_rf_fe/rf_fe_lunch.pkl

===== Dinner Suggestion – RF (FE) modeli =====
Toplam örnek: 1698
Sınıf sayısı: 22
Train: (1018, 24)  Valid: (340, 24)  Test: (340, 24)
Validation Accuracy  : 0.3118
Validation Top-3 Acc : 0.6529
TEST Accuracy        : 0.2765
TEST Top-3 Accuracy  : 0.6235
Model kaydedildi → models_rf_fe/rf_fe_dinner.pkl

===== Snack Suggestion – RF (FE) modeli =====
Toplam örnek: 1698
Sınıf sayısı

{'breakfast': {'model': Pipeline(steps=[('preprocess',
                   ColumnTransformer(transformers=[('cat',
                                                    OneHotEncoder(handle_unknown='ignore'),
                                                    ['Gender', 'Activity Level',
                                                     'Dietary Preference',
                                                     'Disease', 'BMI_Category',
                                                     'Age_Group']),
                                                   ('num', StandardScaler(),
                                                    ['Ages', 'Height', 'Weight',
                                                     'Daily Calorie Target',
                                                     'Protein', 'Fat', 'Sugar',
                                                     'Sodium', 'BMI',
                                                     'ActivityNum',
                                     

In [8]:
# =========================
# FE-only feature listeleri
# =========================

fe_feature_cols = [
    # FE sayısallar
    "BMI",
    "ActivityNum",
    "protein_ratio",
    "fat_ratio",
    "sugar_ratio",
    "sodium_ratio",
    "prot_act",
    "kcal_act",
    "bmi_age",
    "pf_ratio",

    # FE kategorikler
    "BMI_Category",
    "Age_Group",
]

fe_cat_cols = [
    "BMI_Category",
    "Age_Group",
]

fe_num_cols = [
    "BMI",
    "ActivityNum",
    "protein_ratio",
    "fat_ratio",
    "sugar_ratio",
    "sodium_ratio",
    "prot_act",
    "kcal_act",
    "bmi_age",
    "pf_ratio",
]

print("FE-only feature sayısı:", len(fe_feature_cols))
print("FE-only kategorik:", fe_cat_cols)
print("FE-only sayısal:", fe_num_cols)


FE-only feature sayısı: 12
FE-only kategorik: ['BMI_Category', 'Age_Group']
FE-only sayısal: ['BMI', 'ActivityNum', 'protein_ratio', 'fat_ratio', 'sugar_ratio', 'sodium_ratio', 'prot_act', 'kcal_act', 'bmi_age', 'pf_ratio']


In [9]:
# =========================
# FE-only RF deneyleri
# =========================

results_rf_fe_only = {}

results_rf_fe_only["breakfast"] = train_meal_model_rf(
    df=df,
    target_col="Breakfast Suggestion",
    feature_cols=fe_feature_cols,
    cat_cols=fe_cat_cols,
    num_cols=fe_num_cols,
    min_count=1,
    random_state=42,
    model_name="rf_feonly_breakfast"
)

results_rf_fe_only["lunch"] = train_meal_model_rf(
    df=df,
    target_col="Lunch Suggestion",
    feature_cols=fe_feature_cols,
    cat_cols=fe_cat_cols,
    num_cols=fe_num_cols,
    min_count=1,
    random_state=42,
    model_name="rf_feonly_lunch"
)

results_rf_fe_only["dinner"] = train_meal_model_rf(
    df=df,
    target_col="Dinner Suggestion",
    feature_cols=fe_feature_cols,
    cat_cols=fe_cat_cols,
    num_cols=fe_num_cols,
    min_count=1,
    random_state=42,
    model_name="rf_feonly_dinner"
)

results_rf_fe_only["snack"] = train_meal_model_rf(
    df=df,
    target_col="Snack Suggestion",
    feature_cols=fe_feature_cols,
    cat_cols=fe_cat_cols,
    num_cols=fe_num_cols,
    min_count=1,
    random_state=42,
    model_name="rf_feonly_snack"
)

results_rf_fe_only



===== Breakfast Suggestion – RF (FE) modeli =====
Toplam örnek: 1698
Sınıf sayısı: 18
Train: (1018, 12)  Valid: (340, 12)  Test: (340, 12)
Validation Accuracy  : 0.2529
Validation Top-3 Acc : 0.6088
TEST Accuracy        : 0.2324
TEST Top-3 Accuracy  : 0.6265
Model kaydedildi → models_rf_fe/rf_feonly_breakfast.pkl

===== Lunch Suggestion – RF (FE) modeli =====
Toplam örnek: 1698
Sınıf sayısı: 18
Train: (1018, 12)  Valid: (340, 12)  Test: (340, 12)
Validation Accuracy  : 0.2912
Validation Top-3 Acc : 0.6618
TEST Accuracy        : 0.2324
TEST Top-3 Accuracy  : 0.6265
Model kaydedildi → models_rf_fe/rf_feonly_lunch.pkl

===== Dinner Suggestion – RF (FE) modeli =====
Toplam örnek: 1698
Sınıf sayısı: 22
Train: (1018, 12)  Valid: (340, 12)  Test: (340, 12)
Validation Accuracy  : 0.2824
Validation Top-3 Acc : 0.6529
TEST Accuracy        : 0.2382
TEST Top-3 Accuracy  : 0.5765
Model kaydedildi → models_rf_fe/rf_feonly_dinner.pkl

===== Snack Suggestion – RF (FE) modeli =====
Toplam örnek: 1698


{'breakfast': {'model': Pipeline(steps=[('preprocess',
                   ColumnTransformer(transformers=[('cat',
                                                    OneHotEncoder(handle_unknown='ignore'),
                                                    ['BMI_Category',
                                                     'Age_Group']),
                                                   ('num', StandardScaler(),
                                                    ['BMI', 'ActivityNum',
                                                     'protein_ratio', 'fat_ratio',
                                                     'sugar_ratio',
                                                     'sodium_ratio', 'prot_act',
                                                     'kcal_act', 'bmi_age',
                                                     'pf_ratio'])])),
                  ('clf',
                   RandomForestClassifier(class_weight='balanced_subsample',
                        