In [10]:
# =========================
# Cell 1 - Imports
# =========================
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, top_k_accuracy_score

import joblib  # modelleri kaydetmek iÃ§in

# Directory where the trained models are saved (created if it does not exist yet)
os.makedirs("models", exist_ok=True)


In [11]:
# =========================
# Cell 2 - Load the data
# (adjust the path to match your own directory layout)
# =========================
csv_path = "../data/Food_and_Nutrition new.csv"
df = pd.read_csv(csv_path)

# Quick sanity check: dimensions first, then the first few rows
print(df.shape)
df.head()


(1698, 19)


Unnamed: 0,Gender,Activity Level,Dietary Preference,Breakfast Suggestion,Lunch Suggestion,Dinner Suggestion,Snack Suggestion,Disease,Ages,Height,Weight,Daily Calorie Target,Protein,Sugar,Sodium,Calories,Carbohydrates,Fiber,Fat
0,Male,Moderately Active,Omnivore,Oatmeal with berries and nuts,Grilled chicken salad with mixed greens,Salmon with roasted vegetables,Greek yogurt with fruit,Weight Gain,25,180,80,2000,120,125.0,24.0,2020,250,30.0,60
1,Female,Lightly Active,Vegetarian,Tofu scramble with veggies,Lentil soup with whole wheat bread,Vegetable stir-fry with brown rice,Apple with almond butter,"Weight Gain, Hypertension, Heart Disease",32,165,65,1600,80,100.0,16.0,1480,200,24.0,40
2,Male,Sedentary,Vegan,Tofu and veggie breakfast burrito,Black bean burger on a whole wheat bun,Lentil and vegetable curry,Trail mix,Weight Gain,48,175,95,2200,100,150.0,20.0,2185,300,36.0,65
3,Female,Very Active,Omnivore,Greek yogurt with granola and fruit,Chicken and vegetable stir-fry,Turkey chili with brown rice,Banana with peanut butter,Weight Gain,55,160,70,2500,140,175.0,28.0,2680,350,42.0,80
4,Male,Sedentary,Vegetarian,Scrambled eggs with whole wheat toast and avocado,Quinoa salad with chickpeas and vegetables,Vegetarian chili with cornbread,Fruit and nut mix,Weight Gain,62,170,85,2000,80,125.0,16.0,1815,250,30.0,55


In [12]:
# =========================
# Cell 3 - Clean the (text) meal-suggestion columns
# =========================
# The four free-text suggestion columns; each one is later used as a prediction target.
meal_cols = [
    "Breakfast Suggestion",
    "Lunch Suggestion",
    "Dinner Suggestion",
    "Snack Suggestion"
]

def clean_text(s: pd.Series) -> pd.Series:
    """Normalise free-text labels so that equivalent spellings compare equal.

    Each non-missing value is converted to text, lower-cased, stripped of
    leading/trailing whitespace, and has every run of whitespace collapsed
    to a single space.

    Missing values (None / NaN) are kept as missing. A plain ``astype(str)``
    would turn them into the literal text "nan" / "None", which would then
    be counted as a real label by ``value_counts`` / ``nunique``.

    Parameters
    ----------
    s : pd.Series
        Column of raw labels.

    Returns
    -------
    pd.Series
        Cleaned labels with the same index as ``s``.
    """
    cleaned = (
        s.astype(str)
         .str.lower()                            # lower-case everything
         .str.strip()                            # drop leading/trailing whitespace
         .str.replace(r"\s+", " ", regex=True)   # collapse repeated whitespace
    )
    # Restore the original missing entries instead of keeping their string form.
    return cleaned.where(s.notna())

# Clean all meal-suggestion columns in one pass (clean_text runs once per column)
df[meal_cols] = df[meal_cols].apply(clean_text)

# Number of distinct labels per column after cleaning
print("Temizlik sonrasÄ± sÄ±nÄ±f sayÄ±larÄ±:")
for meal_col in meal_cols:
    n_unique = df[meal_col].nunique()
    print(meal_col, "â†’", n_unique, "benzersiz yemek")


# =========================
# Rare Class Merge (min_count=10)
# =========================

def merge_rare_classes(df, col, min_count=10):
    """
    Collapse infrequent labels of ``df[col]`` into a single catch-all label.

    Every value that occurs fewer than ``min_count`` times is replaced by
    ``"other_<column name>"``, where the column name is lower-cased and its
    spaces are turned into underscores.

    The column is overwritten on the frame that was passed in, and that
    same frame object is returned.
    """
    fallback_label = "other_" + col.replace(" ", "_").lower()

    frequencies = df[col].value_counts()
    infrequent = frequencies[frequencies.lt(min_count)].index

    # Keep frequent labels untouched, substitute the fallback everywhere else.
    is_infrequent = df[col].isin(infrequent)
    df[col] = df[col].where(~is_infrequent, fallback_label)
    return df

# Fold rare labels of every meal column into its "other_..." bucket
for meal_col in meal_cols:
    df = merge_rare_classes(df, meal_col, min_count=10)

# Number of classes per column after the merge
print("\nRare class merge sonrasÄ± sÄ±nÄ±f sayÄ±larÄ±:")
for meal_col in meal_cols:
    n_classes = df[meal_col].nunique()
    print(meal_col, "â†’", n_classes, "sÄ±nÄ±f")


Temizlik sonrasÄ± sÄ±nÄ±f sayÄ±larÄ±:
Breakfast Suggestion â†’ 115 benzersiz yemek
Lunch Suggestion â†’ 187 benzersiz yemek
Dinner Suggestion â†’ 174 benzersiz yemek
Snack Suggestion â†’ 105 benzersiz yemek

Rare class merge sonrasÄ± sÄ±nÄ±f sayÄ±larÄ±:
Breakfast Suggestion â†’ 18 sÄ±nÄ±f
Lunch Suggestion â†’ 18 sÄ±nÄ±f
Dinner Suggestion â†’ 22 sÄ±nÄ±f
Snack Suggestion â†’ 21 sÄ±nÄ±f


In [14]:
# =========================
# Cell 4 - Feature lists
# (the columns that make up the model input X)
# =========================

# Categorical input columns
cat_cols = [
    "Gender",
    "Activity Level",
    "Dietary Preference",
    "Disease",
]

# Numeric input columns
num_cols = [
    "Ages",
    "Height",
    "Weight",
    "Daily Calorie Target",
    "Protein",
    "Fat",
    "Sugar",
    "Sodium",
]

# All input columns: categorical first, then numeric (a new list, not an alias)
feature_cols = [*cat_cols, *num_cols]


In [15]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import os, joblib


def _build_meal_pipeline(cat_cols, num_cols, random_state):
    """Create a new, unfitted one-hot/scale + random-forest pipeline."""
    preprocess = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
            ("num", StandardScaler(), num_cols),
        ]
    )
    rf_clf = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=random_state,
        n_jobs=-1,
    )
    return Pipeline(steps=[
        ("preprocess", preprocess),
        ("clf", rf_clf),
    ])


def _accuracy_and_top3(fitted_model, X, y):
    """Return (accuracy, top-3 accuracy) of a fitted pipeline on X / y."""
    accuracy = accuracy_score(y, fitted_model.predict(X))
    # Pass the classifier's own class list as `labels` so the probability
    # columns are matched to class names even if `y` lacks some class.
    top3 = top_k_accuracy_score(
        y,
        fitted_model.predict_proba(X),
        k=3,
        labels=fitted_model.named_steps["clf"].classes_,
    )
    return accuracy, top3


def train_meal_model(
    df: pd.DataFrame,
    target_col: str,
    feature_cols,
    cat_cols,
    num_cols,
    min_count: int = 1,
    random_state: int = 42,
    model_name: str | None = None,
):
    """
    Train and evaluate a random-forest classifier for one meal column.

    Steps: drop rows whose target class is rarer than ``min_count``; split
    60/20/20 into train/validation/test (stratified on the target); fit on
    train and report validation metrics; refit a separate pipeline on
    train + validation and report test metrics; optionally save that final
    pipeline.

    Parameters
    ----------
    df : pd.DataFrame
        Data set containing the feature columns and ``target_col``.
    target_col : str
        Column to predict (e.g. 'Breakfast Suggestion').
    feature_cols
        Columns selected from ``df`` as model input X.
    cat_cols
        Columns passed to the one-hot encoder.
    num_cols
        Columns passed to the standard scaler.
    min_count : int
        Classes seen fewer than this many times are dropped before splitting.
    random_state : int
        Seed used for both splits and for the random forest.
    model_name : str | None
        If not None, the final pipeline is saved as models/{model_name}.pkl.

    Returns
    -------
    dict
        Keys: "model" (final fitted pipeline), "val_acc", "val_top3",
        "test_acc", "test_top3", "n_samples", "n_classes".

    Notes
    -----
    NOTE(review): both splits are stratified, so scikit-learn is expected to
    reject targets that still contain very rare classes; raise ``min_count``
    if that happens - confirm the exact threshold against the sklearn docs.
    """

    # 1) Rare-class filtering
    class_counts = df[target_col].value_counts()
    valid_classes = class_counts[class_counts >= min_count].index

    df_t = df[df[target_col].isin(valid_classes)].copy()
    y = df_t[target_col]
    X = df_t[feature_cols]

    print(f"\n===== {target_col} modeli =====")
    print("Toplam Ã¶rnek:", len(df_t))
    print("SÄ±nÄ±f sayÄ±sÄ±:", y.nunique())

    # 2) Train / validation / test split (60/20/20)
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=random_state,
        stratify=y
    )

    X_train, X_val, y_train, y_val = train_test_split(
        X_trainval, y_trainval,
        test_size=0.25,              # 0.25 * 0.8 = 0.2 of the full data
        random_state=random_state,
        stratify=y_trainval
    )

    print("Train:", X_train.shape, " Valid:", X_val.shape, " Test:", X_test.shape)

    # 3) Fit on the training split only and measure validation performance.
    model = _build_meal_pipeline(cat_cols, num_cols, random_state)
    model.fit(X_train, y_train)

    val_acc, val_top3 = _accuracy_and_top3(model, X_val, y_val)

    print(f"Validation Accuracy  : {val_acc:.4f}")
    print(f"Validation Top-3 Acc : {val_top3:.4f}")

    # 4) Refit on train + validation for the final model.
    #    A Pipeline stores the estimator objects it is given, so reusing the
    #    instances from step 3 would refit (and overwrite) the validation
    #    model. A fresh pipeline keeps the two models independent.
    X_final_train = pd.concat([X_train, X_val], axis=0)
    y_final_train = pd.concat([y_train, y_val], axis=0)

    final_model = _build_meal_pipeline(cat_cols, num_cols, random_state)
    final_model.fit(X_final_train, y_final_train)

    # 5) Test performance of the final model
    test_acc, test_top3 = _accuracy_and_top3(final_model, X_test, y_test)

    print(f"TEST Accuracy        : {test_acc:.4f}")
    print(f"TEST Top-3 Accuracy  : {test_top3:.4f}")

    # 6) Save the final model
    if model_name is not None:
        os.makedirs("models", exist_ok=True)
        path = os.path.join("models", f"{model_name}.pkl")
        joblib.dump(final_model, path)
        print(f"Model kaydedildi â†’ {path}")

    return {
        "model": final_model,
        "val_acc": val_acc,
        "val_top3": val_top3,
        "test_acc": test_acc,
        "test_top3": test_top3,
        "n_samples": len(df_t),
        "n_classes": y.nunique(),
    }


In [16]:
# =========================
# Cell 6 - Train one model per meal
# (breakfast, lunch, dinner, snack - in that order)
# =========================

# result key -> target column; the key is also used for the saved file name (rf_<key>)
meal_targets = {
    "breakfast": "Breakfast Suggestion",
    "lunch": "Lunch Suggestion",
    "dinner": "Dinner Suggestion",
    "snack": "Snack Suggestion",
}

results = {}

for meal_key, meal_target in meal_targets.items():
    results[meal_key] = train_meal_model(
        df=df,
        target_col=meal_target,
        feature_cols=feature_cols,
        cat_cols=cat_cols,
        num_cols=num_cols,
        min_count=1,
        random_state=42,
        model_name=f"rf_{meal_key}",
    )

results



===== Breakfast Suggestion modeli =====
Toplam Ã¶rnek: 1698
SÄ±nÄ±f sayÄ±sÄ±: 18
Train: (1018, 8)  Valid: (340, 8)  Test: (340, 8)
Validation Accuracy  : 0.2853
Validation Top-3 Acc : 0.5559
TEST Accuracy        : 0.2147
TEST Top-3 Accuracy  : 0.5412
Model kaydedildi â†’ models/rf_breakfast.pkl

===== Lunch Suggestion modeli =====
Toplam Ã¶rnek: 1698
SÄ±nÄ±f sayÄ±sÄ±: 18
Train: (1018, 8)  Valid: (340, 8)  Test: (340, 8)
Validation Accuracy  : 0.2706
Validation Top-3 Acc : 0.5559
TEST Accuracy        : 0.2471
TEST Top-3 Accuracy  : 0.5500
Model kaydedildi â†’ models/rf_lunch.pkl

===== Dinner Suggestion modeli =====
Toplam Ã¶rnek: 1698
SÄ±nÄ±f sayÄ±sÄ±: 22
Train: (1018, 8)  Valid: (340, 8)  Test: (340, 8)
Validation Accuracy  : 0.2735
Validation Top-3 Acc : 0.5529
TEST Accuracy        : 0.2500
TEST Top-3 Accuracy  : 0.5265
Model kaydedildi â†’ models/rf_dinner.pkl

===== Snack Suggestion modeli =====
Toplam Ã¶rnek: 1698
SÄ±nÄ±f sayÄ±sÄ±: 21
Train: (1018, 8)  Valid: (340, 8)  Test: (340

{'breakfast': {'model': Pipeline(steps=[('preprocess',
                   ColumnTransformer(transformers=[('cat',
                                                    OneHotEncoder(handle_unknown='ignore'),
                                                    ['Gender', 'Activity Level',
                                                     'Dietary Preference',
                                                     'Disease']),
                                                   ('num', StandardScaler(),
                                                    ['Ages', 'Height', 'Weight',
                                                     'Daily Calorie Target'])])),
                  ('clf',
                   RandomForestClassifier(n_estimators=300, n_jobs=-1,
                                          random_state=42))]),
  'test_acc': 0.21470588235294116,
  'test_top3': np.float64(0.5411764705882353),
  'n_samples': 1698,
  'n_classes': 18},
 'lunch': {'model': Pipeline(steps=[('preprocess'