In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp313-cp313-macosx_11_0_universal2.whl.metadata (1.4 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.8-cp313-cp313-macosx_11_0_universal2.whl (27.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.8/27.8 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Downloading graphviz-0.21-py3-none-any.whl (47 kB)
Installing collected packages: graphviz, catboost
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [catboost]1/2[0m [catboost]
Successfully installed catboost-1.2.8 graphviz-0.21
Note: you may need to restart the kernel to use updated packages.


In [1]:
# =========================
# Hücre 1 – Importlar
# =========================

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, top_k_accuracy_score

from catboost import CatBoostClassifier

import os


In [2]:
# =========================
# Hücre 2 – Veriyi oku
# =========================

csv_path = "../data/Food_and_Nutrition new.csv"  # kendi yoluna göre güncelle
df = pd.read_csv(csv_path)

print("Satır sayısı:", len(df))
df.head()


Satır sayısı: 1698


Unnamed: 0,Gender,Activity Level,Dietary Preference,Breakfast Suggestion,Lunch Suggestion,Dinner Suggestion,Snack Suggestion,Disease,Ages,Height,Weight,Daily Calorie Target,Protein,Sugar,Sodium,Calories,Carbohydrates,Fiber,Fat
0,Male,Moderately Active,Omnivore,Oatmeal with berries and nuts,Grilled chicken salad with mixed greens,Salmon with roasted vegetables,Greek yogurt with fruit,Weight Gain,25,180,80,2000,120,125.0,24.0,2020,250,30.0,60
1,Female,Lightly Active,Vegetarian,Tofu scramble with veggies,Lentil soup with whole wheat bread,Vegetable stir-fry with brown rice,Apple with almond butter,"Weight Gain, Hypertension, Heart Disease",32,165,65,1600,80,100.0,16.0,1480,200,24.0,40
2,Male,Sedentary,Vegan,Tofu and veggie breakfast burrito,Black bean burger on a whole wheat bun,Lentil and vegetable curry,Trail mix,Weight Gain,48,175,95,2200,100,150.0,20.0,2185,300,36.0,65
3,Female,Very Active,Omnivore,Greek yogurt with granola and fruit,Chicken and vegetable stir-fry,Turkey chili with brown rice,Banana with peanut butter,Weight Gain,55,160,70,2500,140,175.0,28.0,2680,350,42.0,80
4,Male,Sedentary,Vegetarian,Scrambled eggs with whole wheat toast and avocado,Quinoa salad with chickpeas and vegetables,Vegetarian chili with cornbread,Fruit and nut mix,Weight Gain,62,170,85,2000,80,125.0,16.0,1815,250,30.0,55


In [3]:
# =========================
# Hücre 3 – Öneri kolonlarını temizle + rare class merge
# =========================

meal_cols = [
    "Breakfast Suggestion",
    "Lunch Suggestion",
    "Dinner Suggestion",
    "Snack Suggestion"
]

def clean_text(s: pd.Series) -> pd.Series:
    return (
        s.astype(str)
         .str.lower()
         .str.strip()
         .str.replace(r"\s+", " ", regex=True)
    )

# 1) Text temizliği
for col in meal_cols:
    df[col] = clean_text(df[col])

print("Temizlik sonrası sınıf sayıları:")
for col in meal_cols:
    print(col, "→", df[col].nunique(), "benzersiz yemek")


# 2) Rare class merge (min_count=10)
def merge_rare_classes(df, col, min_count=10):
    vc = df[col].value_counts()
    rare_classes = vc[vc < min_count].index
    new_label = "other_" + col.replace(" ", "_").lower()
    df[col] = df[col].replace(rare_classes, new_label)
    return df

for col in meal_cols:
    df = merge_rare_classes(df, col, min_count=10)

print("\nRare class merge sonrası sınıf sayıları:")
for col in meal_cols:
    print(col, "→", df[col].nunique(), "sınıf")


Temizlik sonrası sınıf sayıları:
Breakfast Suggestion → 115 benzersiz yemek
Lunch Suggestion → 187 benzersiz yemek
Dinner Suggestion → 174 benzersiz yemek
Snack Suggestion → 105 benzersiz yemek

Rare class merge sonrası sınıf sayıları:
Breakfast Suggestion → 18 sınıf
Lunch Suggestion → 18 sınıf
Dinner Suggestion → 22 sınıf
Snack Suggestion → 21 sınıf


In [4]:
# =========================
# Hücre 4 – Feature listeleri
# =========================

feature_cols = [
    "Gender",
    "Activity Level",
    "Dietary Preference",
    "Disease",
    "Ages",
    "Height",
    "Weight",
    "Daily Calorie Target",
    "Protein",
    "Fat",
    "Sugar",
    "Sodium",
]

cat_cols = [
    "Gender",
    "Activity Level",
    "Dietary Preference",
    "Disease",
]

num_cols = [
    "Ages",
    "Height",
    "Weight",
    "Daily Calorie Target",
    "Protein",
    "Fat",
    "Sugar",
    "Sodium",
]

print("Toplam feature:", len(feature_cols))
print("Kategorik:", cat_cols)
print("Sayısal:", num_cols)


Toplam feature: 12
Kategorik: ['Gender', 'Activity Level', 'Dietary Preference', 'Disease']
Sayısal: ['Ages', 'Height', 'Weight', 'Daily Calorie Target', 'Protein', 'Fat', 'Sugar', 'Sodium']


In [5]:
# =========================
# Hücre 5 – CatBoost ile tek öğün modeli eğen fonksiyon
# =========================

def train_catboost_meal(
    df: pd.DataFrame,
    target_col: str,
    feature_cols,
    cat_cols,
    min_count: int = 1,
    random_state: int = 42,
    model_name: str | None = None,
):
    """
    df:          veri seti
    target_col:  tahmin edilecek kolon (örn: 'Breakfast Suggestion')
    min_count:   bundan az görülen sınıflar atılır (ek güvenlik için)
    model_name:  'cat_breakfast' gibi; None değilse models/{model_name}.cbm kaydedilir
    """

    # 1) Rare sınıfları tamamen at (gerekirse)
    y_all = df[target_col]
    class_counts = y_all.value_counts()
    valid_classes = class_counts[class_counts >= min_count].index

    df_t = df[df[target_col].isin(valid_classes)].copy()
    y = df_t[target_col]
    X = df_t[feature_cols]

    print(f"\n===== {target_col} – CatBoost modeli =====")
    print("Toplam örnek:", len(df_t))
    print("Sınıf sayısı:", y.nunique())

    # 2) Train–Valid–Test bölme (60/20/20, stratified)
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=random_state,
        stratify=y
    )

    X_train, X_val, y_train, y_val = train_test_split(
        X_trainval, y_trainval,
        test_size=0.25,    # 0.25 * 0.8 = 0.2
        random_state=random_state,
        stratify=y_trainval
    )

    print("Train:", X_train.shape, " Valid:", X_val.shape, " Test:", X_test.shape)

    # 3) CatBoost – kategorik kolon index'lerini hazırla
    #    CatBoost'a X_train'deki index'e göre veriyoruz
    cat_feature_indices = [feature_cols.index(c) for c in cat_cols]

    model = CatBoostClassifier(
        loss_function="MultiClass",
        eval_metric="MultiClass",
        depth=6,
        learning_rate=0.3,
        l2_leaf_reg=3,
        iterations=500,
        random_state=random_state,
        verbose=False,
        task_type="CPU"
    )

    # 4) Train (early stopping için eval_set veriyoruz)
    model.fit(
        X_train,
        y_train,
        cat_features=cat_feature_indices,
        eval_set=(X_val, y_val),
        use_best_model=True,
        verbose=False
    )

    # 5) Validation performansı
    y_val_pred = model.predict(X_val)
    y_val_pred = y_val_pred.reshape(-1)  # CatBoost (n,1) döndürebiliyor

    val_acc = accuracy_score(y_val, y_val_pred)

    y_val_proba = model.predict_proba(X_val)
    val_top3 = top_k_accuracy_score(
        y_val,
        y_val_proba,
        k=3
    )

    print(f"Validation Accuracy  : {val_acc:.4f}")
    print(f"Validation Top-3 Acc : {val_top3:.4f}")

    # 6) Train + Valid birleştir, final modeli yeniden eğit
    X_final_train = pd.concat([X_train, X_val], axis=0)
    y_final_train = pd.concat([y_train, y_val], axis=0)

    final_model = CatBoostClassifier(
        loss_function="MultiClass",
        eval_metric="MultiClass",
        depth=8,
        learning_rate=0.1,
        iterations=model.tree_count_,  # best_model kaç ağaç kullandıysa onu kullan
        random_state=random_state,
        verbose=False,
        task_type="CPU"
    )

    final_model.fit(
        X_final_train,
        y_final_train,
        cat_features=cat_feature_indices,
        verbose=False
    )

    # 7) Test performansı
    y_test_pred = final_model.predict(X_test).reshape(-1)
    test_acc = accuracy_score(y_test, y_test_pred)

    y_test_proba = final_model.predict_proba(X_test)
    test_top3 = top_k_accuracy_score(
        y_test,
        y_test_proba,
        k=3
    )

    print(f"TEST Accuracy        : {test_acc:.4f}")
    print(f"TEST Top-3 Accuracy  : {test_top3:.4f}")

    # 8) Modeli kaydet
    if model_name is not None:
        os.makedirs("models_catboost", exist_ok=True)
        path = os.path.join("models_catboost", f"{model_name}.cbm")
        final_model.save_model(path)
        print(f"Model kaydedildi → {path}")

    return {
        "model": final_model,
        "test_acc": test_acc,
        "test_top3": test_top3,
        "n_samples": len(df_t),
        "n_classes": y.nunique(),
    }


In [None]:
# =========================
# CatBoost Hyperparameter Tuning – Breakfast örneği
# =========================

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

target_col = "Breakfast Suggestion"

X = df[feature_cols]
y = df[target_col]

# Stratified split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

cat_feature_indices = [feature_cols.index(c) for c in cat_cols]

# En iyi ayarları aramak için grid
param_grid = {
    "depth": [6, 8, 10],
    "learning_rate": [0.03, 0.05, 0.1],
    "l2_leaf_reg": [3, 10, 30],
    "iterations": [300, 500, 700]
}

best_score = 0
best_params = None

for depth in param_grid["depth"]:
    for lr in param_grid["learning_rate"]:
        for l2 in param_grid["l2_leaf_reg"]:
            for it in param_grid["iterations"]:

                model = CatBoostClassifier(
                    loss_function="MultiClass",
                    eval_metric="Accuracy",
                    depth=depth,
                    learning_rate=lr,
                    l2_leaf_reg=l2,
                    iterations=it,
                    random_state=42,
                    cat_features=cat_feature_indices,
                    verbose=False
                )

                model.fit(X_train, y_train, verbose=False)

                preds = model.predict(X_test).reshape(-1)
                acc = accuracy_score(y_test, preds)

                print(f"depth={depth}, lr={lr}, l2={l2}, it={it} → acc={acc:.3f}")

                if acc > best_score:
                    best_score = acc
                    best_params = (depth, lr, l2, it)

print("\nBEST PARAMS:", best_params)
print("BEST ACC:", best_score)


depth=6, lr=0.03, l2=3, it=300 → acc=0.262
depth=6, lr=0.03, l2=3, it=500 → acc=0.271
depth=6, lr=0.03, l2=3, it=700 → acc=0.256
depth=6, lr=0.03, l2=10, it=300 → acc=0.250
depth=6, lr=0.03, l2=10, it=500 → acc=0.256


In [None]:
# =========================
# Hücre 6 – Dört öğün için modeli eğit
# =========================

results_cat = {}

results_cat["breakfast"] = train_catboost_meal(
    df=df,
    target_col="Breakfast Suggestion",
    feature_cols=feature_cols,
    cat_cols=cat_cols,
    min_count=1,
    random_state=42,
    model_name="cat_breakfast"
)

results_cat["lunch"] = train_catboost_meal(
    df=df,
    target_col="Lunch Suggestion",
    feature_cols=feature_cols,
    cat_cols=cat_cols,
    min_count=1,
    random_state=42,
    model_name="cat_lunch"
)

results_cat["dinner"] = train_catboost_meal(
    df=df,
    target_col="Dinner Suggestion",
    feature_cols=feature_cols,
    cat_cols=cat_cols,
    min_count=1,
    random_state=42,
    model_name="cat_dinner"
)

results_cat["snack"] = train_catboost_meal(
    df=df,
    target_col="Snack Suggestion",
    feature_cols=feature_cols,
    cat_cols=cat_cols,
    min_count=1,
    random_state=42,
    model_name="cat_snack"
)

results_cat
