In [2]:
# Mount Google Drive into the Colab runtime at /content/drive; the CSV files
# read later in this notebook are addressed through that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import numpy as np
import numpy.ma as ma
import pandas as pd

In [4]:
# Read only the first 150 rows of the user training data (comma-separated,
# which is pandas' default delimiter).
user_data = pd.read_csv('/content/drive/MyDrive/Bangkit/training_data.csv', nrows=150)

# Give every row a sequential, zero-padded identifier: UD0001, UD0002, ...
# (replaces the user_id column if the CSV already carries one).
user_count = len(user_data)
user_data = user_data.assign(user_id=[f"UD{i:04d}" for i in range(1, user_count + 1)])
user_data

Unnamed: 0,user_id,remaining_calories,remaining_carbo,remaining_protein,remaining_fat,remaining_cholesterol
0,UD0001,-491,-69.530,-24.765,-11.502,282
1,UD0002,814,120.062,42.164,17.856,-12
2,UD0003,190,22.552,7.790,1.247,-28
3,UD0004,1397,212.401,70.798,32.252,186
4,UD0005,-107,-20.050,-3.979,-1.821,129
...,...,...,...,...,...,...
145,UD0146,-220,-35.605,-13.039,-3.413,59
146,UD0147,1376,198.814,68.674,31.298,278
147,UD0148,773,127.194,39.114,16.107,272
148,UD0149,1483,294.876,36.383,16.031,-8


In [5]:
# Print the column dtypes and non-null counts of the loaded user table.
user_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   user_id                150 non-null    object 
 1   remaining_calories     150 non-null    int64  
 2   remaining_carbo        150 non-null    float64
 3   remaining_protein      150 non-null    float64
 4   remaining_fat          150 non-null    float64
 5   remaining_cholesterol  150 non-null    int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 7.2+ KB


In [6]:
# Raw food table exactly as stored on Drive (comma-separated, pandas' default).
food_datas = pd.read_csv('/content/drive/MyDrive/Bangkit/food_dataaa.csv')

# Working copy without the 'No.' and 'food' columns; food_id and the
# nutrient columns are kept.
food_data = food_datas.drop(['No.', 'food'], axis=1)
food_data

Unnamed: 0,food_id,Caloric Value,Fat,Saturated Fats,Carbohydrates,Protein,Cholesterol
0,FD0001,380,9.93,0.940,69.10,13.70,0
1,FD0002,105,0.63,0.090,21.60,2.60,0
2,FD0003,165,1.56,0.210,32.00,5.40,0
3,FD0004,168,0.57,0.100,37.10,2.50,0
4,FD0005,202,0.79,0.150,43.90,3.50,0
...,...,...,...,...,...,...,...
145,FD0146,132,7.41,1.386,10.90,7.14,78
146,FD0147,138,2.29,0.585,16.30,12.26,43
147,FD0148,168,12.40,7.584,3.33,11.94,35
148,FD0149,254,19.00,7.613,0.94,19.10,68


In [7]:
# Print the column dtypes and non-null counts of the trimmed food table.
food_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   food_id         150 non-null    object 
 1   Caloric Value   150 non-null    int64  
 2   Fat             150 non-null    float64
 3   Saturated Fats  150 non-null    float64
 4   Carbohydrates   150 non-null    float64
 5   Protein         150 non-null    float64
 6   Cholesterol     150 non-null    int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 8.3+ KB


In [8]:
def is_good_match(user_row, food_row, thresholds):
    """Tell whether a food is within tolerance of a user's remaining intake.

    A food matches when, for each of the five nutrients, the absolute
    difference between the user's remaining amount and the food's content is
    at most the corresponding tolerance.

    Args:
        user_row: Mapping/Series with 'remaining_calories', 'remaining_fat',
            'remaining_carbo', 'remaining_protein', 'remaining_cholesterol'.
        food_row: Mapping/Series with 'Caloric Value', 'Fat', 'Carbohydrates',
            'Protein', 'Cholesterol'.
        thresholds: Mapping with 'calories', 'fat', 'carbo', 'protein',
            'cholesterol' tolerances.

    Returns:
        A truthy value if every nutrient is within tolerance, otherwise the
        falsy result of the first comparison that failed.
    """
    # (user field, food field, thresholds key) -- evaluated in this order;
    # evaluation stops at the first nutrient that is out of tolerance.
    nutrient_checks = (
        ('remaining_calories', 'Caloric Value', 'calories'),
        ('remaining_fat', 'Fat', 'fat'),
        ('remaining_carbo', 'Carbohydrates', 'carbo'),
        ('remaining_protein', 'Protein', 'protein'),
        ('remaining_cholesterol', 'Cholesterol', 'cholesterol'),
    )

    within = True
    for user_key, food_key, limit_key in nutrient_checks:
        within = abs(user_row[user_key] - food_row[food_key]) <= thresholds[limit_key]
        if not within:
            return within
    return within

# Largest absolute gap, per nutrient, between a user's remaining amount and a
# food's content that is_good_match still accepts as a match.
thresholds = dict(
    calories=300,
    fat=10,
    carbo=50,
    protein=20,
    cholesterol=30,
)

def generate_pairs(user_data, food_data, thresholds, negative_ratio=3, random_state=None):
    """Build labelled (user, food) pairs for training.

    For each user, every food whose five nutrient values all lie within the
    per-nutrient tolerance of the user's remaining amounts becomes a positive
    pair (label 1). This is the same |user - food| <= tolerance test that
    is_good_match applies, evaluated for all foods at once. Up to
    ``negative_ratio`` foods are then drawn at random, without replacement,
    from the foods that did NOT match and are emitted as negative pairs
    (label 0), so a (user, food) pair never carries both labels.

    Args:
        user_data: DataFrame with 'user_id', 'remaining_calories',
            'remaining_fat', 'remaining_carbo', 'remaining_protein' and
            'remaining_cholesterol' columns.
        food_data: DataFrame with 'food_id', 'Caloric Value', 'Fat',
            'Carbohydrates', 'Protein' and 'Cholesterol' columns.
        thresholds: Mapping with 'calories', 'fat', 'carbo', 'protein' and
            'cholesterol' tolerances.
        negative_ratio: Maximum number of negative pairs per user. Fewer are
            produced when a user has fewer non-matching foods.
        random_state: Optional int seed or numpy Generator for the negative
            sampling. With None, sampling draws from NumPy's global random
            state, as DataFrame.sample does by default.

    Returns:
        DataFrame with columns 'user_id', 'food_id', 'user_features',
        'food_features', 'label'; per user, positives come first, followed by
        the sampled negatives. The columns are present even when no pair is
        produced.

    Raises:
        ValueError: If ``negative_ratio`` is negative.
    """
    if negative_ratio < 0:
        raise ValueError("negative_ratio must be >= 0")

    # (user column, food column, thresholds key), in the order the feature
    # lists are emitted.
    nutrient_map = [
        ('remaining_calories', 'Caloric Value', 'calories'),
        ('remaining_fat', 'Fat', 'fat'),
        ('remaining_carbo', 'Carbohydrates', 'carbo'),
        ('remaining_protein', 'Protein', 'protein'),
        ('remaining_cholesterol', 'Cholesterol', 'cholesterol'),
    ]
    user_cols = [user_col for user_col, _, _ in nutrient_map]
    food_cols = [food_col for _, food_col, _ in nutrient_map]

    # Loop-invariant: pull the food nutrient columns out once as arrays.
    food_values = {food_col: food_data[food_col].to_numpy() for food_col in food_cols}

    # One generator shared by all users, so a seed fixes the whole run without
    # making every user draw the same positions.
    rng = None if random_state is None else np.random.default_rng(random_state)

    def build_pair(user_row, user_features, food_row, label):
        # Each record gets its own feature lists (no shared mutable state).
        return {
            'user_id': user_row['user_id'],
            'food_id': food_row['food_id'],
            'user_features': list(user_features),
            'food_features': food_row[food_cols].tolist(),
            'label': label,
        }

    pairs = []
    for _, user_row in user_data.iterrows():
        user_features = user_row[user_cols].tolist()

        # True where the food is within tolerance on every nutrient.
        is_match = np.ones(len(food_data), dtype=bool)
        for user_col, food_col, key in nutrient_map:
            gap = np.abs(food_values[food_col] - user_row[user_col])
            is_match &= gap <= thresholds[key]

        for _, food_row in food_data[is_match].iterrows():
            pairs.append(build_pair(user_row, user_features, food_row, 1))

        # Negatives come only from foods that failed the match test; clamp the
        # count so a short candidate list cannot make sample() raise.
        candidates = food_data[~is_match]
        n_negatives = min(negative_ratio, len(candidates))
        if n_negatives:
            sampled_foods = candidates.sample(n=n_negatives, random_state=rng)
            for _, food_row in sampled_foods.iterrows():
                pairs.append(build_pair(user_row, user_features, food_row, 0))

    return pd.DataFrame(
        pairs,
        columns=['user_id', 'food_id', 'user_features', 'food_features', 'label'],
    )

In [9]:
# generate_pairs picks its negative examples with DataFrame.sample, which draws
# from NumPy's global random state when no random_state is given. Seed it so
# that Restart & Run All reproduces the same pairs.
np.random.seed(42)
pairs = generate_pairs(user_data, food_data, thresholds)

In [10]:
# Display the generated pairs table as the cell's output.
pairs

Unnamed: 0,user_id,food_id,user_features,food_features,label
0,UD0001,FD0144,"[-491, -11.502, -69.53, -24.765, 282]","[225, 14.82, 4.87, 19.54, 49]",0
1,UD0001,FD0057,"[-491, -11.502, -69.53, -24.765, 282]","[60, 0.24, 15.2, 1.0, 0]",0
2,UD0001,FD0054,"[-491, -11.502, -69.53, -24.765, 282]","[38, 0.0, 9.9, 0.6, 0]",0
3,UD0002,FD0115,"[814, 17.856, 120.062, 42.164, -12]","[248, 0.57, 59.3, 2.1, 0]",0
4,UD0002,FD0058,"[814, 17.856, 120.062, 42.164, -12]","[56, 0.0, 15.5, 0.2, 0]",0
...,...,...,...,...,...
1117,UD0149,FD0001,"[1483, 16.031, 294.876, 36.383, -8]","[380, 9.93, 69.1, 13.7, 0]",0
1118,UD0149,FD0063,"[1483, 16.031, 294.876, 36.383, -8]","[43, 0.15, 11.3, 0.3, 0]",0
1119,UD0150,FD0094,"[377, 8.976, 55.971, 16.778, -86]","[125, 2.17, 9.9, 15.13, 217]",0
1120,UD0150,FD0069,"[377, 8.976, 55.971, 16.778, -86]","[59, 0.02, 15.7, 0.4, 0]",0


In [11]:
# Write the pairs to user_food_pairs.csv (relative to the current working
# directory), omitting the DataFrame index.
# NOTE(review): user_features / food_features hold Python lists, which a CSV
# stores as their text form -- presumably whoever reads this file has to parse
# them back into lists; confirm with the consumer.
pairs.to_csv("user_food_pairs.csv", index=False)
print("Generated pairs saved to 'user_food_pairs.csv'")

Generated pairs saved to 'user_food_pairs.csv'
