In [12]:
# === Imports & configuration ===
from pathlib import Path
import pandas as pd
from pprint import pprint
from IPython.display import display

# Pour un affichage plus lisible
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 160)

# Définir le dossier des données (relatif au notebook)
DATA_DIR = Path("Data")
FILE = "RAW_recipes.csv"
FILEPATH = DATA_DIR / FILE

# Vérification du chemin
if not FILEPATH.exists():
    raise FileNotFoundError(f"Fichier introuvable : {FILEPATH.resolve()}")

# === Chargement du fichier ===
df_reciês = pd.read_csv(FILEPATH, low_memory=False)

# === Infos générales ===
print(f"Fichier : {FILE}")
print(f"Shape : {df_recipes.shape[0]} lignes × {df_recipes.shape[1]} colonnes\n")

print("Colonnes :")
print(list(df_recipes.columns))

print("\nTypes de données :")
print(df_recipes.dtypes)

# === Aperçu des premières lignes ===
display(df_recipes.head(20))

# === Stats rapides sur quelques colonnes clés ===
for col in ["user_id", "name", "n_recipes", "n_reviews", "n_interactions"]:
    if col in df_recipes.columns:
        print(f"\nValue counts pour '{col}' (top 10) :")
        print(df_recipes[col].value_counts(dropna=False).head(10))

Fichier : RAW_recipes.csv
Shape : 231637 lignes × 12 colonnes

Colonnes :
['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags', 'nutrition', 'n_steps', 'steps', 'description', 'ingredients', 'n_ingredients']

Types de données :
name              object
id                 int64
minutes            int64
contributor_id     int64
submitted         object
tags              object
nutrition         object
n_steps            int64
steps             object
description       object
ingredients       object
n_ingredients      int64
dtype: object


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8
5,apple a day milk shake,5289,0,1533,1999-12-06,"['15-minutes-or-less', 'time-to-make', 'course...","[160.2, 10.0, 55.0, 3.0, 9.0, 20.0, 7.0]",4,"['combine ingredients in blender', 'cover and ...",,"['milk', 'vanilla ice cream', 'frozen apple ju...",4
6,aww marinated olives,25274,15,21730,2002-04-14,"['15-minutes-or-less', 'time-to-make', 'course...","[380.7, 53.0, 7.0, 24.0, 6.0, 24.0, 6.0]",4,['toast the fennel seeds and lightly crush the...,my italian mil was thoroughly impressed by my ...,"['fennel seeds', 'green olives', 'ripe olives'...",9
7,backyard style barbecued ribs,67888,120,10404,2003-07-30,"['weeknight', 'time-to-make', 'course', 'main-...","[1109.5, 83.0, 378.0, 275.0, 96.0, 86.0, 36.0]",10,['in a medium saucepan combine all the ingredi...,this recipe is posted by request and was origi...,"['pork spareribs', 'soy sauce', 'fresh garlic'...",22
8,bananas 4 ice cream pie,70971,180,102353,2003-09-10,"['weeknight', 'time-to-make', 'course', 'main-...","[4270.8, 254.0, 1306.0, 111.0, 127.0, 431.0, 2...",8,"['crumble cookies into a 9-inch pie plate , or...",,"['chocolate sandwich style cookies', 'chocolat...",6
9,beat this banana bread,75452,70,15892,2003-11-04,"['weeknight', 'time-to-make', 'course', 'main-...","[2669.3, 160.0, 976.0, 107.0, 62.0, 310.0, 138.0]",12,"['preheat oven to 350 degrees', 'butter two 9x...",from ann hodgman's,"['sugar', 'unsalted butter', 'bananas', 'eggs'...",9



Value counts pour 'name' (top 10) :
name
brown sugar frosting                            3
broccoli cauliflower soup                       3
banana oatmeal chocolate chip cookies           3
banana chocolate chip muffins                   3
tex mex chicken and rice                        3
cream cheese banana nut bread                   3
peanut butter oatmeal chocolate chip cookies    3
crock pot lemon garlic chicken                  3
broccoli cheese soup                            3
three bean chili                                3
Name: count, dtype: int64


In [8]:
FILE = "RAW_interactions.csv"
FILEPATH = DATA_DIR / FILE

# === Chargement du fichier ===
df_interactions = pd.read_csv(FILEPATH, low_memory=False)

# === Infos générales ===
print(f"Fichier : {FILE}")
print(f"Shape : {df_interactions.shape[0]} lignes × {df_interactions.shape[1]} colonnes\n")

print("Colonnes :")
print(list(df_interactions.columns))

print("\nTypes de données :")
print(df_interactions.dtypes)

# === Aperçu des premières lignes ===
display(df_interactions.head(20))

Fichier : RAW_interactions.csv
Shape : 231637 lignes × 12 colonnes

Colonnes :
['user_id', 'recipe_id', 'date', 'rating', 'review']

Types de données :
user_id       int64
recipe_id     int64
date         object
rating        int64
review       object
dtype: object


Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."
5,52282,120345,2005-05-21,4,very very sweet. after i waited the 2 days i b...
6,124416,120345,2011-08-06,0,"Just an observation, so I will not rate. I fo..."
7,2000192946,120345,2015-05-10,2,This recipe was OVERLY too sweet. I would sta...
8,76535,134728,2005-09-02,4,Very good!
9,273745,134728,2005-12-22,5,Better than the real!!


In [10]:
len(set(df_interactions['user_id']))

226570

In [13]:
len(set(df_recipes['contributor_id']))

27926

In [62]:
# Normalization of recipes and users id's
df_recipes['user_id_normalized'], uniques = pd.factorize(df_recipes['contributor_id'])
df_recipes['user_id_normalized'] += 1

df_recipes['recipe_id_normalized'], uniques = pd.factorize(df_recipes['id'])
df_recipes['recipe_id_normalized'] += 1

df_interactions['user_id_normalized'], uniques = pd.factorize(df_interactions['user_id'])
df_interactions['user_id_normalized'] += 1

# Sorting the dataframes by the new normalized id's
df_recipes = df_recipes.sort_values(by='recipe_id_normalized').reset_index(drop=True)
df_interactions = df_interactions.sort_values(by='user_id_normalized').reset_index(drop=True)

# Filling the missing reviews and descriptions
df_interactions['review'] = df_interactions['review'].fillna('No review was given.')
df_recipes['description'] = df_recipes['description'].fillna('No description was given.')

In [23]:
df_recipes

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,user_id_normalized,recipe_id_normalized
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,1,1
1,rumford s baking powder biscuits,441527,22,47892,2010-11-09,"['30-minutes-or-less', 'time-to-make', 'course...","[212.5, 16.0, 0.0, 18.0, 7.0, 14.0, 8.0]",10,"['preheat oven to 450 degrees', 'sift flour , ...",an oldie but a goodie! printed on the back of ...,"['unbleached white flour', 'baking powder', 's...",5,1,2
2,tomato and grilled corn salad with avocado,186074,15,47892,2006-09-13,"['15-minutes-or-less', 'time-to-make', 'course...","[169.7, 9.0, 21.0, 1.0, 12.0, 4.0, 9.0]",10,['arrange the tomato slices artfully on a larg...,a layered salad as lovely to look at as it is ...,"['tomatoes', 'yellow cherry tomatoes', 'corn',...",13,1,3
3,badda bing pizza with pesto esque sauce,139455,45,47892,2005-09-29,"['60-minutes-or-less', 'time-to-make', 'course...","[300.0, 38.0, 16.0, 18.0, 18.0, 26.0, 4.0]",17,['prepare the pizza dough as indicated in reci...,"same old story: home from the farmers' market,...","['pizza crust', 'onion', 'sliced mushrooms', '...",13,1,4
4,potato watercress soup,202706,30,47892,2006-12-31,"['30-minutes-or-less', 'time-to-make', 'course...","[209.3, 20.0, 6.0, 13.0, 10.0, 40.0, 6.0]",8,['melt the butter in a large saucepan over med...,this one's been in my recipe file for years.\r...,"['butter', 'watercress', 'yukon gold potatoes'...",10,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231632,zucchini blueberry muffins,183345,45,277129,2006-08-27,"['60-minutes-or-less', 'time-to-make', 'course...","[103.5, 1.0, 37.0, 7.0, 5.0, 0.0, 7.0]",9,['preheat oven to 325 degrees f and grease a m...,moist and extremely healthy muffins!,"['zucchini', 'applesauce', 'old fashioned oats...",15,27922,231633
231633,zucchini onion salad,66897,18,92725,2003-07-16,"['30-minutes-or-less', 'time-to-make', 'course...","[231.1, 28.0, 27.0, 8.0, 8.0, 56.0, 5.0]",10,"['grate zucchini', 'dice onion', 'over medium ...","i wrinkled my nose at this in the beginning, t...","['onion', 'sour cream', 'butter', 'fresh groun...",7,27923,231634
231634,zuke muffins,386947,30,396497,2009-08-24,"['30-minutes-or-less', 'time-to-make', 'course...","[127.1, 3.0, 49.0, 7.0, 7.0, 2.0, 7.0]",16,"['1', ') preheat oven to 350', 'spray / line m...",delisious and healthy muffins,"['eggs', 'fat free greek yogurt', 'sugar', 'br...",11,27924,231635
231635,zuvers barbecue sauce,381216,30,763977,2009-07-13,"['30-minutes-or-less', 'time-to-make', 'course...","[595.5, 1.0, 527.0, 62.0, 5.0, 0.0, 50.0]",6,"['mix first 5 ingredients in sauce pan', 'turn...","this barbecue sauce is great on ribs, chicken ...","['molasses', 'ketchup', 'apple cider vinegar',...",12,27925,231636


In [49]:
def analyse_nutrition(nutrilist):
    
    if type(nutrilist) == str:
        nutrilist = list(map(float, nutrilist[1:-1].split(', ')))
        
    nutri = ['Number of calories', 'Total fat (PDV)', 'Sugar (PDV)', 'Sodium (PDV)', 'Protein (PDV)', 'Saturated fat (PDV)', 'Carbohydrates (PDV)']
    d = dict(zip(nutri, nutrilist))
    return d

In [50]:
analyse_nutrition(df_recipes[df_recipes['user_id_normalized'] == 1]['nutrition'][2])

{'Number of calories': 169.7,
 'Total fat (PDV)': 9.0,
 'Sugar (PDV)': 21.0,
 'Sodium (PDV)': 1.0,
 'Protein (PDV)': 12.0,
 'Saturated fat (PDV)': 4.0,
 'Carbohydrates (PDV)': 9.0}

In [61]:
df_recipes[df_recipes.isnull().any(axis=1)]

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,user_id_normalized,recipe_id_normalized
57780,,368257,10,779451,2009-04-27,"['15-minutes-or-less', 'time-to-make', 'course...","[1596.2, 249.0, 155.0, 0.0, 2.0, 112.0, 14.0]",6,"['in a bowl , combine ingredients except for o...",-------------,"['lemon', 'honey', 'horseradish mustard', 'gar...",10,565,57781
