1. Affichage des données

In [4]:
# === Imports & configuration ===
from pathlib import Path
import pandas as pd
from pprint import pprint
from IPython.display import display

# Pour un affichage plus lisible
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 160)

# Définir le dossier des données (relatif au notebook)
DATA_DIR = Path("Data")
FILE = "RAW_recipes.csv"
FILEPATH = DATA_DIR / FILE

# Vérification du chemin
if not FILEPATH.exists():
    raise FileNotFoundError(f"Fichier introuvable : {FILEPATH.resolve()}")

# === Chargement du fichier ===
df_users = pd.read_csv(FILEPATH, low_memory=False)

# === Infos générales ===
print(f"Fichier : {FILE}")
print(f"Shape : {df_users.shape[0]} lignes × {df_users.shape[1]} colonnes\n")

print("Colonnes :")
pprint(list(df_users.columns))

print("\nTypes de données :")
print(df_users.dtypes)

# === Aperçu des premières lignes ===
display(df_users.head(20))

# === Stats rapides sur quelques colonnes clés ===
for col in ["user_id", "name", "n_recipes", "n_reviews", "n_interactions"]:
    if col in df_users.columns:
        print(f"\nValue counts pour '{col}' (top 10) :")
        print(df_users[col].value_counts(dropna=False).head(10))

Fichier : RAW_recipes.csv
Shape : 231637 lignes × 12 colonnes

Colonnes :
['name',
 'id',
 'minutes',
 'contributor_id',
 'submitted',
 'tags',
 'nutrition',
 'n_steps',
 'steps',
 'description',
 'ingredients',
 'n_ingredients']

Types de données :
name              object
id                 int64
minutes            int64
contributor_id     int64
submitted         object
tags              object
nutrition         object
n_steps            int64
steps             object
description       object
ingredients       object
n_ingredients      int64
dtype: object


Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7
1,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6
2,all in the kitchen chili,112140,130,196586,2005-02-25,"['time-to-make', 'course', 'preparation', 'mai...","[269.8, 22.0, 32.0, 48.0, 39.0, 27.0, 5.0]",6,"['brown ground beef in large pot', 'add choppe...",this modified version of 'mom's' chili was a h...,"['ground beef', 'yellow onions', 'diced tomato...",13
3,alouette potatoes,59389,45,68585,2003-04-14,"['60-minutes-or-less', 'time-to-make', 'course...","[368.1, 17.0, 10.0, 2.0, 14.0, 8.0, 20.0]",11,['place potatoes in a large pot of lightly sal...,"this is a super easy, great tasting, make ahea...","['spreadable cheese with garlic and herbs', 'n...",11
4,amish tomato ketchup for canning,44061,190,41706,2002-10-25,"['weeknight', 'time-to-make', 'course', 'main-...","[352.9, 1.0, 337.0, 23.0, 3.0, 0.0, 28.0]",5,['mix all ingredients& boil for 2 1 / 2 hours ...,my dh's amish mother raised him on this recipe...,"['tomato juice', 'apple cider vinegar', 'sugar...",8
5,apple a day milk shake,5289,0,1533,1999-12-06,"['15-minutes-or-less', 'time-to-make', 'course...","[160.2, 10.0, 55.0, 3.0, 9.0, 20.0, 7.0]",4,"['combine ingredients in blender', 'cover and ...",,"['milk', 'vanilla ice cream', 'frozen apple ju...",4
6,aww marinated olives,25274,15,21730,2002-04-14,"['15-minutes-or-less', 'time-to-make', 'course...","[380.7, 53.0, 7.0, 24.0, 6.0, 24.0, 6.0]",4,['toast the fennel seeds and lightly crush the...,my italian mil was thoroughly impressed by my ...,"['fennel seeds', 'green olives', 'ripe olives'...",9
7,backyard style barbecued ribs,67888,120,10404,2003-07-30,"['weeknight', 'time-to-make', 'course', 'main-...","[1109.5, 83.0, 378.0, 275.0, 96.0, 86.0, 36.0]",10,['in a medium saucepan combine all the ingredi...,this recipe is posted by request and was origi...,"['pork spareribs', 'soy sauce', 'fresh garlic'...",22
8,bananas 4 ice cream pie,70971,180,102353,2003-09-10,"['weeknight', 'time-to-make', 'course', 'main-...","[4270.8, 254.0, 1306.0, 111.0, 127.0, 431.0, 2...",8,"['crumble cookies into a 9-inch pie plate , or...",,"['chocolate sandwich style cookies', 'chocolat...",6
9,beat this banana bread,75452,70,15892,2003-11-04,"['weeknight', 'time-to-make', 'course', 'main-...","[2669.3, 160.0, 976.0, 107.0, 62.0, 310.0, 138.0]",12,"['preheat oven to 350 degrees', 'butter two 9x...",from ann hodgman's,"['sugar', 'unsalted butter', 'bananas', 'eggs'...",9



Value counts pour 'name' (top 10) :
name
brown sugar frosting                            3
broccoli cauliflower soup                       3
banana oatmeal chocolate chip cookies           3
banana chocolate chip muffins                   3
tex mex chicken and rice                        3
cream cheese banana nut bread                   3
peanut butter oatmeal chocolate chip cookies    3
crock pot lemon garlic chicken                  3
broccoli cheese soup                            3
three bean chili                                3
Name: count, dtype: int64


In [10]:
# Pour un affichage plus lisible
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 160)

# Définir le dossier des données (relatif au notebook)
DATA_DIR = Path("Data")
FILE = "PP_recipes.csv"
FILEPATH = DATA_DIR / FILE

# Vérification du chemin
if not FILEPATH.exists():
    raise FileNotFoundError(f"Fichier introuvable : {FILEPATH.resolve()}")

# === Chargement du fichier ===
df_users = pd.read_csv(FILEPATH, low_memory=False)

# === Infos générales ===
print(f"Fichier : {FILE}")
print(f"Shape : {df_users.shape[0]} lignes × {df_users.shape[1]} colonnes\n")

print("Colonnes :")
pprint(list(df_users.columns))

print("\nTypes de données :")
print(df_users.dtypes)

# === Aperçu des premières lignes ===
display(df_users.head(20))

# === Stats rapides sur quelques colonnes clés ===
for col in ["user_id", "name", "n_recipes", "n_reviews", "n_interactions"]:
    if col in df_users.columns:
        print(f"\nValue counts pour '{col}' (top 10) :")
        print(df_users[col].value_counts(dropna=False).head(10))


Fichier : PP_recipes.csv
Shape : 178265 lignes × 8 colonnes

Colonnes :
['id',
 'i',
 'name_tokens',
 'ingredient_tokens',
 'steps_tokens',
 'techniques',
 'calorie_level',
 'ingredient_ids']

Types de données :
id                    int64
i                     int64
name_tokens          object
ingredient_tokens    object
steps_tokens         object
techniques           object
calorie_level         int64
ingredient_ids       object
dtype: object


Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]"
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,..."
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696..."
3,74301,168258,"[40480, 10025, 31156, 40481]","[[1270, 1645, 28447], [21601], [27952, 29471, ...","[40480, 40482, 5539, 21601, 1073, 903, 2324, 4...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[7940, 3609, 7060, 6265, 1170, 6654, 5003, 3561]"
4,76272,109030,"[40480, 17841, 252, 782, 2373, 1641, 2373, 252...","[[1430, 11434], [1430, 17027], [1615, 23, 695,...","[40480, 40482, 14046, 1430, 11434, 488, 17027,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[3484, 6324, 7594, 243]"
5,465171,111231,"[40480, 3390, 829, 35873, 7047, 13731, 2640, 1...","[[13731, 30684, 260, 245, 17843, 25592, 10601]...","[40480, 40482, 7087, 13731, 30684, 260, 245, 5...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,"[6861, 7655, 6846, 6906, 1789, 131, 6863, 1833..."
6,163861,85356,"[40480, 1966, 488, 5218, 252, 5867, 10994, 118...","[[31801, 12395, 25808], [17918], [6953], [1133...","[40480, 40482, 604, 704, 15110, 244, 15684, 24...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2,"[5574, 2683, 6270, 5319, 2499, 869, 1278, 4987..."
7,186383,105140,"[40480, 5317, 7, 491, 11274, 5639, 40481]","[[17918], [25916], [15473, 8361], [15473, 1016...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 332, 335, 6270, 800, 4987, 7470, ..."
8,116395,8671,"[40480, 16190, 13249, 4914, 5639, 40481]","[[17918], [36374, 3388, 650, 256, 6444], [2361...","[40480, 40482, 19093, 271, 40478, 40482, 23667...","[1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",1,"[2683, 1689, 5687, 1098, 840, 7782, 7011, 1910..."
9,303460,160334,"[40480, 1287, 7912, 504, 22118, 19276, 831, 47...","[[559, 1164, 6020], [511, 532, 543, 241], [664...","[40480, 40482, 14259, 1055, 11, 4364, 488, 827...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,"[6413, 7997, 3148, 3710, 1799, 2007, 3203, 265..."


In [11]:
# Pour un affichage plus lisible
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 160)

# Définir le dossier des données (relatif au notebook)
DATA_DIR = Path("Data")
FILE = "RAW_interactions.csv"
FILEPATH = DATA_DIR / FILE

# Vérification du chemin
if not FILEPATH.exists():
    raise FileNotFoundError(f"Fichier introuvable : {FILEPATH.resolve()}")

# === Chargement du fichier ===
df_users = pd.read_csv(FILEPATH, low_memory=False)

# === Infos générales ===
print(f"Fichier : {FILE}")
print(f"Shape : {df_users.shape[0]} lignes × {df_users.shape[1]} colonnes\n")

print("Colonnes :")
pprint(list(df_users.columns))

print("\nTypes de données :")
print(df_users.dtypes)

# === Aperçu des premières lignes ===
display(df_users.head(20))

# === Stats rapides sur quelques colonnes clés ===
for col in ["user_id", "name", "n_recipes", "n_reviews", "n_interactions"]:
    if col in df_users.columns:
        print(f"\nValue counts pour '{col}' (top 10) :")
        print(df_users[col].value_counts(dropna=False).head(10))


Fichier : RAW_interactions.csv
Shape : 1132367 lignes × 5 colonnes

Colonnes :
['user_id', 'recipe_id', 'date', 'rating', 'review']

Types de données :
user_id       int64
recipe_id     int64
date         object
rating        int64
review       object
dtype: object


Unnamed: 0,user_id,recipe_id,date,rating,review
0,38094,40893,2003-02-17,4,Great with a salad. Cooked on top of stove for...
1,1293707,40893,2011-12-21,5,"So simple, so delicious! Great for chilly fall..."
2,8937,44394,2002-12-01,4,This worked very well and is EASY. I used not...
3,126440,85009,2010-02-27,5,I made the Mexican topping and took it to bunk...
4,57222,85009,2011-10-01,5,"Made the cheddar bacon topping, adding a sprin..."
5,52282,120345,2005-05-21,4,very very sweet. after i waited the 2 days i b...
6,124416,120345,2011-08-06,0,"Just an observation, so I will not rate. I fo..."
7,2000192946,120345,2015-05-10,2,This recipe was OVERLY too sweet. I would sta...
8,76535,134728,2005-09-02,4,Very good!
9,273745,134728,2005-12-22,5,Better than the real!!



Value counts pour 'user_id' (top 10) :
user_id
424680    7671
37449     5603
383346    4628
169430    4076
128473    3917
89831     3353
58104     3288
133174    3107
199848    3018
305531    2902
Name: count, dtype: int64
