In [19]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

In [26]:
# EXTRACT: Load the recipe dataset and draw a reproducible random sample.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists. Consider a project-relative data directory.
file_path = r"c:\Users\FRBN6849\Downloads\Projet-ETL-main\Projet-ETL-main\data\raw\recipe_sample.csv"
df = pd.read_csv(file_path)
print(f"Original dataset: {len(df)} rows, {len(df.columns)} columns")
print(f"Columns: {df.columns.tolist()}")

# Sample up to SAMPLE_SIZE random rows (or every row if the file is smaller).
# DataFrame.sample raises ValueError when n exceeds the number of rows and
# replace=False, so the request is capped at the rows actually available.
SAMPLE_SIZE = 20000
df_sample = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=42)
print(f"Sample extracted: {len(df_sample)} rows")
print(df_sample.head())

Original dataset: 20000 rows, 7 columns
Columns: ['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source', 'NER']
Sample extracted: 20000 rows
       Unnamed: 0                                  title  \
10650       16393           Parmesan Chicken(Serves 8)     
2041      1273074                           Garlic Soup    
8668      2190185            Einar's Marinated Mushrooms   
1114      1788278  Quick, Easy, Delicious Spritz Cookies   
13902      533951                Sausage And Green Beans   

                                             ingredients  \
10650  ["8 chicken breasts, skinned, boned and pounde...   
2041   ["2 c. water", "2 cloves garlic, pressed or mi...   
8668   ["1-1/2 lb. whole mushrooms", "1-2/3 tbsp. oli...   
1114   ["50 grams Margarine (butter)", "40 grams Powd...   
13902  ["1/2 lb. German sausage", "1/2 lb. Italian sa...   

                                              directions  \
10650  ["Dip each breast in a combination of eggs and...   
20

In [27]:
# TRANSFORM: Discard the 'Unnamed: 0' column (presumably a saved index) if present.
# errors="ignore" turns the drop into a no-op when the column is already gone,
# so re-running this cell does not fail.
df_sample = df_sample.drop(columns=["Unnamed: 0"], errors="ignore")

remaining_columns = df_sample.columns.tolist()
print(f"Final columns: {remaining_columns}")
print(f"Final shape: {df_sample.shape}")

Final columns: ['title', 'ingredients', 'directions', 'link', 'source', 'NER']
Final shape: (20000, 6)


In [28]:
# TRANSFORM: Report missing values, both per column and per row.
# The boolean null mask is computed once and reused for both summaries.
null_mask = df_sample.isnull()
print("Null values per column:")
print(null_mask.sum())
print(f"\nTotal rows with at least one null: {null_mask.any(axis=1).sum()}")

Null values per column:
title          0
ingredients    0
directions     0
link           0
source         0
NER            0
dtype: int64

Total rows with at least one null: 0


In [29]:
# TRANSFORM: Keep only the rows in which every column is populated.
df_no_nulls = df_sample.dropna(axis=0, how="any")
null_rows_removed = len(df_sample) - len(df_no_nulls)
print(f"Rows after removing any nulls: {len(df_no_nulls)}")
print(f"Removed {null_rows_removed} rows with null values")

Rows after removing any nulls: 20000
Removed 0 rows with null values


In [30]:
# TRANSFORM: Drop rows that are identical across all columns,
# keeping the first occurrence of each.
df_clean = df_no_nulls.drop_duplicates(keep="first")
duplicates_removed = len(df_no_nulls) - len(df_clean)
print(f"Rows after removing duplicates: {len(df_clean)}")
print(f"Removed {duplicates_removed} duplicate rows")

Rows after removing duplicates: 20000
Removed 0 duplicate rows


In [31]:
# Preview the first ten rows of the cleaned data.
print("Final clean data sample:")
preview_rows = df_clean.head(10)
preview_rows  # bare last expression so the notebook renders the frame

Final clean data sample:


Unnamed: 0,title,ingredients,directions,link,source,NER
10650,Parmesan Chicken(Serves 8),"[""8 chicken breasts, skinned, boned and pounde...","[""Dip each breast in a combination of eggs and...",www.cookbooks.com/Recipe-Details.aspx?id=84167,Gathered,"[""chicken breasts"", ""eggs"", ""milk"", ""bread cru..."
2041,Garlic Soup,"[""2 c. water"", ""2 cloves garlic, pressed or mi...","[""Place water, garlic, salt, thyme, and pasta ...",www.epicurious.com/recipes/member/views/garlic...,Gathered,"[""water"", ""garlic"", ""thyme"", ""salt"", ""multi co..."
8668,Einar's Marinated Mushrooms,"[""1-1/2 lb. whole mushrooms"", ""1-2/3 tbsp. oli...","[""Put all ingredients into a sauce pan; stir.""...",www.foodgeeks.com/recipes/6375,Recipes1M,"[""mushrooms"", ""olive oil"", ""fennel seeds"", ""co..."
1114,"Quick, Easy, Delicious Spritz Cookies","[""50 grams Margarine (butter)"", ""40 grams Powd...","[""Preparation: Sift the ingredients from toget...",cookpad.com/us/recipes/145240-quick-easy-delic...,Recipes1M,"[""Margarine"", ""Powdered sugar"", ""flour"", ""star..."
13902,Sausage And Green Beans,"[""1/2 lb. German sausage"", ""1/2 lb. Italian sa...","[""Cut sausages into bite-sized pieces and brow...",www.cookbooks.com/Recipe-Details.aspx?id=956338,Gathered,"[""sausage"", ""Italian sausage"", ""onions"", ""fres..."
11963,Peanut Butter Crispy Rice Treats,"[""6 tablespoons unsalted butter, plus more for...","[""Lightly butter a 13- by 9-inch baking dish.""...",www.foodnetwork.com/recipes/food-network-kitch...,Recipes1M,"[""unsalted butter"", ""marshmallows"", ""smooth pe..."
11072,Chocolate Cake,"[""2 c. all-purpose flour"", ""1 c. water"", ""1 st...","[""Aerate the flour and sugar in a large bowl.""...",www.cookbooks.com/Recipe-Details.aspx?id=773865,Gathered,"[""flour"", ""water"", ""margarine"", ""eggs"", ""bakin..."
3002,Egg Salad With A Twist,"[""4 large eggs"", ""2 tablespoons green onions"",...","[""For best results in making the eggs easier t...",www.food.com/recipe/egg-salad-with-a-twist-97575,Gathered,"[""eggs"", ""green onions"", ""green olives"", ""mayo..."
19771,Slow Cooker Scalloped Potatoes with Ham,"[""3 pounds potatoes, peeled and thinly sliced""...","[""Place sliced potatoes in slow cooker."", ""In ...",allrecipes.com/recipe/slow-cooker-scalloped-po...,Recipes1M,"[""potatoes"", ""Cheddar cheese"", ""onion"", ""ham"",..."
8115,Frosted Peanut Butter Brownies,"[""1 1/2 cups butter or 1 1/2 cups margarine, d...","[""Cook 1 cup butter and 1/3 cup cocoa in a sau...",www.food.com/recipe/frosted-peanut-butter-brow...,Gathered,"[""butter"", ""cocoa"", ""sugar"", ""flour"", ""salt"", ..."


In [33]:
# LOAD: Write the cleaned frame to CSV, omitting the DataFrame index.
# The relative path resolves against the kernel's current working directory.
output_path = "recipe_cleaned.csv"
df_clean.to_csv(path_or_buf=output_path, index=False)

final_shape = df_clean.shape
print(f"Clean data saved to: {output_path}")
print(f"Final shape: {final_shape}")
print("ETL Complete!")

Clean data saved to: recipe_cleaned.csv
Final shape: (20000, 6)
ETL Complete!
