In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

In [None]:
# Read the two raw CSV exports: one table of recipes, one of user interactions.
recipes_df = pd.read_csv('../data/raw/RAW_recipes.csv')
interactions_df = pd.read_csv('../data/raw/RAW_interactions.csv')

# Left join keyed on recipes.id == interactions.recipe_id: every recipe row is
# kept and repeated once for each interaction that references it.
recipes_and_interactions_df = pd.merge(
    recipes_df,
    interactions_df,
    how='left',
    left_on='id',
    right_on='recipe_id',
)

In [3]:
# Report (rows, columns) for the two source frames and for the merged frame,
# one per line, so the effect of the join on the row count is visible.
print(
    f"Recipes shape: {recipes_df.shape}",
    f"Interactions shape: {interactions_df.shape}",
    f"Merged recipes and interactions shape: {recipes_and_interactions_df.shape}",
    sep="\n",
)

Recipes shape: (231637, 12)
Interactions shape: (1132367, 5)
Merged recipes and interactions shape: (1132367, 17)


In [4]:
# Display the dtype pandas assigned to each column of the merged frame.
recipes_and_interactions_df.dtypes

name              object
id                 int64
minutes            int64
contributor_id     int64
submitted         object
tags              object
nutrition         object
n_steps            int64
steps             object
description       object
ingredients       object
n_ingredients      int64
user_id            int64
recipe_id          int64
date              object
rating             int64
review            object
dtype: object

In [5]:
# Preview the first rows of the merged frame (one row per recipe/interaction pair).
recipes_and_interactions_df.head()

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients,user_id,recipe_id,date,rating,review
0,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,4470,137739,2006-02-18,5,I used an acorn squash and recipe#137681 Swee...
1,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,593927,137739,2010-08-21,5,This was a nice change. I used butternut squas...
2,arriba baked winter squash mexican style,137739,55,47892,2005-09-16,"['60-minutes-or-less', 'time-to-make', 'course...","[51.5, 0.0, 13.0, 0.0, 2.0, 0.0, 4.0]",11,"['make a choice and proceed with recipe', 'dep...",autumn is my favorite time of year to cook! th...,"['winter squash', 'mexican seasoning', 'mixed ...",7,178427,137739,2011-12-05,5,Excellent recipe! I used butternut squash and ...
3,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,28603,31490,2002-08-19,0,"Have not tried this, but it sounds delicious. ..."
4,a bit different breakfast pizza,31490,30,26278,2002-06-17,"['30-minutes-or-less', 'time-to-make', 'course...","[173.4, 18.0, 0.0, 17.0, 22.0, 35.0, 1.0]",9,"['preheat oven to 425 degrees f', 'press dough...",this recipe calls for the crust to be prebaked...,"['prepared pizza crust', 'sausage patty', 'egg...",6,346277,31490,2006-08-27,5,This recipe was wonderful. Instead of using t...


### Data types observations

| Identifier (int64) | numeric (int64) | text        | date/time        | multi-values                  |
|--------------------|-----------------|-------------|------------------|-------------------------------|
| id                 | minutes         | name        | submitted        | nutrition (list of floats)    |
| recipe_id          | n_steps         | description | date (of review) | ingredients (list of strings) |
| user_id            | n_ingredients   | review      |                  | steps (list of strings)       |
| contributor_id     | rating          |             |                  | tags (list of strings)        |

- The `date` and `submitted` columns are of type `object`, which indicates that they are stored as strings. This could pose challenges for any temporal analysis or operations that require date manipulation. It would be beneficial to convert those columns to a datetime type for easier handling of date-related tasks.
- Multivalue columns should be expanded or exploded into multiple rows or columns.
  - `nutrition` is a list of floats (stored as a string) representing, in order: calories (#), total fat (PDV), sugar (PDV), sodium (PDV), protein (PDV), saturated fat, and carbohydrates (PDV), where PDV stands for percent daily value. We should expand it into 7 columns.
  - The other multi-value columns are lists of strings. `steps` and `ingredients` each have a companion column (`n_steps`, `n_ingredients`) which gives their length; `tags` has none. We should decide whether to keep them (by expanding them into separate rows) or not.

In [None]:
# Quantitative columns to summarise (identifier columns are deliberately left out).
numeric_cols = [
    'minutes',
    'n_ingredients',
    'n_steps',
    'rating',
]
# NOTE(review): the merged frame holds one row per interaction, so the
# recipe-level columns (minutes, n_ingredients, n_steps) are repeated once per
# review in these summary statistics.
recipes_and_interactions_df.loc[:, numeric_cols].describe()

Unnamed: 0,minutes,n_ingredients,n_steps,rating
count,1132367.0,1132367.0,1132367.0,1132367.0
mean,36146.84,8.952028,9.62797,4.411016
std,8796494.0,3.689908,5.819689,1.264752
min,0.0,1.0,0.0,0.0
25%,20.0,6.0,6.0,4.0
50%,40.0,9.0,8.0,5.0
75%,70.0,11.0,12.0,5.0
max,2147484000.0,43.0,145.0,5.0


: 

In [None]:
## Distribution of numeric columns
# Each histogram is restricted to values at or below the column's 99th
# percentile. `minutes` contains extreme outliers (max around 2.1e9 against a
# median of 40); automatic bin selection over that full range can ask for
# billions of bins, which is unreadable at best and can exhaust memory.
UPPER_QUANTILE = 0.99       # share of the data kept on the x-axis
MAX_DISCRETE_VALUES = 50    # at most this many distinct values -> one bar per value
N_BINS = 50                 # fixed bin count for wider-ranging columns

for col in numeric_cols:
    values = recipes_and_interactions_df[col].dropna()
    upper = values.quantile(UPPER_QUANTILE)
    shown = values[values <= upper]
    n_hidden = len(values) - len(shown)

    fig, ax = plt.subplots(figsize=(8, 4))
    if shown.nunique() <= MAX_DISCRETE_VALUES:
        # Small set of integer values (e.g. rating 0-5): draw one bar per
        # value and skip the KDE, which would suggest a continuity that the
        # data does not have.
        sns.histplot(shown, discrete=True, ax=ax)
    else:
        sns.histplot(shown, bins=N_BINS, kde=True, ax=ax)

    # Say in the title when the tail was cut so the figure is not misread.
    title = f'Distribution of {col}'
    if n_hidden:
        title += f' ({n_hidden:,} values above {upper:g} not shown)'
    ax.set_title(title)
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()
    # Release the figure explicitly; figures created in a loop otherwise pile up.
    plt.close(fig)

