# Data Cleaning


In [1]:
import pandas as pd
from matplotlib import pyplot as plt 
import numpy as np
import re

First, we load required data from reviews, and remove rows based on several criteria. 

In [2]:
# choosing only the columns that are needed. The column 'Review' contains the text that makes the dataframe heavy. So it is dropped.
reviews = pd.read_csv('../data/reviews.csv', usecols=['AuthorId', 'ReviewId', 'RecipeId', 'Rating'])
# Data analysis shows 'Rating'=0 is not neccesarily bad rating 
# (Refer to https://www.kaggle.com/code/gemmin/sentiment-analysis)
# Removing 0 ratings
reviews = reviews[reviews['Rating']!=0]
# Only reviews by active reviewers (selected_authors) are accepted, as the model will be based on active reviewers.
# Active reviewer is whoever has more than a minimum number of reviews in the data
review_counts = reviews.groupby('AuthorId').count()
selected_authors = review_counts.loc[review_counts['ReviewId']>=10].index.values
reviews = reviews.loc[reviews['AuthorId'].isin(selected_authors)]

print('number of selected reviews:', reviews['ReviewId'].nunique())
print('number of selected reviewers:', reviews['AuthorId'].nunique())
print('number of selected recipes:', reviews['RecipeId'].nunique())
 

number of selected reviews: 978879
number of selected reviewers: 14544
number of selected recipes: 238699


spliting train and test based on authors (the train and test datasets have different author lists)

In [23]:
import random
author_list = reviews['AuthorId'].unique()
random.shuffle(author_list)
split = len(author_list)//10
author_list_test, author_list_train = author_list[:split], author_list[split:]
train = reviews[reviews['AuthorId'].isin(author_list_train)]
test = reviews[reviews['AuthorId'].isin(author_list_test)]
train.to_csv('../data/reviews_train.csv')
test.to_csv('../data/reviews_test.csv')



creating train and test datasets

In [4]:
from sklearn.model_selection import train_test_split

reviews = reviews[['RecipeId', 'AuthorId', 'Rating']]
train, test = train_test_split(reviews, test_size=0.2)
train.to_csv('../data/reviews_train.csv')
test.to_csv('../data/reviews_test.csv')

Now the ratings are normalized based on the reviewer ratings statistics.

This normalization is not needed for KNN model of scikit-learn surprise library.


In [4]:
# substracting mean of reviewer's ratings
reviews['Rating'] -= reviews.groupby('AuthorId')['Rating'].transform(np.mean)
# deviding by std of reviewer's ratings
reviews['Rating'] /= reviews.groupby('AuthorId')['Rating'].transform(np.std)
reviews.describe()

Unnamed: 0,ReviewId,RecipeId,AuthorId,Rating
count,978879.0,978879.0,978879.0,943329.0
mean,708076.8,153753.910811,12647800.0,1.1087530000000001e-17
std,447018.5,128094.099809,154107400.0,0.9931838
min,7.0,38.0,1533.0,-34.65545
25%,326420.5,49047.0,102937.0,0.1066013
50%,667458.0,112362.0,227586.0,0.3495521
75%,1072974.0,233848.0,465056.0,0.5175935
max,2090332.0,541030.0,2002872000.0,3.015113


Sorting recipes based on the ratings and saving them

In [None]:
recipes = pd.read_csv('../data/recipes.csv')
recipes = recipes.sort_values('ReviewCount', ascending=False)
recipes.to_csv('../data/recipes.csv')

creating a selected list of popular recipes with high review rate

In [5]:
def str2url(s: str) -> str:
    'returns a list of strings breaking the original string by "" '
    return re.search(r'"(.*?)"', s).group(0)[1:-1]

In [2]:
recipes = pd.read_csv('../data/recipes.csv')
# selecting ~ 10000 recipes
recipes_selected = recipes[(recipes['ReviewCount']>20) & (recipes['AggregatedRating']>=4)]
# removing recipes with Nan category and keywords
recipes_selected = recipes_selected.dropna(subset=['RecipeCategory', 'Keywords'])

# removing recipes with no images
recipe_ids = []
for ind in recipes_selected.index:
    recipe = recipes_selected.loc[ind]
    if re.search(r'"(.*?)"', recipe['Images'])==None:
        recipe_ids.append(ind)
recipes_selected = recipes_selected.drop(recipe_ids)
recipes_selected['Image'] = recipes_selected['Images'].map(str2url)

#recipes_selected.to_csv('../data/recipes_selected.csv')

Creating a light version of selected recipes with needed columns for streamlit app

In [1]:
import pandas as pd

cols = ['RecipeId', 'Name', 'Description', 'Image', 'RecipeCategory', 'Keywords',
       'AggregatedRating', 'ReviewCount', 'RecipeInstructions']
recipes = pd.read_csv('../data/recipes_selected.csv', usecols=cols, index_col='RecipeId')
recipes
#recipes.to_csv('../data/recipes_selected_summarized.csv')


Unnamed: 0_level_0,Name,Description,RecipeCategory,Keywords,AggregatedRating,ReviewCount,RecipeInstructions,Image
RecipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
45809,Bourbon Chicken,I searched and finally found this recipe on th...,Chicken Breast,"c(""Chicken"", ""Poultry"", ""Meat"", ""Chinese"", ""As...",5.0,3063.0,"c(""Editor's Note: Named Bourbon Chicken becau...",https://img.sndimg.com/food/image/upload/w_555...
2886,Best Banana Bread,Make and share this Best Banana Bread recipe f...,Quick Breads,"c(""Breads"", ""Fruit"", ""Oven"", ""< 4 Hours"")",5.0,2273.0,"c(""Remove odd pots and pans from oven."", ""Preh...",https://img.sndimg.com/food/image/upload/w_555...
27208,To Die for Crock Pot Roast,"Amazing flavor, and so simple! No salt needed ...",One Dish Meal,"c(""Roast Beef"", ""Meat"", ""Kid Friendly"", ""Potlu...",5.0,1692.0,"c(""Place beef roast in crock pot."", ""Mix the d...",https://img.sndimg.com/food/image/upload/w_555...
89204,Crock-Pot Chicken With Black Beans &amp; Cream...,I love this Crock-Pot chicken recipe for two r...,One Dish Meal,"c(""Chicken Breast"", ""Chicken"", ""Poultry"", ""Cor...",4.5,1657.0,"c(""Take 4-5 frozen, yes, frozen, boneless chic...",https://img.sndimg.com/food/image/upload/w_555...
39087,Creamy Cajun Chicken Pasta,Make and share this Creamy Cajun Chicken Pasta...,Chicken Breast,"c(""Chicken"", ""Poultry"", ""Meat"", ""Cajun"", ""Kid ...",5.0,1586.0,"c(""Place chicken and Cajun seasoning in a bowl...",https://img.sndimg.com/food/image/upload/w_555...
...,...,...,...,...,...,...,...,...
150759,Deluxe Fried Eggs,A simple and delicious way to transform fried ...,Breakfast,"c(""Lunch/Snacks"", ""Onions"", ""Vegetable"", ""Aust...",5.0,21.0,"c(""Heat the oil and the lemon juice in a non-s...",https://img.sndimg.com/food/image/upload/w_555...
449167,Healthy Quinoa and Ground Turkey Stuffed Peppers,Make and share this Healthy Quinoa and Ground ...,Poultry,"c(""Vegetable"", ""Meat"", ""Weeknight"", ""< 4 Hours"")",5.0,21.0,"c(""Preheat oven to 350 degrees."", ""Combine qui...",https://img.sndimg.com/food/image/upload/w_555...
121399,Crock Pot Low Calorie Lemon Chicken,"A quick and easy, slow-cooked lemon chicken ma...",Chicken Breast,"c(""Chicken"", ""Poultry"", ""Vegetable"", ""Meat"", ""...",5.0,21.0,"c(""Coat the chicken with lemon-pepper seasonin...",https://img.sndimg.com/food/image/upload/w_555...
186785,Broccoli and Tortellini Salad,You can lighten this recipe up a little by usi...,Potluck,"""< 30 Mins""",5.0,21.0,"c(""Place bacon in a large, deep skillet. Cook ...",https://img.sndimg.com/food/image/upload/w_555...
