# Data Cleaning


In [1]:
import pandas as pd
from matplotlib import pyplot as plt 
import numpy as np

First, we load required data from reviews, and remove rows based on several criteria. 

In [3]:
# choosing only the columns that are needed. The column 'Review' contains the text that makes the dataframe heavy. So it is dropped.
reviews = pd.read_csv('../data/reviews.csv', usecols=['AuthorId', 'ReviewId', 'RecipeId', 'Rating'])
# Data analysis shows 'Rating'=0 is not neccesarily bad rating 
# (Refer to https://www.kaggle.com/code/gemmin/sentiment-analysis)
# Removing 0 ratings
reviews = reviews[reviews['Rating']!=0]
# Only reviews by active reviewers (selected_authors) are accepted, as the model will be based on active reviewers.
# Active reviewer is whoever has more than a minimum number of reviews in the data
review_counts = reviews.groupby('AuthorId').count()
selected_authors = review_counts.loc[review_counts['ReviewId']>=10].index.values
reviews = reviews.loc[reviews['AuthorId'].isin(selected_authors)]

print('number of selected reviews:', reviews['ReviewId'].nunique())
print('number of selected reviewers:', reviews['AuthorId'].nunique())
print('number of selected recipes:', reviews['RecipeId'].nunique())
 

number of selected reviews: 978879
number of selected reviewers: 14544
number of selected recipes: 238699


Now the ratings are normalized based on the reviewer ratings statistics


In [20]:
# substracting mean of reviewer's ratings
reviews['Rating'] -= reviews.groupby('AuthorId')['Rating'].transform(np.mean)
# deviding by std of reviewer's ratings
reviews['Rating'] /= reviews.groupby('AuthorId')['Rating'].transform(np.std)
reviews.describe()

Unnamed: 0,ReviewId,RecipeId,AuthorId,Rating
count,978879.0,978879.0,978879.0,943329.0
mean,708076.8,153753.910811,12647800.0,1.1087530000000001e-17
std,447018.5,128094.099809,154107400.0,0.9931838
min,7.0,38.0,1533.0,-34.65545
25%,326420.5,49047.0,102937.0,0.1066013
50%,667458.0,112362.0,227586.0,0.3495521
75%,1072974.0,233848.0,465056.0,0.5175935
max,2090332.0,541030.0,2002872000.0,3.015113
