# Download Data from Kaggle and perform split:
The data comes in 3 files: train, validation and test.
Unfortunately, the provided split seems to separate recipes into sets that don't overlap.
The data is therefore merged and re-split taking temporality into account: the latest 20% of ratings from each user go to the test set.

At the end, new train/test files are generated in /data/generated

In [1]:
# Print the current working directory; the relative paths used below resolve against it.
!pwd

/Users/seb/MachineLearning/MLProjects/Recipes/src


In [1]:
import gc, sys
import os

import kaggle as kg
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook as tqdm

# Root folder for the downloaded and generated data, relative to the working directory.
# The trailing slash matters: the paths below are built by plain f-string concatenation.
DATA_FOLDER='../data/'

In [3]:
# Authenticate against Kaggle, then download and unzip the dataset archive into DATA_FOLDER.
# NOTE(review): the constructor argument looks like a credentials path - confirm that the
# installed kaggle version really expects that as its first positional parameter.
api = kg.KaggleApi('~/.kaggle/kaggle.json')
api.authenticate()

dataset_slug = 'shuyangli94/food-com-recipes-and-user-interactions'
api.dataset_download_files(dataset_slug, DATA_FOLDER, quiet=False, unzip=True)

  1%|          | 2.00M/267M [00:00<00:14, 19.6MB/s]

Downloading food-com-recipes-and-user-interactions.zip to ../data


100%|██████████| 267M/267M [00:12<00:00, 22.1MB/s] 





In [2]:
# Merge the three interaction files shipped with the dataset into a single frame.
df_full_data = pd.concat([pd.read_csv(f'{DATA_FOLDER}interactions_{typ}.csv')
                          for typ in ['train', 'validation', 'test']])

# Convert the date column to datetime and sort the dataframe chronologically.
# sort_values returns a new frame, so the result has to be assigned back; a stable
# sort keeps rows that share a date in their original relative order.
df_full_data['date'] = pd.to_datetime(df_full_data['date'], format='%Y-%m-%d')
df_full_data = df_full_data.sort_values(['date'], kind='mergesort')

# Chronological hold-out: with shuffle=False the last 20% of the rows (the most recent
# ratings once the frame is sorted) form the test part.
train, test = train_test_split(df_full_data, random_state = 122, shuffle = False, test_size = 0.2)

Unnamed: 0,user_id,recipe_id,date,rating,u,i
0,2046,4684,2000-02-25,5.0,22095,44367
1,2046,517,2000-02-25,5.0,22095,87844
2,1773,7435,2000-03-13,5.0,24732,138181
3,1773,278,2000-03-13,4.0,24732,93054
4,2046,3431,2000-04-07,5.0,22095,101723
5,2046,13307,2000-05-21,5.0,22095,134551
6,2312,780,2000-09-12,5.0,1674,127175
7,2312,51964,2000-09-26,5.0,1674,151793
1627,2369,7762,2000-10-05,3.0,20970,37104
8,2312,1232,2000-10-17,4.0,1674,15498


In [3]:
# Count the distinct recipe ('i') and user ('u') identifiers in the merged interactions.
n_recipes = df_full_data['i'].nunique(dropna=False)
n_users = df_full_data['u'].nunique(dropna=False)

print('number of recipes : ', n_recipes)
print('number of users : ', n_users)

number of recipes :  178265
number of users :  25076


## Filter sparse recipes and users

In [3]:
#######  Drop sparse recipes and users (those with too few associated ratings)
min_recipe_ratings = 10
min_user_ratings = 5

# Recipes to KEEP: those rated strictly more than min_recipe_ratings times
ratings_per_recipe = df_full_data['i'].value_counts()
filter_recipes = ratings_per_recipe[ratings_per_recipe > min_recipe_ratings].index.tolist()

# First pass: only interactions involving a retained recipe survive
df_filter = df_full_data[df_full_data['i'].isin(filter_recipes)]

# Users to KEEP, counted on the recipe-filtered data: strictly more than min_user_ratings ratings
ratings_per_user = df_filter['u'].value_counts()
filter_users = ratings_per_user[ratings_per_user > min_user_ratings].index.tolist()

# Second pass: only interactions from a retained user survive.
# The two passes are sequential on purpose: a user with many ratings in the raw data may
# have rated only sparse recipes, so users must be counted after the recipe filter.
df_filter = df_filter[df_filter['u'].isin(filter_users)]

# Removing users lowers some recipe counts again, so only a looser bound is checked here:
# no remaining recipe should have fewer than 5 ratings.
recipes_below_5 = (df_filter['i'].value_counts() < 5).sum()
print('Ensure that the value is 0 :', recipes_below_5)

print('Nomber of records unfiltered :', len(df_full_data))
print('Nomber of records filtered :', len(df_filter))

Ensure that the value is 0 : 0
Nomber of records unfiltered : 718379
Nomber of records filtered : 281263


In [4]:
# Number of (non-null) ratings attached to each recipe and to each user after filtering
recipe_rating_counts = df_filter.groupby(['i'])['rating'].count()
user_rating_counts = df_filter.groupby(['u'])['rating'].count()

# Report the smallest count on each side (first element once sorted ascending)
print('the minimum rating per recipe is :', recipe_rating_counts.sort_values().iloc[0])
print('the minimum rating per user is :', user_rating_counts.sort_values().iloc[0])

the minimum rating per recipe is : 5
the minimum rating per user is : 6


In [6]:
# Save full data
# to_csv does not create missing parent folders, so make sure the output folder exists
# first (needed on a fresh checkout where nothing has been generated yet).
os.makedirs(f'{DATA_FOLDER}generated', exist_ok=True)
df_filter.to_csv(f'{DATA_FOLDER}generated/full_data_filtered.csv', index=False)

## Train /test set 

In [78]:
##### Approach: place the last ratings given by each user (approx. 20%) in the test set.

train_df_list = []
test_df_list = []

# Group once up front instead of re-scanning the whole frame with a boolean mask per user.
ratings_by_user = df_filter.groupby('u')

# Loop over users - add the last ratings given by each user to the test set
mylist = list(set(df_filter.u))
for item in tqdm(mylist, 'iterating over users'):
    subdata = ratings_by_user.get_group(item).sort_values(by='date')  # one user's ratings, oldest first
    limit = round(0.2 * len(subdata))  # number of most recent ratings held out for this user
    split_at = len(subdata) - limit
    head = subdata.iloc[:split_at]
    tail = subdata.iloc[split_at:]
    # Every row goes into both sets; the side that must not see the rating gets NaN instead,
    # which keeps train and test the same dimension.
    train_df_list.append(head)
    train_df_list.append(tail.assign(rating=np.nan))
    test_df_list.append(tail)
    test_df_list.append(head.assign(rating=np.nan))

print("Starting build of train/test datasets")
train_set = pd.concat(train_df_list)
test_set = pd.concat(test_df_list)
print('Build completed')

# Each filtered interaction must appear exactly once in each set.
assert len(train_set) == len(test_set) == len(df_filter)

print('train set shape is :',train_set.shape,
      'deleted ratings number is :',train_set['rating'].isnull().sum(),
     'ratio is :', train_set['rating'].isnull().sum()/len(train_set))
print('test set shape is :',test_set.shape, 'deleted ratings number is :',
      test_set['rating'].isnull().sum(),
     'ratio is :', test_set['rating'].isnull().sum()/len(test_set))
    

HBox(children=(IntProgress(value=0, description='iterating over users', max=10007, style=ProgressStyle(descrip…


Starting build of train/test datasets
Build completed
train set shape is : (281263, 6) deleted ratings number is : 55925 ratio is : 0.19883525383715597
test set shape is : (281263, 6) deleted ratings number is : 225338 ratio is : 0.801164746162844


In [62]:
##### Build the user x recipe rating matrix of each set for this approach
# dropna=False: columns whose entries are all NaN are kept in the result.
pivot_args = dict(index='u', columns='i', values='rating', dropna=False)

train_matrix = pd.pivot_table(train_set, **pivot_args)
test_matrix = pd.pivot_table(test_set, **pivot_args)

print(f'Shape of train User-Movie-Matrix:\t{train_matrix.shape}')
print(f'Shape of test User-Movie-Matrix:\t{test_matrix.shape}')

Shape of train User-Movie-Matrix:	(10007, 11120)
Shape of test User-Movie-Matrix:	(10007, 11120)


## Save the files in 'DATA_FOLDER/generated'

In [31]:
# Export rating, u and i for each set; the frame index is written as well (first CSV column)
export_columns = ['rating', 'u', 'i']
train_set.loc[:, export_columns].to_csv(f'{DATA_FOLDER}generated/inter_train.csv')
test_set.loc[:, export_columns].to_csv(f'{DATA_FOLDER}generated/inter_test.csv')

## Ensure that rebuilding the matrix provides the same result (size)

In [75]:
# Plain read of the exported test interactions (long format, one row per interaction)
test = pd.read_csv(f'{DATA_FOLDER}generated/inter_test.csv')

# The matrix itself does not need to be stored: pivot the long-format rows back on the fly
test_matrix_1 = pd.pivot_table(test, index='u', columns='i', values='rating', dropna=False)


## Merging PP_recipes and RAW_recipes

Write them in /data/generated/

In [36]:
# Load both recipe tables from DATA_FOLDER, the same root every other cell reads from.
pp_recipes = pd.read_csv(f'{DATA_FOLDER}PP_recipes.csv')
raw_recipes = pd.read_csv(f'{DATA_FOLDER}RAW_recipes.csv')

# Left join: keep every PP_recipes row and attach the RAW_recipes columns sharing the same 'id'.
new_recipes = pp_recipes.join(raw_recipes.set_index(['id']), on='id', how = 'left')

display(new_recipes.head(3))

new_recipes.to_csv(f'{DATA_FOLDER}generated/recipes.csv', index=False)

Unnamed: 0,id,i,name_tokens,ingredient_tokens,steps_tokens,techniques,calorie_level,ingredient_ids,name,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,ingredients,n_ingredients
0,424415,23,"[40480, 37229, 2911, 1019, 249, 6878, 6878, 28...","[[2911, 1019, 249, 6878], [1353], [6953], [153...","[40480, 40482, 21662, 481, 6878, 500, 246, 161...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[389, 7655, 6270, 1527, 3406]",aromatic basmati rice rice cooker,61,496803,2010-05-10,"['weeknight', 'time-to-make', 'course', 'main-...","[228.2, 2.0, 2.0, 8.0, 9.0, 1.0, 15.0]",6,"['rinse the rice in a fine strainer , then dra...",from the ultimate rice cooker cookbook. the a...,"['basmati rice', 'water', 'salt', 'cinnamon st...",5
1,146223,96900,"[40480, 18376, 7056, 246, 1531, 2032, 40481]","[[17918], [25916], [2507, 6444], [8467, 1179],...","[40480, 40482, 729, 2525, 10906, 485, 43, 8393...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0,"[2683, 4969, 800, 5298, 840, 2499, 6632, 7022,...",pumpkin pie a la easy,55,229619,2005-11-25,"['60-minutes-or-less', 'time-to-make', 'course...","[249.4, 16.0, 92.0, 8.0, 11.0, 27.0, 11.0]",10,"['preheat oven to 350', 'combine flour , oats ...",this is a pampered chef recipe for their stone...,"['flour', 'oats', 'brown sugar', 'pecans', 'bu...",12
2,312329,120056,"[40480, 21044, 16954, 8294, 556, 10837, 40481]","[[5867, 24176], [1353], [6953], [1301, 11332],...","[40480, 40482, 8240, 481, 24176, 296, 1353, 66...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1,"[1257, 7655, 6270, 590, 5024, 1119, 4883, 6696...",cheesy tomato soup with potatoes,25,621626,2008-07-07,"['30-minutes-or-less', 'time-to-make', 'course...","[351.3, 34.0, 15.0, 50.0, 25.0, 70.0, 8.0]",6,"['pour the broth & water into a large pot', 'a...",after modifying another recipe i came up with ...,"['chicken broth', 'water', 'salt', 'black pepp...",15


## Create a dataframe with the name of each recipe and 'i' as index

In [39]:
# Build a one-column dataframe: index = recipe index 'i' (sorted), value = recipe name.
id_target = sorted(set(train_set['i'].tolist()))

# One indexed lookup instead of filtering new_recipes once per recipe.
# drop_duplicates guards against an 'i' listed more than once (first match wins); an 'i'
# that is absent from new_recipes gets a NaN name instead of breaking the construction.
name_by_i = new_recipes.drop_duplicates(subset='i').set_index('i')['name']
recipe_names = name_by_i.reindex(id_target).to_frame('Name')
recipe_names.index.name = None  # unnamed index, so the CSV header stays ',Name'

recipe_names.to_csv(f'{DATA_FOLDER}generated/names.csv')

In [None]:
# Disabled draft: the whole cell is one bare string literal, so none of the code inside runs.
# It is a variant of the previous cell that stores both the recipe name and its 'id' per 'i'.
# NOTE(review): it assigns a single column name ('Name') to what looks like two values per
# recipe - confirm before re-enabling.
"""
# Construct a dataframe with 'i' as index and the recipe name as value
id_target=[]
id_target = list(set(train_set['i'].tolist()))
id_target.sort()
name_recipes={}
for item in id_target: 
    b = new_recipes[new_recipes['i']== item].id
    a = new_recipes[new_recipes['i']== item].name
    name_recipes[item] = [a, b]

recipe_names = pd.DataFrame(name_recipes).T
recipe_names.columns = ['Name']
recipe_names.to_csv(f'{DATA_FOLDER}generated/names.csv')
"""

In [13]:
# List every name visible in the notebook namespace with the size reported by
# sys.getsizeof, largest first.
sorted(
    ((name, sys.getsizeof(globals().get(name))) for name in dir()),
    key=lambda entry: -entry[1],
)

[('test_matrix', 890302808),
 ('train_matrix', 890302808),
 ('new_recipes', 538071181),
 ('raw_recipes', 388410531),
 ('pp_recipes', 254454202),
 ('df_full_data', 40229256),
 ('train', 32183400),
 ('df_filter', 15750760),
 ('test_set', 15750760),
 ('train_set', 15750760),
 ('test', 8045888),
 ('test_df_list', 178032),
 ('train_df_list', 178032),
 ('mylist', 90176),
 ('filter_recipes', 89032),
 ('filter_users', 80128),
 ('_i8', 1665),
 ('_i6', 1206),
 ('_i4', 490),
 ('_i9', 411),
 ('_i7', 392),
 ('subdata', 368),
 ('_i', 339),
 ('_i12', 339),
 ('head', 312),
 ('_i11', 281),
 ('_ii', 281),
 ('_i3', 257),
 ('_i10', 249),
 ('_iii', 249),
 ('Out', 248),
 ('_oh', 248),
 ('_i2', 238),
 ('In', 200),
 ('_ih', 200),
 ('_i5', 172),
 ('_i13', 167),
 ('tqdm', 144),
 ('train_test_split', 144),
 ('__doc__', 113),
 ('__builtin__', 88),
 ('__builtins__', 88),
 ('gc', 88),
 ('kg', 88),
 ('np', 88),
 ('pd', 88),
 ('sys', 88),
 ('tail', 88),
 ('_dh', 80),
 ('get_ipython', 72),
 ('api', 64),
 ('exit', 64),