In [2]:
# Import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Import datasets
# Every CSV is read from the same directory; keeping that directory in a single
# constant means the notebook can be pointed at another location by editing one line.
DATA_DIR = '../../data'

# Pre-split user/recipe interaction tables (concatenated into one frame further down).
inter_train = pd.read_csv(DATA_DIR + '/interactions_train.csv')
inter_test = pd.read_csv(DATA_DIR + '/interactions_test.csv')
inter_val = pd.read_csv(DATA_DIR + '/interactions_validation.csv')

# Raw interaction / recipe tables and the pre-processed recipe table.
raw_inter = pd.read_csv(DATA_DIR + '/RAW_interactions.csv')
raw_recipes = pd.read_csv(DATA_DIR + '/RAW_recipes.csv')
pp_recipes = pd.read_csv(DATA_DIR + '/PP_recipes.csv')


In [3]:
# Plotly setup for the plotting cells below: `iplot` is what they call to render a
# figure, and `go` provides the trace / layout / figure classes they build.
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
# Initialise plotly's notebook mode once, ahead of the iplot() calls further down.
# NOTE(review): connected=True is understood to load plotly.js from a CDN instead of
# embedding it in the notebook, so rendering would need internet access — confirm
# against the installed plotly version.
init_notebook_mode(connected=True)

In [4]:
# Concatenate interactions dataframes
# Stack the train / test / validation splits into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each split
# keeps its own row labels (all starting at 0), so labels repeat across splits and
# label-based lookups on `df` would match several rows.
frames = [inter_train, inter_test, inter_val]
df = pd.concat(frames, ignore_index=True)

# Column names of the combined frame.
features = df.columns.tolist()

In [5]:
# Sanity-check the concatenated interactions dataframe
# `frames`, `df` and `features` are already built by the concatenation cell directly
# above; rebuilding them here would only repeat that work, so this cell verifies the
# result instead.
assert len(df) == sum(len(frame) for frame in frames), \
    'concatenated frame does not contain every row of the three splits'
assert features == df.columns.tolist(), \
    '`features` is out of sync with the columns of `df`'

In [6]:
# Count null cells per column, then add the per-column counts into one total.
# The recorded run printed 0, i.e. no missing values in the combined frame.
missing_per_column = df.isnull().sum()
total_missing = missing_per_column.values.sum()
print("\nMissing values :  ", total_missing)


Missing values :   0


In [9]:
######  When were the recipes rated

# Get data
# Number of rating rows recorded for each distinct `date` value, ordered by date label.
# NOTE(review): sort_index orders the raw `date` values; that is chronological only if
# they are ISO-formatted strings or real datetimes — confirm against the CSVs.
data = df['date'].value_counts().sort_index()

# Create trace
trace = go.Scatter(x = data.index,
                   y = data.values,
                   marker = dict(color = '#db0000'))
# Create layout
# The labels name what is plotted: one point per date, y = number of ratings on that
# date. df.shape[0] in the title is the total number of rating rows.
layout = dict(title = '{} Recipe Ratings Grouped By Date'.format(df.shape[0]),
              xaxis = dict(title = 'Date'),
              yaxis = dict(title = 'Ratings'))

# Create plot
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [10]:
### How are the ratings distributed

# Occurrences of each rating value, highest rating first.
data = df['rating'].value_counts().sort_index(ascending=False)

# Share of all rating rows held by each rating value, shown as the bar labels.
n_ratings = df.shape[0]
percentages = data.values / n_ratings * 100
bar_labels = ['{:.1f} %'.format(pct) for pct in percentages]

# One bar per rating value; height is the raw count, label is the percentage.
trace = go.Bar(
    x=data.index,
    y=data.values,
    text=bar_labels,
    textposition='auto',
    textfont={'color': '#000000'},
    marker={'color': '#db0000'},
)

layout = {
    'title': 'Distribution Of {} recipe-ratings'.format(n_ratings),
    'xaxis': {'title': 'Rating'},
    'yaxis': {'title': 'Count'},
}

# Render inline
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [11]:
##### How are the number of ratings distributed for recipes and users


def plot_rating_count_histogram(group_column, cap, bin_size, title, x_title):
    """Plot a histogram of how many ratings each `group_column` id has in `df`.

    Parameters
    ----------
    group_column : str
        Column of `df` to group by ('i' for recipes, 'u' for users).
    cap : int
        Right edge of the histogram. Counts are clipped to `cap - 1`, so every id
        with at least that many ratings is collected in the last bin.
    bin_size : int
        Width of each histogram bin.
    title : str
        Figure title.
    x_title : str
        Label of the x axis.
    """
    # Ratings per id, clipped so the long tail does not stretch the axis.
    counts = df.groupby(group_column)['rating'].count().clip(upper=cap - 1)

    trace = go.Histogram(x = counts.values,
                         name = 'rating',
                         xbins = dict(start = 0,
                                      end = cap,
                                      size = bin_size),
                         marker = dict(color = '#db0000'))
    layout = go.Layout(title = title,
                       xaxis = dict(title = x_title),
                       yaxis = dict(title = 'Count'),
                       bargap = 0.2)

    iplot(go.Figure(data=[trace], layout=layout))


##### Ratings Per Recipe #####
plot_rating_count_histogram('i', cap=10000, bin_size=100,
                            title='Distribution Of Ratings Per Recipe',
                            x_title='ratings per recipe')

##### Ratings Per User #####
plot_rating_count_histogram('u', cap=200, bin_size=2,
                            title='Distribution Of Ratings Per User',
                            x_title='Ratings Per User')


In [20]:
## More details
# Study users and recipes' ratings
# For several thresholds, select the ids that have more ratings than the threshold.
# The notebook renders only the final expression of the cell; the selections made
# inside the loops are evaluated and discarded.
recipe_counts = df['i'].value_counts()
user_counts = df['u'].value_counts()

# Recipes rated more than 5 / 10 / 20 / 50 times.
# Sizes recorded by the author: 27020 / 11120 / 4219 / 1158 recipes.
for recipe_threshold in (5, 10, 20, 50):
    a = recipe_counts > recipe_threshold
    a[a]

# Users who uploaded more than 5 / 10 / 15 / 20 / 50 ratings.
# Sizes recorded by the author: 14533 / 9246 / 6778 / 5312 / 2308 users.
for user_threshold in (5, 10, 15, 20, 50):
    b = user_counts > user_threshold
    b[b]

# Displayed result: the users with more than 50 ratings.
b[b]

94       True
275      True
193      True
241      True
208      True
130      True
564      True
164      True
319      True
267      True
100      True
56       True
260      True
245      True
345      True
552      True
82       True
384      True
141      True
236      True
377      True
470      True
736      True
124      True
108      True
55       True
453      True
132      True
243      True
17       True
         ... 
3190     True
2173     True
1939     True
9430     True
7484     True
2740     True
1376     True
6955     True
1363     True
4587     True
8604     True
7817     True
1880     True
1541     True
10025    True
4390     True
6045     True
7632     True
4057     True
5352     True
3718     True
8630     True
3111     True
2323     True
813      True
3161     True
644      True
1947     True
11181    True
1109     True
Name: u, Length: 2308, dtype: bool

In [16]:
#######  Filter sparse recipes and users (not many associated ratings)

# Both thresholds are exclusive: an id is kept only when it has strictly MORE than
# this many ratings. The counts are taken on the unfiltered `df`, in a single pass,
# so ids in the filtered frame are not guaranteed to still exceed the threshold there.
min_recipe_ratings = 10
recipe_counts = df['i'].value_counts()
filter_recipes = recipe_counts[recipe_counts > min_recipe_ratings].index

# Filter sparse users
min_user_ratings = 10
user_counts = df['u'].value_counts()
filter_users = user_counts[user_counts > min_user_ratings].index

# Actual filtering: keep a row only when both its recipe and its user passed.
df_filter = df[df['i'].isin(filter_recipes) & df['u'].isin(filter_users)]

# Drop the temporaries so they do not linger in the notebook namespace.
del filter_recipes, filter_users, min_recipe_ratings, min_user_ratings
del recipe_counts, user_counts

print('Shape User-Ratings unfiltered:\t{}'.format(df.shape))
# Report the frame built above (`df_filter`).
print('Shape User-Ratings filtered:\t{}'.format(df_filter.shape))


Shape User-Ratings unfiltered:	(718379, 6)
Shape User-Ratings filtered:	(272257, 6)


In [14]:
###### TODO: create test and train sets based on date (not implemented yet; this cell has no code).

In [21]:
##### Create a user-recipe matrix
# Built from `df_filter` (the frame with sparse recipes and users removed):
# one row per user (`u`), one column per recipe (`i`), cell values taken from `rating`.
# NOTE(review): pivot_table aggregates with its default (mean, per the pandas docs)
# when a user/recipe pair occurs more than once — confirm that is intended.
df_matrix = pd.pivot_table(df_filter, index='u', columns='i', values='rating')

matrix_shape = df_matrix.shape
print('Shape User-recipe-Matrix:\t{}'.format(matrix_shape))

Shape User-recipe-Matrix:	(9214, 11120)
