# Exercises for "Hands-on with Pydata: How to Build a Minimal Recommendation Engine"

# Systems check: imports and files

In [1]:
import numpy as np
import pandas as pd

# Pandas questions: Series and DataFrames
## 1. Adding a column in a DataFrame

In [2]:
# given the following DataFrame, add a new column to it
df = pd.DataFrame({'col1': [1,2,3,4]})
df

Unnamed: 0,col1
0,1
1,2
2,3
3,4


## 2. Deleting a row in a DataFrame

In [3]:
# given the following DataFrame, delete row 'd' from it
df = pd.DataFrame({'col1': [1,2,3,4]}, index = ['a','b','c','d'])
df

Unnamed: 0,col1
a,1
b,2
c,3
d,4


## 3. Creating a DataFrame from a few Series

In [5]:
# given the following three Series, create a DataFrame such that it holds them as its columns
ser_1 = pd.Series(np.random.randn(6))
ser_2 = pd.Series(np.random.randn(6))
ser_3 = pd.Series(np.random.randn(6))
pd.DataFrame({'col1':ser_1, 'col2':ser_2, 'col3':ser_3})

Unnamed: 0,col1,col2,col3
0,-0.574196,0.660319,-0.431792
1,-0.069982,0.30809,0.659886
2,-1.315193,1.385178,0.515006
3,0.533424,-0.537994,-0.891257
4,-0.695879,-0.544731,0.093379
5,-2.55685,-1.34007,-0.011012


# Pandas questions: Indexing

## 1. Indexing into a specific column

In [6]:
# given the following DataFrame, try to index into the 'col_2' column
df = pd.DataFrame(data={'col_1': [0.12, 7, 45, 10], 'col_2': [0.9, 9, 34, 11]},
                  columns=['col_1', 'col_2', 'col_3'],
                  index=['obs1', 'obs2', 'obs3', 'obs4'])
df

Unnamed: 0,col_1,col_2,col_3
obs1,0.12,0.9,
obs2,7.0,9.0,
obs3,45.0,34.0,
obs4,10.0,11.0,


## 2. Label-based indexing

In [9]:
# using the same DataFrame, index into the row whose index is 'obs3'
df.loc['obs3']

col_1     45
col_2     34
col_3    NaN
Name: obs3, dtype: object

## 2. Position-based indexing

In [12]:
# using the same DataFrame, index into into its first row
df.iloc[0,0]

0.12

# Mini-Challenge prep: data loading

## 1. How to load the `users` and `movies` portions of MovieLens

In [13]:
import pandas as pd

users = pd.read_table('data/ml-1m/users.dat',
                      sep='::', header=None,
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'])

movies = pd.read_table('data/ml-1m/movies.dat',
                       sep='::', header=None,
                       names=['movie_id', 'title', 'genres'])



## 2. How to load the training and testing subsets

In [18]:
# subset version (hosted notebook)
movielens_train = pd.read_csv('data/movielens_train.csv', index_col=0, encoding='latin1')
movielens_test = pd.read_csv('data/movielens_test.csv', index_col=0, encoding='latin1')

In [20]:
movielens_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False


In [21]:
movielens_test.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693323,4653,2648,4,975532459,M,35,12,95051,Frankenstein (1931),Horror,False
24177,2259,1270,4,974591524,F,56,16,70503,Back to the Future (1985),Comedy|Sci-Fi,False
202202,3032,1378,5,970343147,M,25,0,47303,Young Guns (1988),Action|Comedy|Western,False
262003,3029,2289,4,972846393,M,18,4,92037,"Player, The (1992)",Comedy|Drama,False
777848,4186,2403,3,1017931262,M,25,7,33308,First Blood (1982),Action,False


# Mini-Challenge prep: evaluation functions

These are the two functions that you will need to test your `estimate` method.

In [22]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [23]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

Test a dummy solution!

In [24]:
def my_estimate_func(user_id, movie_id):
    return 3.0

You can test for performance with the following line, which assumes that your function is called `my_estimate_func`:

In [26]:
print ('RMSE for my estimate function: %s' % evaluate(my_estimate_func))

RMSE for my estimate function: 1.23237195265


# Reco systems questions: Minimal reco engine v1.0

## 1. Simple collaborative filtering using mean ratings

In [34]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean(user_id, movie_id):
    # first, index into all ratings of this movie
    i_cond = movielens_train.movie_id == movie_id
    
    # second, compute the mean of those ratings
    return movielens_train.loc[i_cond, 'rating'].mean()

    
# try it out for a user_id, movie_id pair
collab_mean(4653, 2648)

4.0

# Mini-Challenge: first round
Implement an `estimate` function of your own using other similarity notions, eg.:

- collaborative filter based on age similarities
- collaborative filter based on zip code similarities
- collaborative filter based on occupation similarities
- content filter based on movie genre

In [35]:
movielens_train[movielens_train.movie_id==2648]

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693112,1717,2648,5,974706281,F,50,6,30307,Frankenstein (1931),Horror,False
693286,4260,2648,3,965322649,M,25,16,59079,Frankenstein (1931),Horror,False


### Age

In [48]:
def filt_age(age, movie_id):
    
    item_condition=movielens_train.age==age
    item_condtion=movielens_train.loc[item_condition, 'movie_id']==movie_id
    return movielens_train.loc[item_condition, 'rating'].mean()
    

filt_age(50, 3348)

3.704156479217604

### Occupation

In [44]:
def filt_occupation(occupation, movie_id):
    
    item_condition=movielens_train.occupation==occupation
    item_condtion=movielens_train.loc[item_condition, 'movie_id']==movie_id
    return movielens_train.loc[item_condition, 'rating'].mean()


filt_occupation(10, 2148)

3.373015873015873

### Genre

In [49]:
def filt_genre(genre, movie_id):
    
    item_condition=movielens_train.genres==genre
    item_condtion=movielens_train.loc[item_condition, 'movie_id']==movie_id
    return movielens_train.loc[item_condition, 'rating'].mean()


filt_genre('Drama', 2648)

3.7360912981455066

# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, eg.:

- euclidean
- cosine

In [65]:
s1 = movielens_train.iloc[:,0]
s2 = movielens_train.iloc[:,1]

593263    3798
235597    3793
219003    2366
685090    1094
312377    1095
916102    3317
757805    3707
216517    1036
28065      527
284940    2959
677643     466
241780     296
487663    3704
647317    2533
8747      1287
8203      1197
926550    2448
637336     293
900406    2793
612863    1645
662005    2085
773298    2313
707370    3000
994951    3368
30581     1097
538648    2021
799606     846
821550    1946
309693      24
388758    1673
          ... 
548487    2108
56665     3114
880205    1011
491480    1025
248580       6
278225    3176
152149    1124
737848    3104
716473    2671
162703     920
928420    2092
107273    2858
349073    1997
139488      95
256262    3083
888129    1280
950896    3921
668851    2253
739828      11
393014     223
500935    3087
397583    1777
424247    1252
942640    3901
249648    3006
418478    1307
730445    1249
502516    3591
888991     766
466639    2096
Name: movie_id, Length: 5838, dtype: int64

In [66]:
def euclidean(s1, s2):
    """Take two pd.Series objects and return their euclidean 'similarity'."""
    diff = s1 - s2
    return 1 / (1 + np.sqrt(np.sum(diff ** 2)))

euclidean(s1,s2)

5.5996532506751466e-06