# Exercises for "Hands-on with Pydata: How to Build a Minimal Recommendation Engine"

# Systems check: imports and files

In [7]:
import numpy as np
import pandas as pd

# Pandas questions: Series and DataFrames
## 1. Adding a column in a DataFrame

In [8]:
# given the following DataFrame, add a new column to it
df = pd.DataFrame({'col1': [1,2,3,4]})
df

Unnamed: 0,col1
0,1
1,2
2,3
3,4


## 2. Deleting a row in a DataFrame

In [9]:
# given the following DataFrame, delete row 'd' from it
df = pd.DataFrame({'col1': [1,2,3,4]}, index = ['a','b','c','d'])
df

Unnamed: 0,col1
a,1
b,2
c,3
d,4


## 3. Creating a DataFrame from a few Series

In [10]:
# given the following three Series, create a DataFrame such that it holds them as its columns
ser_1 = pd.Series(np.random.randn(6))
ser_2 = pd.Series(np.random.randn(6))
ser_3 = pd.Series(np.random.randn(6))


# Pandas questions: Indexing

## 1. Indexing into a specific column

In [11]:
# given the following DataFrame, try to index into the 'col_2' column
df = pd.DataFrame(data={'col_1': [0.12, 7, 45, 10], 'col_2': [0.9, 9, 34, 11]},
                  columns=['col_1', 'col_2', 'col_3'],
                  index=['obs1', 'obs2', 'obs3', 'obs4'])
df

Unnamed: 0,col_1,col_2,col_3
obs1,0.12,0.9,
obs2,7.0,9.0,
obs3,45.0,34.0,
obs4,10.0,11.0,


## 2. Label-based indexing

In [12]:
# using the same DataFrame, index into the row whose index is 'obs3'
df.loc['obs3']

col_1     45
col_2     34
col_3    NaN
Name: obs3, dtype: object

## 2. Position-based indexing

In [13]:
# using the same DataFrame, index into into its first row
df.iloc[0]

col_1    0.12
col_2     0.9
col_3     NaN
Name: obs1, dtype: object

# Mini-Challenge prep: data loading

## 1. How to load the `users` and `movies` portions of MovieLens

In [14]:
import pandas as pd

users = pd.read_table('data/ml-1m/users.dat',
                      sep='::', header=None,
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'])

movies = pd.read_table('data/ml-1m/movies.dat',
                       sep='::', header=None,
                       names=['movie_id', 'title', 'genres'])

  """
  """
  if __name__ == '__main__':
  if __name__ == '__main__':


## 2. How to load the training and testing subsets

In [15]:
# subset version (hosted notebook)
## Gives error message as provided, so changed encoding to latin-1
movielens_train = pd.read_csv('data/movielens_train.csv', index_col=0, encoding = 'latin-1')
movielens_test = pd.read_csv('data/movielens_test.csv', index_col=0, encoding = 'latin-1')


In [16]:
movielens_train.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
593263,3562,3798,4,967332344,F,25,6,32812,What Lies Beneath (2000),Thriller,False
235597,1051,3793,4,974958593,F,25,0,60513,X-Men (2000),Action|Sci-Fi,False
219003,3727,2366,3,966309522,M,35,7,74401,King Kong (1933),Action|Adventure|Horror,False
685090,4666,1094,3,963843918,M,35,1,53704,"Crying Game, The (1992)",Drama|Romance|War,False
312377,3261,1095,4,968251750,M,45,20,87505,Glengarry Glen Ross (1992),Drama,False


In [17]:
movielens_test.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,for_testing
693323,4653,2648,4,975532459,M,35,12,95051,Frankenstein (1931),Horror,False
24177,2259,1270,4,974591524,F,56,16,70503,Back to the Future (1985),Comedy|Sci-Fi,False
202202,3032,1378,5,970343147,M,25,0,47303,Young Guns (1988),Action|Comedy|Western,False
262003,3029,2289,4,972846393,M,18,4,92037,"Player, The (1992)",Comedy|Drama,False
777848,4186,2403,3,1017931262,M,25,7,33308,First Blood (1982),Action,False


# Mini-Challenge prep: evaluation functions

These are the two functions that you will need to test your `estimate` method.

In [18]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [19]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

Test a dummy solution!

In [20]:
def my_estimate_func(user_id, movie_id):
    return 3.0

You can test for performance with the following line, which assumes that your function is called `my_estimate_func`:

In [21]:
## Error as provided because print needs parentheses these days
print('RMSE for my estimate function: %s' % evaluate(my_estimate_func))

RMSE for my estimate function: 1.2323719526527521


# Reco systems questions: Minimal reco engine v1.0

## 1. Simple collaborative filtering using mean ratings

In [22]:
def content_mean(user_id, movie_id):
    """ Simple content-filtering based on mean ratings. """
    
    user_condition = movielens_train.user_id == user_id
    return movielens_train.loc[user_condition, 'rating'].mean()

print('RMSE for content mean: %s' % evaluate(content_mean))

RMSE for content mean: 1.2307824759704098


In [23]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean(user_id, movie_id):
    # first, index into all ratings of this movie
    movie_condition = movielens_train.movie_id == movie_id
    # second, compute the mean of those ratings
    mean = movielens_train.loc[movie_condition, 'rating'].mean()
    if np.isnan(mean):
        # if no one has rated this movie
        return 3
    else:
        return mean

    
# try it out for a user_id, movie_id pair
collab_mean(4653, 2648)

4.0

In [24]:
print('RMSE for my collab_mean function: %s' % evaluate(collab_mean))

RMSE for my collab_mean function: 1.1234279896011794


In [25]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean2(user_id, movie_id):
    # first, index into all ratings of this movie
    user_condition  = movielens_train.user_id  != user_id
    movie_condition = movielens_train.movie_id == movie_id
    # second, compute the mean of those ratings
    matches = movielens_train.loc[movie_condition & user_condition]
    if matches.empty:
        # if no one has rated this movie
        return 3
    else:
        return matches.rating.mean()

    
# try it out for a user_id, movie_id pair
print(collab_mean2(4653, 2648))
print('RMSE for another collab_mean function: %s' % evaluate(collab_mean2))

4.0
RMSE for another collab_mean function: 1.1234279896011794


# Mini-Challenge: first round
Implement an `estimate` function of your own using other similarity notions, eg.:


- collaborative filter based on age similarities
- collaborative filter based on zip code similarities
- collaborative filter based on occupation similarities
- content filter based on movie genre

In [26]:
def genre_filter(user_id, movie_id):
    movie_condition = (movielens_train.movie_id == movie_id)
    if sum(movie_condition) == 0: 
        #If the movie isn't in the training set, find it's genre from the test set
        movie_condition = (movielens_test.movie_id == movie_id)
        genre = movielens_test.genres.loc[ movie_condition].iloc[0]
    else:
        genre = movielens_train.genres.loc[movie_condition].iloc[0]
    genre_condition = (movielens_train.genres == genre)
    
    user_condition = (movielens_train.user_id == user_id)
    genre_mean = movielens_train.rating.loc[genre_condition & user_condition].mean()
    if np.isnan(genre_mean):
        #if user hasn't rated any films of this genre, maybe they don't like them much
        return 2 
    else:
        return genre_mean

print(genre_filter(4653, 2648))
print('RMSE for my genre filter function: %s' % evaluate(genre_filter))

2
RMSE for my genre filter function: 1.8418294739907481


In [27]:
user_info = users.set_index('user_id')
movie_info = movies.set_index('movie_id')
class ContentGenreReco:
    """ Content filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_genre = movielens_train.pivot_table('rating', index='user_id', columns='genres')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if user_id not in self.means_by_genre.index: 
            return 3.0
        
        movie_genre = movie_info.loc[movie_id, 'genres']
        if movie_genre not in self.means_by_genre.columns:
            print('%s not in matrix' %movie_genre)
            return 3.0
        
        if ~np.isnan(self.means_by_genre.loc[user_id, movie_genre]):
            return self.means_by_genre.loc[user_id, movie_genre]
        else:
            return self.means_by_genre.loc[user_id].mean()

In [28]:
reco = ContentGenreReco()
reco.learn()
reco.estimate(4906, 3461)
print('RMSE for ContentGenreReco: %s' % evaluate(reco.estimate))

Adventure|Drama|Thriller not in matrix
Adventure|Drama|Thriller not in matrix
Comedy|Mystery|Romance|Thriller not in matrix
Action|Adventure|Children's|Fantasy not in matrix
Horror|Mystery not in matrix
Comedy|Mystery|Romance|Thriller not in matrix
Adventure|Animation|Children's|Fantasy not in matrix
Adventure|Animation|Children's|Fantasy not in matrix
Comedy|Mystery|Romance|Thriller not in matrix
Action|Adventure|Animation|Horror|Sci-Fi not in matrix
Comedy|Mystery|Romance not in matrix
Animation|Children's|Fantasy|War not in matrix
Action|Drama|Thriller|War not in matrix
Comedy|Horror|Sci-Fi not in matrix
RMSE for ContentGenreReco: 1.2543760415676073


In [71]:
def genre_list(genre):
    return genre.split('|')

def genre_sim(genre1, genre2):
    #similarity between genres for weighting
    glist1 = genre_list(genre1)
    glist2 = genre_list(genre2)
    overlap1 = np.mean([(g in glist2) for g in glist1])
    overlap2 = np.mean([(g in glist1) for g in glist2])
    pwr = 1 #optional weighting to adjust penalization for genre term mismatches
    return (overlap1 * overlap2) ** pwr

def weighted_mean(values, weights):
    return np.sum(values * weights) / np.sum(weights)

def sim_matrix(series, sim_fn, keys = None):
    # creates similarity matrix as a dataframe in the form:
    #           x1              x2              x3
    #     x1    sim_fn(x1, x1)  sim_fn(x1, x2)  sim_fn(x1, x3)
    #     x2    sim_fn(x2, x1)  sim_fn(x2, x2)  sim_fn(x2, x3)
    #     x3    sim_fn(x3, x1)  sim_fn(x3, x2)  sim_fn(x3, x3)
    if keys is None:
        return pd.DataFrame({s1:[sim_fn(s2, s1) for s2 in series] 
                             for s1 in series},index = series)
    else:
        return pd.DataFrame({k1:[sim_fn(s2, s1) for s2 in series] 
                             for (k1,s1) in zip(keys,series)},index = series)

class ContentGenreReco2:
    """ Content filtering using an implicit sim(m,m'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        genre_list = movie_info.genres.unique()
        self.genre_similarity = sim_matrix(genre_list, genre_sim)
        self.means_by_genre = movielens_train.pivot_table('rating', index='user_id', columns='genres')
        
    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        if user_id not in self.means_by_genre.index: 
            return 3.0
        
        movie_genre = movie_info.loc[movie_id, 'genres']
        #print(movie_genre)
        user_row = self.means_by_genre.loc[user_id]
        genre_cols = self.means_by_genre.columns
        genre_row = self.genre_similarity.loc[movie_genre, genre_cols]
        return weighted_mean(user_row, genre_row)
        

In [67]:
reco = ContentGenreReco2()
reco.learn()
reco.estimate(4906, 3461)
print('RMSE for ContentGenreReco: %s' % evaluate(reco.estimate))
#reco.genre_similarity.head()

RMSE for ContentGenreReco: 3.6734961844877265


In [None]:
class CollabAgeReco:
    """ Collaborative filtering using an implicit sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.means_by_age = movielens_train.pivot_table('rating', index='movie_id', columns='age')

    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same age. """
        
        if movie_id not in self.means_by_age.index: 
            return 3.0
        
        user_age = user_info.ix[user_id, 'age']
        if ~np.isnan(self.means_by_age.loc[movie_id, user_age]):
            return self.means_by_age.loc[movie_id, user_age]
        else:
            return self.means_by_age.loc[movie_id].mean()

reco = CollabAgeReco()
reco.learn()
print 'RMSE for CollabGenderReco: %s' % evaluate(reco.estimate)

# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, eg.:

- euclidean
- cosine

In [62]:
# Pearson sim from lecture for comparison
# First need to load the stuff that this notebook didn't
ratings = pd.read_table('data/ml-1m/ratings.dat',
                        sep='::', header=None, 
                        names=['user_id', 'movie_id', 'rating', 'timestamp'])
movielens = pd.merge(pd.merge(ratings, users), movies)

def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

class CollabPearsonReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
reco = CollabPearsonReco()
reco.learn()
print('RMSE for CollabPearsonReco: %s' % evaluate(reco.estimate))

  """
  """


RMSE for CollabPearsonReco: 1.1227767489967162


In [None]:
all_user_profiles = movielens.pivot_table('rating', index='movie_id', columns='user_id')
user_profile_list = [
    all_user_profiles[uid] for uid in all_user_profiles.columns]
user_similarity = sim_matrix(user_profile_list, pearson, keys = all_user_profiles.columns)


In [None]:
user_similarity.head()

In [64]:
class CollabPearsonReco2:
    """ Content filtering using an implicit sim(m,m'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        self.all_user_profiles = movielens.pivot_table('rating', index='movie_id', columns='user_id')
        user_profile_list = [
            self.all_user_profiles[uid] for uid in self.all_user_profiles.columns]
        self.user_similarity = sim_matrix(user_profile_list, pearson)
        
    def estimate(self, user_id, movie_id):
        """ Mean ratings by other users of the same gender. """
        
        #Copied from notes
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[
            user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 3.0
        
        sim_this_user = self.user_similarity[user_id]
        similar_condition = sim_this_user > 0 # Pearson corr > 0
        if similar_condition.empty:
            return ratings_by_others.rating.mean()
        else:
            #return weighted_mean( , sim_this_user[similar_condition])
            print(sim_this_user)
              
reco = CollabPearsonReco2()
reco.learn()        
reco.estimate(4906, 3461)

TypeError: 'Series' objects are mutable, thus they cannot be hashed