# Exercises for "Hands-on with Pydata: How to Build a Minimal Recommendation Engine"

# Systems check: imports and files

In [126]:
import numpy as np
import pandas as pd

# Pandas questions: Series and DataFrames
## 1. Adding a column in a DataFrame

In [127]:
label = ['a','b','c']
values = np.array([1,2,3])
ser = pd.Series(data = values, index = label)
ser.loc[['a','b']]


a    1
b    2
dtype: int32

In [128]:
# given the following DataFrame, add a new column to it
df = pd.DataFrame({'col1': [1,2,3,4]})
df['new_col'] = [5,6,7,8] # added a new col
df

Unnamed: 0,col1,new_col
0,1,5
1,2,6
2,3,7
3,4,8


## 2. Deleting a row in a DataFrame

In [129]:
# given the following DataFrame, delete row 'd' from it
df = pd.DataFrame({'col1': [1,2,3,4]}, index = ['a','b','c','d'])
df.drop('c')
# df.drop('c',inplace = True) to save the changes
df
# df.drop? # to generate a docstring of explanation


Unnamed: 0,col1
a,1
b,2
c,3
d,4


## 3. Creating a DataFrame from a few Series

In [130]:
# given the following three Series, create a DataFrame such that it holds them as its columns
ser_1 = pd.Series(np.random.randn(6))
ser_2 = pd.Series(np.random.randn(6))
ser_3 = pd.Series(np.random.randn(6))
#Answer
pd.DataFrame({'col1': ser_1,'col_2':ser_2,'col_3':ser_3})

Unnamed: 0,col1,col_2,col_3
0,-0.608093,1.360931,1.41843
1,-0.29521,-0.810073,0.445599
2,0.038262,0.074015,0.040338
3,1.548776,0.43301,-0.83749
4,0.400279,-1.152958,-1.019243
5,0.972922,1.018641,0.160875


# Pandas questions: Indexing

## 1. Indexing into a specific column

In [131]:
# given the following DataFrame, try to index into the 'col_2' column
df = pd.DataFrame(data={'col_1': [0.12, 7, 45, 10], 'col_2': [0.9, 9, 34, 11]},
                  columns=['col_1', 'col_2', 'col_3'],
                  index=['obs1', 'obs2', 'obs3', 'obs4'])
df.col_2

obs1     0.9
obs2     9.0
obs3    34.0
obs4    11.0
Name: col_2, dtype: float64

## 2. Label-based indexing

In [132]:
# using the same DataFrame, index into the row whose index is 'obs3'
df.ix['obs3']
df.loc['obs3']

col_1     45
col_2     34
col_3    NaN
Name: obs3, dtype: object

## 2. Position-based indexing

In [133]:
# using the same DataFrame, index into into its first row
df.iloc[0]


col_1    0.12
col_2     0.9
col_3     NaN
Name: obs1, dtype: object

# Mini-Challenge prep: data loading

## 1. How to load the `users` and `movies` portions of MovieLens

In [167]:
import pandas as pd

users = pd.read_table('C:/ml-1m/users.dat',
                      sep='::', header=None,
                      names=['user_id', 'gender', 'age', 'occupation', 'zip'],engine = 'python')

movies = pd.read_table('C:/ml-1m/movies.dat',
                       sep='::', header=None,
                       names=['movie_id', 'title', 'genres'],engine = 'python')

## 2. How to load the training and testing subsets

In [154]:
# subset version (hosted notebook)
import sys  
# sys.getdefaultencoding()
# sys.setdefaultencoding('utf-8')
# sys.setdefaultencoding

# df = pd.read_csv('C:/human_body_temperature.csv')
movielens_train = pd.read_csv('C:/my_generated_movielens_train.csv', index_col = 0, encoding = 'Latin1')
movielens_test = pd.read_csv('C:/my_generated_movielens_test.csv', index_col=0, encoding = 'Latin1')

In [157]:
movielens_train.tail(2)
len(movielens_train)

5796

In [158]:
movielens_test.head()
len(movielens_test)

2675

# Mini-Challenge prep: evaluation functions

These are the two functions that you will need to test your `estimate` method.

In [168]:
def compute_rmse(y_pred, y_true):
    """ Compute Root Mean Squared Error. """
    
    return np.sqrt(np.mean(np.power(y_pred - y_true, 2)))

In [207]:
def evaluate(estimate_f):
    """ RMSE-based predictive performance evaluation with pandas. """
    
    ids_to_estimate = zip(movielens_test.user_id, movielens_test.movie_id)
    estimated = np.array([estimate_f(u,i) for (u,i) in ids_to_estimate])
    real = movielens_test.rating.values
    return compute_rmse(estimated, real)

Test a dummy solution!

In [170]:
def my_estimate_func(user_id, movie_id):
    return 3.0
evaluate(my_estimate_func)

1.255567042467602

You can test for performance with the following line, which assumes that your function is called `my_estimate_func`:

In [162]:
print ('RMSE for my estimate function: %s' % evaluate(my_estimate_func))

RMSE for my estimate function: 1.25556704247


# Reco systems questions: Minimal reco engine v1.0

## 1. Simple collaborative filtering using mean ratings

In [177]:
# write an 'estimate' function that computes the mean rating of a particular user
def collab_mean(user_id, movie_id):
    # first, index into all ratings of this movie
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty:
        return 4.0
    else:
        return ratings_by_others.rating.mean()
    
print("RMSE for estimate2: " + str(evaluate(collab_mean)))

RMSE for estimate2: 1.15591747248


In [178]:
movielens_train.groupby('gender')['rating'].mean()

gender
F    3.507564
M    3.535762
Name: rating, dtype: float64

In [182]:
movielens_train.groupby(['gender','age'])['rating'].mean()

gender  age
F       1      3.510638
        18     3.259109
        25     3.540117
        35     3.589431
        45     3.517986
        50     3.684783
        56     3.675000
M       1      3.528736
        18     3.485075
        25     3.511170
        35     3.556039
        45     3.611765
        50     3.620130
        56     3.661765
Name: rating, dtype: float64

In [185]:
ratings_mtx_df= movielens_train.pivot_table(values = 'rating',index = 'user_id',columns = 'movie_id')
ratings_mtx_df.head(3)

movie_id,1,2,3,4,5,6,7,8,9,10,...,3916,3921,3926,3928,3936,3937,3944,3948,3949,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


In [187]:
movielens_train.pivot_table(values = 'rating',index = 'age',columns = 'gender',aggfunc = [np.mean,np.std])
# mynotes: We can use custom function above.

Unnamed: 0_level_0,mean,mean,std,std
gender,F,M,F,M
age,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
1,3.510638,3.528736,1.195505,1.199216
18,3.259109,3.485075,1.181526,1.190885
25,3.540117,3.51117,1.152089,1.146939
35,3.589431,3.556039,1.068152,1.086089
45,3.517986,3.611765,1.137965,0.999636
50,3.684783,3.62013,1.078556,1.071629
56,3.675,3.661765,1.095152,1.103668


In [190]:
user_info = users.set_index ('user_id')
user_info.head()

Unnamed: 0_level_0,gender,age,occupation,zip
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455


# Mini-Challenge: first round
Implement an `estimate` function of your own using other similarity notions, eg.:

- collaborative filter based on age similarities
- collaborative filter based on zip code similarities
- collaborative filter based on occupation similarities
- content filter based on movie genre

In [215]:
def collab_gender(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on gender. """
    
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    
    means_by_gender = ratings_by_others.pivot_table('rating', index='movie_id', columns='gender')
    user_gender = user_info.ix[user_id, 'gender']
    if user_gender in means_by_gender.columns: 
        return means_by_gender.ix[movie_id, user_gender]
    else:
        return means_by_gender.ix[movie_id].mean()

print ('RMSE for collab_gender: %s' % evaluate(collab_gender))

RMSE for collab_gender: 1.16850722228


In [218]:
def collab_zip(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on zip. """
    
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    
    means_by_zip = ratings_by_others.pivot_table('rating', index='movie_id', columns='zip')
    user_zip = user_info.ix[user_id, 'zip']
    if user_zip in means_by_zip.columns: 
        return means_by_zip.ix[movie_id, user_zip]
    else:
        return means_by_zip.ix[movie_id].mean()

print ('RMSE for collab_zip: %s' % evaluate(collab_zip))

RMSE for collab_gender: 1.13142555561


In [234]:
def collab_occ(user_id, movie_id):
    """ Collaborative filtering using an implicit sim(u,u') based on occupation. """
    
    user_condition = movielens_train.user_id != user_id
    movie_condition = movielens_train.movie_id == movie_id
    ratings_by_others = movielens_train.loc[user_condition & movie_condition]
    if ratings_by_others.empty: 
        return 3.0
    
    means_by_occ = ratings_by_others.pivot_table('rating', index='movie_id', columns='occupation')
    user_occ = user_info.ix[user_id, 'occupation']
    if user_occ in means_by_occ.columns: 
        return means_by_occ.ix[movie_id, user_occ]
    else:
        return means_by_occ.ix[movie_id].mean()

print ('RMSE for collab_occupation: %s' % evaluate(collab_occ))

RMSE for collab_occupation: 1.2025720571


# Mini-Challenge: second round
Implement an `estimate` function of your own using other custom similarity notions, eg.:

- euclidean
- cosine

In [219]:
# euclidean similarity function
def euclidean(s1, s2):
    """Take two pd.Series objects and return their euclidean 'similarity'."""
    diff = s1 - s2
    return 1 / (1 + np.sqrt(np.sum(diff ** 2)))

In [220]:
# cosine similarity function
def cosine(s1, s2):
    """Take two pd.Series objects and return their cosine similarity."""
    return np.sum(s1 * s2) / np.sqrt(np.sum(s1 ** 2) * np.sum(s2 ** 2))

In [227]:
# Pearson similarity function
def pearson(s1, s2):
    """Take two pd.Series objects and return a pearson correlation."""
    s1_c = s1 - s1.mean()
    s2_c = s2 - s2.mean()
    return np.sum(s1_c * s2_c) / np.sqrt(np.sum(s1_c ** 2) * np.sum(s2_c ** 2))

In [233]:
class CollabPearsonReco:
    """ Collaborative filtering using a custom sim(u,u'). """

    def learn(self):
        """ Prepare datastructures for estimation. """
        
        self.all_user_profiles = movielens_train.pivot_table('rating', index='movie_id', columns='user_id')

    def estimate(self, user_id, movie_id):
        """ Ratings weighted by correlation similarity. """
        
        user_condition = movielens_train.user_id != user_id
        movie_condition = movielens_train.movie_id == movie_id
        ratings_by_others = movielens_train.loc[user_condition & movie_condition]
        if ratings_by_others.empty: 
            return 3.0
        
        ratings_by_others.set_index('user_id', inplace=True)
        their_ids = ratings_by_others.index
        their_ratings = ratings_by_others.rating
        their_profiles = self.all_user_profiles[their_ids]
        user_profile = self.all_user_profiles[user_id]
        sims = their_profiles.apply(lambda profile: pearson(profile, user_profile), axis=0)
        ratings_sims = pd.DataFrame({'sim': sims, 'rating': their_ratings})
        ratings_sims = ratings_sims[ratings_sims.sim > 0]
        if ratings_sims.empty:
            return their_ratings.mean()
        else:
            return np.average(ratings_sims.rating, weights=ratings_sims.sim)
        
reco = CollabPearsonReco()
reco.learn()
print ('RMSE for CollabPearsonReco: %s' % evaluate(reco.estimate))



RMSE for CollabPearsonReco: 1.13627468233
