<a href="https://colab.research.google.com/github/CjRax/Pythonworkspace/blob/master/Collaborative_filtering_with_side_information.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Colab-only step: mount Google Drive at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
import os
# NOTE(review): nothing else visible in this notebook uses Java — confirm that
# JAVA_HOME still needs to be set.
java8_location= '/usr/lib/jvm/java-8-openjdk-amd64'
os.environ['JAVA_HOME'] = java8_location
# Switch into the project folder on the mounted Drive; the data files read below
# are addressed with relative 'movielens/...' paths.
main_location = "/content/gdrive/My Drive/Colab Notebooks/Collaborative filtering with side information"
os.chdir(main_location)

In [0]:
# NOTE(review): the install is unpinned. The cells below call CMF with arguments such as
# `reg_param`, `random_seed` and `offsets_model`; presumably these belong to a specific
# (older) cmfrec release — confirm which one and pin it here.
!pip install cmfrec

In [0]:
# Show the installed TensorFlow version.
# NOTE(review): TensorFlow is not referenced anywhere else in this notebook; presumably
# it is a dependency of the installed cmfrec — confirm.
import tensorflow as tf
tf.__version__

In [0]:
import numpy as np, pandas as pd, time, re
from datetime import datetime
from cmfrec import CMF

# Read the '::'-separated ratings file and discard the timestamp column.
ratings = pd.read_table(
    'movielens/ml-1m/ratings.dat',
    sep='::',
    engine='python',
    names=['UserId', 'ItemId', 'Rating', 'Timestamp'],
).drop(columns='Timestamp')
ratings.head(5)

In [0]:
# Hold out users and items: keep a random 75% of the distinct user ids and 75% of the
# distinct item ids (drawn without replacement). The initial training pool is every
# rating whose user AND item were both kept.
np.random.seed(1)
user_ids = ratings.UserId.drop_duplicates().values
item_ids = ratings.ItemId.drop_duplicates().values
users_train = set(np.random.choice(user_ids, size=int(user_ids.shape[0] * .75), replace=False))
items_train = set(np.random.choice(item_ids, size=int(item_ids.shape[0] * .75), replace=False))
train = ratings.loc[ratings.UserId.isin(users_train) & ratings.ItemId.isin(items_train)].reset_index(drop=True)


# Re-seed, then split the pool: 85% of its rows stay in `train`, the remaining rows
# become the warm-start test set.
# NOTE(review): presumably `DataFrame.sample` draws from NumPy's global RNG when no
# `random_state` is given, which is why the seed is reset here — confirm.
np.random.seed(1)
train_ix = train.sample(frac=.85).index
test_ix = np.setdiff1d(train.index.values, train_ix)
test_warm_start = train.loc[test_ix].reset_index(drop=True)
train = train.loc[train_ix].reset_index(drop=True)
# Recompute the id sets from the rows that actually remained in `train`, and keep only
# warm-start rows whose user and item both still appear there.
users_train = set(train.UserId)
items_train = set(train.ItemId)
test_warm_start = test_warm_start.loc[test_warm_start.UserId.isin(users_train) &
                                      test_warm_start.ItemId.isin(items_train)].reset_index(drop=True)

# Remaining evaluation sets, defined relative to the final training ids:
#   cold start -> neither the user nor the item is in train
#   new users  -> user not in train, item in train
#   new items  -> user in train, item not in train
test_cold_start = ratings.loc[~ratings.UserId.isin(users_train) & ~ratings.ItemId.isin(items_train)].reset_index(drop=True)
test_new_users = ratings.loc[(~ratings.UserId.isin(users_train)) & (ratings.ItemId.isin(items_train))].reset_index(drop=True)
test_new_items = ratings.loc[(ratings.UserId.isin(users_train)) & (~ratings.ItemId.isin(items_train))].reset_index(drop=True)
# Id sets captured from the frames as they are at this point.
test_new_items_set = set(test_new_items.ItemId)
users_coldstart = set(test_cold_start.UserId)
items_coldstart = set(test_cold_start.ItemId)

print("train.shape:\t{}".format(train.shape))
print("test_warm_start:\t{}".format(test_warm_start.shape))

print("Number of users_train:\t{}".format(len(users_train)))
print("Number of items_train:\t{}".format(len(items_train)))

In [0]:
# MovieLens-1M titles, keyed by the ItemId used in the ratings file.
movie_titles = pd.read_table('movielens/ml-1m/movies.dat', sep='::', engine='python', header=None)
movie_titles.columns = ['ItemId', 'title', 'genres']
movie_titles = movie_titles.loc[:, ['ItemId', 'title']]

# Kept for later, when recommended lists are printed with their titles.
movie_id_to_title = dict(zip(movie_titles.ItemId, movie_titles.title))

# Map ml-latest movieId -> ml-1m ItemId by joining the two catalogues on the title.
movies = (
    pd.read_csv('movielens/ml-latest/movies.csv')[['movieId', 'title']]
    .merge(movie_titles, on='title')
    [['movieId', 'ItemId']]
)

# Tag relevance scores: one row per movie, one "tag<id>" column per tag.
tags = pd.read_csv('movielens/ml-latest/genome-scores.csv')
tags_wide = tags.pivot(index='movieId', columns='tagId', values='relevance')
tags_wide.columns = [f"tag{tag_id}" for tag_id in tags_wide.columns.values]

item_side_info = (
    movies
    .merge(tags_wide, how='inner', left_on='movieId', right_index=True)
    .drop(columns='movieId')
)
items_w_sideinfo = set(item_side_info.ItemId)

# Restrict the item-dependent test sets to items that have tag information.
test_new_items = test_new_items[test_new_items.ItemId.isin(items_w_sideinfo)].reset_index(drop=True)
item_sideinfo_train = item_side_info[item_side_info.ItemId.isin(items_train)].reset_index(drop=True)
item_sideinfo_testnew = item_side_info[item_side_info.ItemId.isin(test_new_items_set)].reset_index(drop=True)
test_cold_start = test_cold_start[test_cold_start.ItemId.isin(items_w_sideinfo)].reset_index(drop=True)
item_sideinfo_train.head()

In [0]:
from sklearn.decomposition import PCA

# Reduce the tag columns to 50 principal components.
# `random_state` is fixed so that re-running this cell yields the same components:
# without it, a randomized SVD solver (which PCA may select for data of this size)
# draws from NumPy's global RNG, whose state depends on what ran before.
pca_obj = PCA(n_components=50, random_state=1)
item_sideinfo_reduced = item_side_info.drop(columns='ItemId')
pca_obj.fit(item_sideinfo_reduced)

item_sideinfo_pca = pd.DataFrame(pca_obj.transform(item_sideinfo_reduced))
item_sideinfo_pca.columns = ["pc" + str(i) for i in range(item_sideinfo_pca.shape[1])]
# Rows are in the same order as item_side_info, so the ids can be re-attached by position.
item_sideinfo_pca['ItemId'] = item_side_info.ItemId.values.copy()

# Subsets matching the train / new-items / cold-start item id sets.
item_sideinfo_pca_train = item_sideinfo_pca.loc[item_sideinfo_pca.ItemId.isin(items_train)].reset_index(drop=True)
item_sideinfo_pca_testnew = item_sideinfo_pca.loc[item_sideinfo_pca.ItemId.isin(test_new_items_set)].reset_index(drop=True)
item_sideinfo_pca_coldstart = item_sideinfo_pca.loc[item_sideinfo_pca.ItemId.isin(items_coldstart)].reset_index(drop=True)
item_sideinfo_pca_train.head()


In [0]:
# Number of new-item test ratings, then number of distinct items among them.
print(len(test_new_items))
print(len(test_new_items.ItemId.drop_duplicates()))

## Processing user demographic info

In [0]:
# State name -> abbreviation, taken from the states file.
zipcode_abbs = pd.read_csv("movielens/states.csv")
zipcode_abbs_dct = dict(zip(zipcode_abbs.State, zipcode_abbs.Abbreviation))

# Region name -> comma-separated list of state names.
us_regs_table = [
    ('New England', 'Connecticut, Maine, Massachusetts, New Hampshire, Rhode Island, Vermont'),
    ('Middle Atlantic', 'Delaware, Maryland, New Jersey, New York, Pennsylvania'),
    ('South', 'Alabama, Arkansas, Florida, Georgia, Kentucky, Louisiana, Mississippi, Missouri, North Carolina, South Carolina, Tennessee, Virginia, West Virginia'),
    ('Midwest', 'Illinois, Indiana, Iowa, Kansas, Michigan, Minnesota, Nebraska, North Dakota, Ohio, South Dakota, Wisconsin'),
    ('Southwest', 'Arizona, New Mexico, Oklahoma, Texas'),
    ('West', 'Alaska, California, Colorado, Hawaii, Idaho, Montana, Nevada, Oregon, Utah, Washington, Wyoming')
    ]
us_regs_table = [
    (region, [state_name.strip() for state_name in state_names.split(",")])
    for region, state_names in us_regs_table
]

# State abbreviation -> region name.
us_regs_dct = {
    zipcode_abbs_dct[state_name]: region
    for region, state_names in us_regs_table
    for state_name in state_names
}
        
# Zip code -> region lookup table.
zipcode_info = pd.read_csv("movielens/free-zipcode-database.csv")
# Collapse to one row per Zipcode (first non-null value of each column).
zipcode_info = zipcode_info.groupby('Zipcode').first().reset_index()

# Assign through a single `.loc[rows, column]` call. The previous chained form
# (`frame['col'].loc[mask] = ...`) writes into an intermediate Series, which pandas
# warns about and which does not update the frame when copy-on-write is enabled.
is_us = zipcode_info['Country'] == "US"
zipcode_info.loc[~is_us, 'State'] = 'UnknownOrNonUS'
zipcode_info['Region'] = zipcode_info['State'].copy()
# US rows: translate the state abbreviation into its region; states that are not in
# the lookup fall back to 'UsOther'.
zipcode_info.loc[is_us, 'Region'] = (
    zipcode_info.loc[is_us, 'Region']
    .map(lambda x: us_regs_dct[x] if x in us_regs_dct else 'UsOther')
)
zipcode_info = zipcode_info[['Zipcode', 'Region']]

# User demographics from the '::'-separated users file.
users = pd.read_table('movielens/ml-1m/users.dat', sep='::', names=["UserId", "Gender", "Age", "Occupation", "Zipcode"], engine='python')
# Drop everything from the first "-" onwards and convert to an integer so the value
# can be joined against zipcode_info. The builtin `int` replaces `np.int`, an alias
# that newer NumPy releases no longer provide; `str(x)` keeps this working even if
# the column was parsed as numbers.
users["Zipcode"] = users.Zipcode.map(lambda x: int(re.sub("-.*", "", str(x))))
users = pd.merge(users,zipcode_info,on='Zipcode',how='left')
# Zip codes without a match in zipcode_info get the catch-all region.
users['Region'] = users.Region.fillna('UnknownOrNonUS')

# Occupation and Age are converted to strings so that get_dummies one-hot encodes
# them as categories instead of passing them through as numbers.
users['Occupation'] = users.Occupation.map(lambda x: str(x))
users['Age'] = users.Age.map(lambda x: str(x))
user_side_info = pd.get_dummies(users[['UserId', 'Gender', 'Age', 'Occupation', 'Region']])
users_w_sideinfo = set(user_side_info.UserId)
# Keep only test rows whose *user* has side information. This filter previously
# compared ItemId against the set of user ids, i.e. it tested the wrong column.
test_new_users = test_new_users.loc[test_new_users.UserId.isin(users_w_sideinfo)].reset_index(drop=True)
user_sideinfo_train = user_side_info.loc[user_side_info.UserId.isin(users_train)].reset_index(drop=True)
test_cold_start = test_cold_start.loc[test_cold_start.UserId.isin(users_w_sideinfo)].reset_index(drop=True)
user_sideinfo_train.head()


In [0]:
# Number of new-user test ratings, then number of distinct users among them.
print(len(test_new_users))
print(len(test_new_users.UserId.drop_duplicates()))

In [0]:
# Cold-start test set: number of ratings, distinct users, distinct items.
print(len(test_cold_start))
print(len(test_cold_start.UserId.unique()))
print(len(test_cold_start.ItemId.unique()))

## Basic model - only movie ratings

In [0]:
%%time
from copy import deepcopy  # used by later cells
from cmfrec import CMF

# Baseline: factorization of the ratings only. The model is fitted on a copy of `train`.
model_no_side_info = CMF(k=40, reg_param=1e-4, random_seed=1)
model_no_side_info.fit(train.copy(deep=True))
test_warm_start['Predicted'] = model_no_side_info.predict(
    test_warm_start['UserId'], test_warm_start['ItemId']
)


In [0]:
# Root-mean-squared error of the predictions on the warm-start test set.
squared_error = (test_warm_start.Predicted - test_warm_start.Rating) ** 2
print("RMSE (no side info, warm start): ", np.sqrt(np.mean(squared_error)))

In [0]:
# Mean training rating per item, joined onto the warm-start test rows.
avg_ratings = train.groupby('ItemId')['Rating'].mean().to_frame('AvgRating')
test_ = test_warm_start.merge(avg_ratings, left_on='ItemId', right_index=True, how='left')

print('Averge movie rating:', test_.groupby('UserId')['Rating'].mean().mean())

# Each row is (label, column to rank by, ascending?). After sorting by user and that
# column, the first 10 rows of every user are kept and their actual ratings averaged.
reference_rows = (
    ('Average rating for top-10 rated by each user:', 'Rating', False),
    ('Average rating for bottom-10 rated by each user:', 'Rating', True),
    ('Average rating for top-10 recommendations of best-rated movies:', 'AvgRating', False),
)
model_rows = (
    ('Average rating for top-10 recommendations from this model:', 'Predicted', False),
    ('Average rating for bottom-10 (non-)recommendations from this model:', 'Predicted', True),
)
for section_no, section in enumerate((reference_rows, model_rows)):
    if section_no:
        print('----------------------')
    for label, rank_col, rank_ascending in section:
        ranked = test_.sort_values(['UserId', rank_col], ascending=rank_ascending)
        print(label, ranked.groupby('UserId')['Rating'].head(10).mean())

## Model with user side information

In [0]:
%%time
# Ratings + user attributes, using only the attribute rows of training users.
# Every attribute column except the id is declared as binary.
binary_user_cols = [cl for cl in user_side_info.columns if cl != 'UserId']
model_user_info = CMF(k=40, w_main=10.0, w_user=1.0, reg_param=1e-3, random_seed=1)
model_user_info.fit(
    train.copy(deep=True),
    user_info=user_sideinfo_train.copy(deep=True),
    cols_bin_user=binary_user_cols,
)
test_warm_start['Predicted'] = model_user_info.predict(
    test_warm_start['UserId'], test_warm_start['ItemId']
)

In [0]:
%%time
# Same configuration, but fitted with the attribute rows of ALL users, including
# users that have no ratings in `train`.
binary_user_cols = [cl for cl in user_side_info.columns if cl != 'UserId']
model_user_info_all = CMF(k=40, w_main=10.0, w_user=1.0, reg_param=1e-3, random_seed=1)
model_user_info_all.fit(
    train.copy(deep=True),
    user_info=user_side_info.copy(deep=True),
    cols_bin_user=binary_user_cols,
)
test_warm_start['PredictedAll'] = model_user_info_all.predict(
    test_warm_start['UserId'], test_warm_start['ItemId']
)
test_new_users['PredictedAll'] = model_user_info_all.predict(
    test_new_users['UserId'], test_new_users['ItemId']
)

In [0]:
# Pearson correlation between predicted and actual ratings, per configuration.
rho_rows = (
    ("Rho (user side info, warm start): ", test_warm_start, "Predicted"),
    ("Rho (user side info, warm start, extra users): ", test_warm_start, "PredictedAll"),
    # "new users, added afterwards" is left out: when the notebook is run top to
    # bottom, test_new_users has no 'Predicted' column yet at this point.
    ("Rho (user side info, users trained without ratings): ", test_new_users, "PredictedAll"),
)
for label, frame, pred_col in rho_rows:
    print(label, np.corrcoef(frame[pred_col], frame.Rating)[0][1])

In [0]:
# Mean training rating per item, joined onto the warm-start test rows.
avg_ratings = train.groupby('ItemId')['Rating'].mean().to_frame('AvgRating')
test_ = test_warm_start.merge(avg_ratings, left_on='ItemId', right_index=True, how='left')

print('Averge movie rating:', test_.groupby('UserId')['Rating'].mean().mean())

# Each row is (label, column to rank by, ascending?). After sorting by user and that
# column, the first 10 rows of every user are kept and their actual ratings averaged.
reference_rows = (
    ('Average rating for top-10 rated by each user:', 'Rating', False),
    ('Average rating for bottom-10 rated by each user:', 'Rating', True),
    ('Average rating for top-10 recommendations of best-rated movies:', 'AvgRating', False),
)
model_rows = (
    ('Average rating for top-10 recommendations from this model:', 'Predicted', False),
    ('Average rating for bottom-10 (non-)recommendations from this model:', 'Predicted', True),
)
for section_no, section in enumerate((reference_rows, model_rows)):
    if section_no:
        print('----------------------')
    for label, rank_col, rank_ascending in section:
        ranked = test_.sort_values(['UserId', rank_col], ascending=rank_ascending)
        print(label, ranked.groupby('UserId')['Rating'].head(10).mean())

In [0]:
print('Average rating for top-10 recommendations (per user) from this model per configuration')
# For each configuration: rank by prediction (descending), keep each user's first 10
# rows and average their actual ratings.
config_rows = (
    ('warm start: ', test_warm_start, 'Predicted'),
    ('warm start, extra users: ', test_warm_start, 'PredictedAll'),
    # "new users, added afterwards" is left out: when the notebook is run top to
    # bottom, test_new_users has no 'Predicted' column yet at this point.
    ('users trained without ratings: ', test_new_users, 'PredictedAll'),
)
for label, frame, pred_col in config_rows:
    ranked = frame.sort_values(['UserId', pred_col], ascending=False)
    print(label, ranked.groupby('UserId')['Rating'].head(10).mean())

In [0]:
%%time
# Offsets variant (offsets_model=True) fitted with the training users' attributes.
model_user_info2 = CMF(k=40, reg_param=1e-4, offsets_model=True, random_seed=1)
model_user_info2.fit(
    train.copy(deep=True),
    user_info=user_sideinfo_train.copy(deep=True),
)
test_warm_start['Predicted'] = model_user_info2.predict(
    test_warm_start['UserId'], test_warm_start['ItemId']
)

In [0]:
%%time
# Register every user of the new-users test set with the fitted model, passing that
# user's attribute row (without the id column) as a 1 x n_attributes array.
for u in test_new_users.UserId.unique():
    user_vec = (
        user_side_info.loc[user_side_info.UserId == u]
        .drop(columns='UserId')
        .values.reshape((1, -1))
    )
    model_user_info2.add_user(new_id=u, attributes=user_vec)
test_new_users['Predicted'] = model_user_info2.predict(
    test_new_users['UserId'], test_new_users['ItemId']
)

In [0]:
# Root-mean-squared error per evaluation set.
rmse_rows = (
    ("RMSE (user side info, warm start): ", test_warm_start),
    ("RMSE (user side info, new users, added afterwards): ", test_new_users),
)
for label, frame in rmse_rows:
    print(label, np.sqrt(np.mean((frame.Predicted - frame.Rating) ** 2)))

In [0]:
# Pearson correlation between predicted and actual ratings per evaluation set.
rho_rows = (
    ("Rho (user side info, warm start): ", test_warm_start),
    ("Rho (user side info, new users, added afterwards): ", test_new_users),
)
for label, frame in rho_rows:
    print(label, np.corrcoef(frame.Predicted, frame.Rating)[0][1])

In [0]:
# Mean training rating per item, joined onto the warm-start test rows.
avg_ratings = train.groupby('ItemId')['Rating'].mean().to_frame('AvgRating')
test_ = test_warm_start.merge(avg_ratings, left_on='ItemId', right_index=True, how='left')

print('Averge movie rating:', test_.groupby('UserId')['Rating'].mean().mean())

# Each row is (label, column to rank by, ascending?). After sorting by user and that
# column, the first 10 rows of every user are kept and their actual ratings averaged.
reference_rows = (
    ('Average rating for top-10 rated by each user:', 'Rating', False),
    ('Average rating for bottom-10 rated by each user:', 'Rating', True),
    ('Average rating for top-10 recommendations of best-rated movies:', 'AvgRating', False),
)
model_rows = (
    ('Average rating for top-10 recommendations from this model:', 'Predicted', False),
    ('Average rating for bottom-10 (non-)recommendations from this model:', 'Predicted', True),
)
for section_no, section in enumerate((reference_rows, model_rows)):
    if section_no:
        print('----------------------')
    for label, rank_col, rank_ascending in section:
        ranked = test_.sort_values(['UserId', rank_col], ascending=rank_ascending)
        print(label, ranked.groupby('UserId')['Rating'].head(10).mean())

In [0]:
print('Average rating for top-10 recommendations (per user) from this model per configuration')
# Rank by prediction (descending), keep each user's first 10 rows, average their ratings.
config_rows = (
    ('warm start: ', test_warm_start),
    ('new users, added afterwards: ', test_new_users),
)
for label, frame in config_rows:
    ranked = frame.sort_values(['UserId', 'Predicted'], ascending=False)
    print(label, ranked.groupby('UserId')['Rating'].head(10).mean())

## Model with item side information

In [0]:
%%time
# Ratings + item tag columns, using only the tag rows of training items.
model_item_info = CMF(
    k=35, k_main=15, k_item=10,
    reg_param=1e-3, w_main=10.0, w_item=0.5,
    random_seed=1,
)
model_item_info.fit(
    train.copy(deep=True),
    item_info=item_sideinfo_train.copy(deep=True),
)
test_warm_start['Predicted'] = model_item_info.predict(
    test_warm_start['UserId'], test_warm_start['ItemId']
)

In [0]:
%%time
# Same configuration, but fitted with the tag rows of ALL items, including items
# that have no ratings in `train`.
model_item_info_all = CMF(
    k=35, k_main=15, k_item=10,
    reg_param=1e-3, w_main=10.0, w_item=0.5,
    random_seed=1,
)
model_item_info_all.fit(train.copy(deep=True), item_info=item_side_info.copy(deep=True))
test_warm_start['PredictedAll'] = model_item_info_all.predict(
    test_warm_start['UserId'], test_warm_start['ItemId']
)
test_new_items['PredictedAll'] = model_item_info_all.predict(
    test_new_items['UserId'], test_new_items['ItemId']
)

In [0]:
%%time
# Variant with k_main=0 and k_item=0 and equal weights (5.0) on the ratings and
# item-attribute terms, again fitted with the tag rows of all items.
model_item_info_diffweight = CMF(
    k=50, k_main=0, k_item=0,
    reg_param=1e-3, w_main=5.0, w_item=5.0,
    random_seed=1,
)
model_item_info_diffweight.fit(train.copy(deep=True), item_info=item_side_info.copy(deep=True))
test_warm_start['PredictedAll2'] = model_item_info_diffweight.predict(
    test_warm_start['UserId'], test_warm_start['ItemId']
)
test_new_items['PredictedAll2'] = model_item_info_diffweight.predict(
    test_new_items['UserId'], test_new_items['ItemId']
)

In [0]:
# Root-mean-squared error per configuration.
rmse_rows = (
    ("RMSE (item side info, warm start): ", test_warm_start, "Predicted"),
    ("RMSE (item side info, warm start, extra items): ", test_warm_start, "PredictedAll"),
    ("RMSE (item side info, warm start, extra items, diff. weighting): ", test_warm_start, "PredictedAll2"),
    # "new items, added afterwards" is left out: when the notebook is run top to
    # bottom, test_new_items has no 'Predicted' column yet at this point.
    ("RMSE (item side info, items trained without ratings): ", test_new_items, "PredictedAll"),
    ("RMSE (item side info, items trained without ratings, diff. weighting): ", test_new_items, "PredictedAll2"),
)
for label, frame, pred_col in rmse_rows:
    print(label, np.sqrt(np.mean((frame[pred_col] - frame.Rating) ** 2)))


In [0]:
# Pearson correlation between predicted and actual ratings, per configuration.
rho_rows = (
    ("Rho (item side info, warm start): ", test_warm_start, "Predicted"),
    ("Rho (item side info, warm start, extra items): ", test_warm_start, "PredictedAll"),
    ("Rho (item side info, warm start, extra items, diff. weighting): ", test_warm_start, "PredictedAll2"),
    # "new items, added afterwards" is left out: when the notebook is run top to
    # bottom, test_new_items has no 'Predicted' column yet at this point.
    ("Rho (item side info, items trained without ratings): ", test_new_items, "PredictedAll"),
    ("Rho (item side info, items trained without ratings, diff. weighting): ", test_new_items, "PredictedAll2"),
)
for label, frame, pred_col in rho_rows:
    print(label, np.corrcoef(frame[pred_col], frame.Rating)[0][1])

In [0]:
# Mean training rating per item, joined onto the warm-start test rows.
avg_ratings = train.groupby('ItemId')['Rating'].mean().to_frame('AvgRating')
test_ = test_warm_start.merge(avg_ratings, left_on='ItemId', right_index=True, how='left')

print('Averge movie rating:', test_.groupby('UserId')['Rating'].mean().mean())

# Each row is (label, column to rank by, ascending?). After sorting by user and that
# column, the first 10 rows of every user are kept and their actual ratings averaged.
reference_rows = (
    ('Average rating for top-10 rated by each user:', 'Rating', False),
    ('Average rating for bottom-10 rated by each user:', 'Rating', True),
    ('Average rating for top-10 recommendations of best-rated movies:', 'AvgRating', False),
)
model_rows = (
    ('Average rating for top-10 recommendations from this model:', 'Predicted', False),
    ('Average rating for bottom-10 (non-)recommendations from this model:', 'Predicted', True),
)
for section_no, section in enumerate((reference_rows, model_rows)):
    if section_no:
        print('----------------------')
    for label, rank_col, rank_ascending in section:
        ranked = test_.sort_values(['UserId', rank_col], ascending=rank_ascending)
        print(label, ranked.groupby('UserId')['Rating'].head(10).mean())

In [0]:
print('Average rating for top-10 recommendations (per user) from this model per configuration')
# For each configuration: rank by prediction (descending), keep each user's first 10
# rows and average their actual ratings.
config_rows = (
    ('warm start: ', test_warm_start, 'Predicted'),
    ('warm start, extra items: ', test_warm_start, 'PredictedAll'),
    # "new items, added afterwards" is left out: when the notebook is run top to
    # bottom, test_new_items has no 'Predicted' column yet at this point.
    ('items trained without ratings: ', test_new_items, 'PredictedAll'),
    ('items trained without ratings, diff. weighting: ', test_new_items, 'PredictedAll2'),
)
for label, frame, pred_col in config_rows:
    ranked = frame.sort_values(['UserId', pred_col], ascending=False)
    print(label, ranked.groupby('UserId')['Rating'].head(10).mean())

### Offsets model

In [0]:
%%time
# Offsets variant (offsets_model=True) fitted with the PCA-reduced tag columns of the
# training items.
model_item_info2 = CMF(k=50, reg_param=1e-4, offsets_model=True, random_seed=1)
model_item_info2.fit(
    train.copy(deep=True),
    item_info=item_sideinfo_pca_train.copy(deep=True),
)
test_warm_start['Predicted'] = model_item_info2.predict(
    test_warm_start['UserId'], test_warm_start['ItemId']
)

In [0]:
%%time
# Register every item of the new-items test set with the fitted model, passing that
# item's PCA row (without the id column) as a 1 x n_components array.
for i in test_new_items.ItemId.unique():
    item_vec = (
        item_sideinfo_pca.loc[item_sideinfo_pca.ItemId == i]
        .drop(columns='ItemId')
        .values.reshape((1, -1))
    )
    model_item_info2.add_item(new_id=i, attributes=item_vec)
test_new_items['Predicted'] = model_item_info2.predict(
    test_new_items['UserId'], test_new_items['ItemId']
)

In [0]:
# Root-mean-squared error per evaluation set.
rmse_rows = (
    ("RMSE (item side info, warm start): ", test_warm_start),
    ("RMSE (item side info, new items, added afterwards): ", test_new_items),
)
for label, frame in rmse_rows:
    print(label, np.sqrt(np.mean((frame.Predicted - frame.Rating) ** 2)))

In [0]:
# Pearson correlation between predicted and actual ratings per evaluation set.
rho_rows = (
    ("Rho (item side info, warm start): ", test_warm_start),
    ("Rho (item side info, new items, added afterwards): ", test_new_items),
)
for label, frame in rho_rows:
    print(label, np.corrcoef(frame.Predicted, frame.Rating)[0][1])

In [0]:
# Mean training rating per item, joined onto the warm-start test rows.
avg_ratings = train.groupby('ItemId')['Rating'].mean().to_frame('AvgRating')
test_ = test_warm_start.merge(avg_ratings, left_on='ItemId', right_index=True, how='left')

print('Averge movie rating:', test_.groupby('UserId')['Rating'].mean().mean())

# Each row is (label, column to rank by, ascending?). After sorting by user and that
# column, the first 10 rows of every user are kept and their actual ratings averaged.
reference_rows = (
    ('Average rating for top-10 rated by each user:', 'Rating', False),
    ('Average rating for bottom-10 rated by each user:', 'Rating', True),
    ('Average rating for top-10 recommendations of best-rated movies:', 'AvgRating', False),
)
model_rows = (
    ('Average rating for top-10 recommendations from this model:', 'Predicted', False),
    ('Average rating for bottom-10 (non-)recommendations from this model:', 'Predicted', True),
)
for section_no, section in enumerate((reference_rows, model_rows)):
    if section_no:
        print('----------------------')
    for label, rank_col, rank_ascending in section:
        ranked = test_.sort_values(['UserId', rank_col], ascending=rank_ascending)
        print(label, ranked.groupby('UserId')['Rating'].head(10).mean())

In [0]:
print('Average rating for top-10 recommendations (per user) from this model per configuration')
# Rank by prediction (descending), keep each user's first 10 rows, average their ratings.
config_rows = (
    ('warm start: ', test_warm_start),
    ('new items, added afterwards: ', test_new_items),
)
for label, frame in config_rows:
    ranked = frame.sort_values(['UserId', 'Predicted'], ascending=False)
    print(label, ranked.groupby('UserId')['Rating'].head(10).mean())

## Full model

In [0]:
%%time
# Ratings + user attributes + item tags, fitted with the side information of ALL
# users and items. Every user attribute column except the id is declared as binary.
binary_user_cols = [cl for cl in user_side_info.columns if cl != 'UserId']
model_user_item_info = CMF(
    k=40, k_main=10, k_user=5, k_item=15,
    w_main=1.0, w_user=2.0, w_item=0.05,
    reg_param=5e-5, random_seed=1,
)
model_user_item_info.fit(
    train.copy(deep=True),
    user_info=user_side_info.copy(deep=True),
    item_info=item_side_info.copy(deep=True),
    cols_bin_user=binary_user_cols,
)

# Score the four evaluation sets in the same order as before.
test_warm_start['PredictedAll'] = model_user_item_info.predict(
    test_warm_start['UserId'], test_warm_start['ItemId']
)
test_cold_start['PredictedAll'] = model_user_item_info.predict(
    test_cold_start['UserId'], test_cold_start['ItemId']
)
test_new_users['PredictedAll'] = model_user_item_info.predict(
    test_new_users['UserId'], test_new_users['ItemId']
)
test_new_items['PredictedAll'] = model_user_item_info.predict(
    test_new_items['UserId'], test_new_items['ItemId']
)

In [0]:
# Root-mean-squared error of the 'PredictedAll' column per evaluation set.
rmse_rows = (
    ("RMSE (user and item side info, warm start, extra users and items): ", test_warm_start),
    ("RMSE (user and item side info, cold start): ", test_cold_start),
    ("RMSE (user and item side info, users trained without ratings, extra items): ", test_new_users),
    ("RMSE (user and item side info, items trained without ratings, extra users): ", test_new_items),
)
for label, frame in rmse_rows:
    print(label, np.sqrt(np.mean((frame.PredictedAll - frame.Rating) ** 2)))

In [0]:
# Pearson correlation between 'PredictedAll' and the actual ratings per evaluation set.
rho_rows = (
    ("Rho (user and item side info, warm start, extra users and items): ", test_warm_start),
    ("Rho (user and item side info, cold start): ", test_cold_start),
    ("Rho (user and item side info, users trained without ratings, extra items): ", test_new_users),
    ("Rho (user and item side info, items trained without ratings, extra users): ", test_new_items),
)
for label, frame in rho_rows:
    print(label, np.corrcoef(frame.PredictedAll, frame.Rating)[0][1])

In [0]:
print('Average rating for top-10 recommendations (per user) from this model per configuration')
# Rank by 'PredictedAll' (descending), keep each user's first 10 rows, average their ratings.
config_rows = (
    ('warm start, extra users and items: ', test_warm_start),
    ('cold start: ', test_cold_start),
    ('users trained without ratings: ', test_new_users),
    ('items trained without ratings: ', test_new_items),
)
for label, frame in config_rows:
    ranked = frame.sort_values(['UserId', 'PredictedAll'], ascending=False)
    print(label, ranked.groupby('UserId')['Rating'].head(10).mean())

In [0]:
%%time
# Offsets variant (offsets_model=True) with both kinds of side information, fitted
# with the attribute rows of training users and PCA rows of training items only.
model_user_item_info2 = CMF(k=50, reg_param=5e-3, offsets_model=True, random_seed=1)
model_user_item_info2.fit(
    train.copy(deep=True),
    user_info=user_sideinfo_train.copy(deep=True),
    item_info=item_sideinfo_pca_train.copy(deep=True),
)
test_warm_start['Predicted'] = model_user_item_info2.predict(
    test_warm_start['UserId'], test_warm_start['ItemId']
)

In [0]:
%%time
# Register every user that appears in the new-users or cold-start test sets.
new_user_ids = np.unique(np.r_[test_new_users.UserId, test_cold_start.UserId])
for u in new_user_ids:
    user_vec = (
        user_side_info.loc[user_side_info.UserId == u]
        .drop(columns='UserId')
        .values.reshape((1, -1))
    )
    model_user_item_info2.add_user(new_id=u, attributes=user_vec)

# Register every item that appears in the new-items or cold-start test sets; ids with
# no row in item_sideinfo_pca are skipped.
new_item_ids = np.unique(np.r_[test_new_items.ItemId.unique(), test_cold_start.ItemId.unique()])
for i in new_item_ids:
    item_vec = item_sideinfo_pca.loc[item_sideinfo_pca.ItemId == i]
    if item_vec.shape[0] == 0:
        continue
    item_vec = item_vec.drop(columns='ItemId').values.reshape((1, -1))
    model_user_item_info2.add_item(new_id=i, attributes=item_vec)

test_new_users['Predicted'] = model_user_item_info2.predict(
    test_new_users['UserId'], test_new_users['ItemId']
)
test_new_items['Predicted'] = model_user_item_info2.predict(
    test_new_items['UserId'], test_new_items['ItemId']
)
test_cold_start['Predicted'] = model_user_item_info2.predict(
    test_cold_start['UserId'], test_cold_start['ItemId']
)

In [0]:
# Root-mean-squared error of the 'Predicted' column per evaluation set.
rmse_rows = (
    ("RMSE (user and item side info, warm start, extra users and items): ", test_warm_start),
    ("RMSE (user and item side info, cold start, users and items added afterwards): ", test_cold_start),
    ("RMSE (user and item side info, users added afterwards): ", test_new_users),
    ("RMSE (user and item side info, items added afterwards): ", test_new_items),
)
for label, frame in rmse_rows:
    print(label, np.sqrt(np.mean((frame.Predicted - frame.Rating) ** 2)))

In [0]:
# Pearson correlation between 'Predicted' and the actual ratings per evaluation set.
rho_rows = (
    ("Rho (user and item side info, warm start, extra users and items): ", test_warm_start),
    ("Rho (user and item side info, cold start, users and items added afterwards): ", test_cold_start),
    ("Rho (user and item side info, users added afterwards): ", test_new_users),
    ("Rho (user and item side info, items added afterwards): ", test_new_items),
)
for label, frame in rho_rows:
    print(label, np.corrcoef(frame.Predicted, frame.Rating)[0][1])

In [0]:
print('Average rating for top-10 recommendations (per user) from this model per configuration')
# Rank by 'Predicted' (descending), keep each user's first 10 rows, average their ratings.
config_rows = (
    ('warm start: ', test_warm_start),
    ('cold start: ', test_cold_start),
    ('users added afterwards: ', test_new_users),
    ('items added afterwards: ', test_new_items),
)
for label, frame in config_rows:
    ranked = frame.sort_values(['UserId', 'Predicted'], ascending=False)
    print(label, ranked.groupby('UserId')['Rating'].head(10).mean())

## Examining some recommendations

In [0]:
from collections import defaultdict

# aggregate statistics
# Per-movie mean rating and rating count in the training data. Both lookups default
# to 0 for movies that have no training ratings.
ratings_by_movie = train.groupby('ItemId')['Rating']
avg_movie_rating = defaultdict(lambda: 0)
avg_movie_rating.update(ratings_by_movie.mean().items())
num_ratings_per_movie = defaultdict(lambda: 0)
num_ratings_per_movie.update(ratings_by_movie.size().items())

# function to print recommended lists more nicely
def print_reclist(reclist):
    """Print a ranked list of movie ids, one line per movie.

    Each line holds the 1-based position, the title looked up in
    ``movie_id_to_title``, the value from ``avg_movie_rating`` rounded to two
    decimals, and the value from ``num_ratings_per_movie``.
    """
    lines = []
    for position, movie_id in enumerate(reclist, start=1):
        lines.append(" - ".join((
            str(position) + ")",
            movie_id_to_title[movie_id],
            "Average Rating: " + str(np.round(avg_movie_rating[movie_id], 2)),
            "Number of ratings: " + str(num_ratings_per_movie[movie_id]),
        )))
    print("\n".join(lines))


In [0]:
# Top-10 lists for user 948 from each fitted model, leaving out movies the user has
# already rated (exclude_seen=True). All lists are computed before anything is printed.
topn_args = dict(user=948, n=10, exclude_seen=True)
reclist1 = model_no_side_info.topN(**topn_args)
reclist2 = model_user_info_all.topN(**topn_args)
reclist3 = model_item_info_all.topN(**topn_args)
reclist4 = model_user_item_info.topN(**topn_args)
reclist5 = model_user_item_info2.topN(**topn_args)

sections = (
    ('Recommendations from ratings-only model:', reclist1),
    ('Recommendations from ratings + user demographics model:', reclist2),
    ('Recommendations from ratings + movie tags model:', reclist3),
    ('Recommendations from ratings + user demographics + movie tags model:', reclist4),
    ('Recommendations from ratings + user demographics + movie tags model (alternative formulation):', reclist5),
)
for section_no, (title, recs) in enumerate(sections):
    if section_no:
        print("------")
    print(title)
    print_reclist(recs)

In [0]:
# Top-10 lists for user 1. Only the models that received user side information are
# queried; for the ratings-only model and the ratings + movie tags model this
# request was marked as "not possible with this model".
# NOTE(review): presumably because user 1 has no ratings in `train` — confirm.
reclist2 = model_user_info_all.topN(user=1, n=10)
reclist4 = model_user_item_info.topN(user=1, n=10)
reclist5 = model_user_item_info2.topN(user=1, n=10)

sections = (
    ('Recommendations from ratings + user demographics model:', reclist2),
    ('Recommendations from ratings + user demographics + movie tags model:', reclist4),
    ('Recommendations from ratings + user demographics + movie tags model (alternative formulation):', reclist5),
)
for section_no, (title, recs) in enumerate(sections):
    if section_no:
        print("------")
    print(title)
    print_reclist(recs)