# Hybrid Models for Recommendation Systems

Load Pandas, we are going to need it for manipulating data

In [5]:
import pandas as pd
import numpy as np
from IPython.display import Image
np.set_printoptions(precision = 3)

Now load the data

In [9]:
user_ratings_df = pd.read_csv("user_ratings.csv")
user_features_df = pd.read_csv("user_features.csv")
item_features_df = pd.read_csv("item_features.csv")


In [13]:
#Printing the first data set
print("The First data set is the following:\n",user_ratings_df)

The First data set is the following:
    The Call of Cthulhu   Frankenstein   Dracula   Neuromancer   Space Odyssey
0                  8.0            2.0       NaN           5.0             4.0
1                  3.0            2.0       NaN           7.0             7.0
2                  9.0            NaN       7.0           8.0             5.0
3                  NaN            NaN       7.0           8.0             9.0
4                  NaN            1.0       8.0           3.0             7.0
5                  2.0            3.0       5.0           NaN             NaN
6                  4.0            2.0       NaN           2.0             7.0
7                  7.0            1.0       2.0           7.0             9.0
8                  3.0            3.0       NaN           7.0             3.0
9                  4.0            NaN       5.0           3.0             3.0


In [14]:
#Printing the second data set
print("The Second data set is the following:\n",user_features_df)

The Second data set is the following:
    Sex   Over60
0  1.0      0.0
1  0.0      1.0
2  0.0      0.0
3  1.0      0.0
4  0.0      1.0
5  0.0      0.0
6  0.0      0.0
7  1.0      0.0
8  0.0      1.0
9  1.0      0.0


In [15]:
#Printing the third data set
print("The Third data set is the following:\n",item_features_df)

The Third data set is the following:
    Critic0   Critic1
0      0.3       0.9
1      0.9       0.3
2      0.6       0.4
3      0.2       0.1
4      0.7       0.8


In [25]:
user_features_df["key"] = 0
user_features_df["user_id"] = range(0,user_features_df.shape[0])
item_features_df["key"] = 0
item_features_df["item_id"] = range(0,item_features_df.shape[0])

merged_df = pd.merge(user_features_df, item_features_df,left_index=True,on="key")
merged_df[["item_id", "user_id"]]




merged_df["rating"] = map(lambda ids: user_ratings_df.values[ids[1]][ids[2]], 
                          merged_df[["user_id", "item_id"]].itertuples())

print(merged_df)



   Sex   Over60  key  user_id  Critic0   Critic1  item_id  \
0  1.0      0.0    0        0      0.3       0.9        0   
1  1.0      0.0    0        0      0.9       0.3        1   
2  1.0      0.0    0        0      0.6       0.4        2   
3  1.0      0.0    0        0      0.2       0.1        3   
4  1.0      0.0    0        0      0.7       0.8        4   
0  0.0      1.0    0        1      0.3       0.9        0   
1  0.0      1.0    0        1      0.9       0.3        1   
2  0.0      1.0    0        1      0.6       0.4        2   
3  0.0      1.0    0        1      0.2       0.1        3   
4  0.0      1.0    0        1      0.7       0.8        4   
0  0.0      0.0    0        2      0.3       0.9        0   
1  0.0      0.0    0        2      0.9       0.3        1   
2  0.0      0.0    0        2      0.6       0.4        2   
3  0.0      0.0    0        2      0.2       0.1        3   
4  0.0      0.0    0        2      0.7       0.8        4   
0  1.0      0.0    0    

In [28]:
train = merged_df.dropna()
test = merged_df[merged_df.isnull().any(axis=1)]
#print (test.to_latex())

print("The train head is:\n", train.head())
print("The test head is:\n",test.head())

The train head is:
    Sex   Over60  key  user_id  Critic0   Critic1  item_id  \
0  1.0      0.0    0        0      0.3       0.9        0   
1  1.0      0.0    0        0      0.9       0.3        1   
2  1.0      0.0    0        0      0.6       0.4        2   
3  1.0      0.0    0        0      0.2       0.1        3   
4  1.0      0.0    0        0      0.7       0.8        4   

                               rating  
0  <map object at 0x0000023F79F587B8>  
1  <map object at 0x0000023F79F587B8>  
2  <map object at 0x0000023F79F587B8>  
3  <map object at 0x0000023F79F587B8>  
4  <map object at 0x0000023F79F587B8>  
The test head is:
 Empty DataFrame
Columns: [Sex,  Over60, key, user_id, Critic0,  Critic1, item_id, rating]
Index: []


In [31]:
n_latent_features = 2

user_ratings = user_ratings_df.values
latent_user_preferences = np.random.random((user_ratings.shape[0], n_latent_features))
latent_item_features = np.random.random((user_ratings.shape[1],n_latent_features))

user_features = user_features_df.values
item_features = item_features_df.values

print (item_features_df.to_latex())


user_features = np.concatenate([np.ones(shape = (user_features.shape[0],1)), user_features], axis = 1)
item_features = np.concatenate([np.ones(shape = (item_features.shape[0],1)), item_features], axis = 1)



user_features_weights = np.random.random((user_ratings.shape[0], user_features.shape[1] ))
item_features_weights = np.random.random((user_ratings.shape[1],item_features.shape[1] ))


# print user_features
#Printing the second data set
print("The user_features data set is the following:\n",user_features_df)

\begin{tabular}{lrrrr}
\toprule
{} &  Critic0 &   Critic1 &  key &  item\_id \\
\midrule
0 &      0.3 &       0.9 &    0 &        0 \\
1 &      0.9 &       0.3 &    0 &        1 \\
2 &      0.6 &       0.4 &    0 &        2 \\
3 &      0.2 &       0.1 &    0 &        3 \\
4 &      0.7 &       0.8 &    0 &        4 \\
\bottomrule
\end{tabular}

The user_features data set is the following:
    Sex   Over60  key  user_id
0  1.0      0.0    0        0
1  0.0      1.0    0        1
2  0.0      0.0    0        2
3  1.0      0.0    0        3
4  0.0      1.0    0        4
5  0.0      0.0    0        5
6  0.0      0.0    0        6
7  1.0      0.0    0        7
8  0.0      1.0    0        8
9  1.0      0.0    0        9


In [33]:
def predict_rating(user_id,item_id):
    """ Predict a rating given a user_id and an item_id.
    """
    user_preference = latent_user_preferences[user_id]
    item_preference = latent_item_features[item_id]
    
    user_score = user_features_weights[user_id].dot(user_features[user_id])
    item_score = item_features_weights[item_id].dot(item_features[item_id])
    #print user_preference.dot(item_preference), user_score, item_score
    return user_preference.dot(item_preference) + user_score + item_score

def train(user_id, item_id, rating,alpha = 0.001, 
                                   latent_feature_weight_decay = 0.1, 
                                   user_weight_decay = 0.01,
                                   item_weight_decay = 0.0001):
    
    #print item_id
    prediction_rating = predict_rating(user_id, item_id)
    err =  ( prediction_rating - rating );
    #print err
    user_pref_values = latent_user_preferences[user_id][:]
    latent_user_preferences[user_id] -= alpha * err *  ( latent_item_features[item_id] + latent_feature_weight_decay*latent_user_preferences[user_id])
    latent_item_features[item_id] -= alpha * err * ( user_pref_values + latent_feature_weight_decay*latent_item_features[item_id])
    
    user_features_weights[user_id] -=alpha * err *(  user_features[user_id] + user_weight_decay* user_features_weights[user_id])
    item_features_weights[item_id] -=alpha * err * ( item_features_weights[item_id] + item_weight_decay* item_features_weights[item_id])
    
    
    return(err)
    


def sgd(iterations = 30000):
    """ Iterate over all users and all items and train for 
        a certain number of iterations
    """
    for iteration in range(0,iterations):
        error = []
        for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                rating = user_ratings[user_id][item_id]
                if(not np.isnan(rating)):
                    err = train(user_id,item_id,rating)
                    error.append(err)
    mse = (np.array(error) ** 2).mean()          
    print(mse)
                    
                    
    
                    
                    
    


In [34]:
for _ in range(0,10): 
    sgd()

0.37085184087283307
0.36629401137359413
0.36435734143161
0.3632596998242436
0.36254562175103305
0.36204425703610077
0.3616761113334738
0.3613988288550506
0.36118766701835486
0.3610272030551586


In [36]:
predictions = np.zeros(shape = (latent_user_preferences.shape[0], latent_item_features.shape[0]) )
#print latent_user_preferences
print( user_features_weights)
print (item_features_weights)
for user_id in range(0,latent_user_preferences.shape[0]):
            for item_id in range(0,latent_item_features.shape[0]):
                predictions[user_id,item_id] =  predict_rating(user_id,item_id)

  

[[-0.019  0.771  0.417  0.449  0.809]
 [ 0.902  0.565  0.981  0.393  0.246]
 [ 0.44   0.816  0.065  0.67   0.885]
 [ 1.061  0.751  0.605  0.3    1.129]
 [ 1.291  0.42   0.811  1.001  2.109]
 [ 0.13   0.956  0.355  0.062  0.413]
 [ 0.6    0.855  0.442  0.459  0.105]
 [ 0.304  0.561  0.234  0.025  0.338]
 [ 0.33   0.624  0.049  0.744  0.008]
 [ 0.615  0.79   0.91   0.918 -0.263]]
[[0.84  1.391 1.767 3.186 2.143]
 [0.036 0.009 0.005 0.036 0.029]
 [0.104 0.178 0.171 0.075 0.454]
 [1.873 0.924 1.052 0.92  0.452]
 [0.976 0.743 1.112 0.699 0.641]]


In [42]:
values = [zip(user_ratings[i], predictions[i]) for i in range(0,predictions.shape[0])]
comparison_data = pd.DataFrame(values)
comparison_data.columns = user_ratings_df.columns
#comparison_data.applymap(lambda (x,y): "(%2.3f|%2.3f)"%(x,y))

In [43]:
comparison_data

Unnamed: 0,The Call of Cthulhu,Frankenstein,Dracula,Neuromancer,Space Odyssey
0,"(8.0, 8.00470812932014)","(2.0, 1.5444963841621342)","(nan, 5.91945815922451)","(5.0, 5.378557699633641)","(4.0, 4.070081014112371)"
1,"(3.0, 2.992303132283301)","(2.0, 2.7581171923169086)","(nan, 4.648677110285858)","(7.0, 6.372235547412158)","(7.0, 6.884324310569557)"
2,"(9.0, 8.947704145260381)","(nan, 3.102847725364155)","(7.0, 7.5650239399093735)","(8.0, 6.963205776388715)","(5.0, 5.5265170779479265)"
3,"(nan, 17.48938656595775)","(nan, 4.588868189508462)","(7.0, 7.000563793515084)","(8.0, 7.996068899547945)","(9.0, 9.002238339793468)"
4,"(nan, 107.4778819008189)","(1.0, 1.0163020519409165)","(8.0, 7.992586084866161)","(3.0, 2.998322411127182)","(7.0, 6.991793787668115)"
5,"(2.0, 1.9998302208164622)","(3.0, 2.9999518648017873)","(5.0, 5.0010180808196685)","(nan, 6.6575164515773135)","(nan, 6.972098279732448)"
6,"(4.0, 4.017227619765737)","(2.0, 0.3000531812949788)","(nan, -1.1832642864263767)","(2.0, 3.3495609372220434)","(7.0, 7.254537331539927)"
7,"(7.0, 6.983126481264769)","(1.0, 2.57416469177384)","(2.0, 2.0965615558509403)","(7.0, 5.753439789286005)","(9.0, 8.780433194381066)"
8,"(3.0, 3.0016949655314327)","(3.0, 2.806629003710535)","(nan, 9.685859703938819)","(7.0, 7.14174409556327)","(3.0, 3.031140915100524)"
9,"(4.0, 4.0519631458894425)","(nan, 0.10340629990493463)","(5.0, 4.44198193073372)","(3.0, 3.9881627355189044)","(3.0, 2.519713309171633)"


In [41]:
d = comparison_data.to_latex()
text_file = open("comparison.txt", "w")
text_file.write(d)
text_file.close()