# Sample Content based recommender with crossvalidation for attributes 

We make use of the Microsoft anon web data set: https://archive.ics.uci.edu/ml/datasets/Anonymous+Microsoft+Web+Data in this data set there are 3 types of unstructured records (A = Attribute, C = Case and V = vote)

In [4]:
path = "/Users/jonathangreve/Downloads/anonymous-msweb.test"
import pandas as pd

In [6]:
raw_data = pd.read_csv(path, header=None, skiprows=7)
raw_data.head()

Unnamed: 0,0,1,2,3,4
0,A,1277,1,NetShow for PowerPoint,/stream
1,A,1253,1,MS Word Development,/worddev
2,A,1109,1,TechNet (World Wide Web Edition),/technet
3,A,1038,1,SiteBuilder Network Membership,/sbnmember
4,A,1205,1,Hardware Supprt,/hardwaresupport


In [7]:
user_activity = raw_data.loc[raw_data[0] != "A"] 

In [8]:
user_activity = user_activity.loc[:,:1]

In [9]:
user_activity.columns = ['category', 'value']

In [11]:
user_activity.head(10)

Unnamed: 0,category,value
294,C,10001
295,V,1038
296,V,1026
297,V,1034
298,C,10002
299,V,1008
300,V,1056
301,V,1032
302,C,10003
303,V,1064


Above shows strictly the user activity information

Next we need to create a user-item-rating matrix

In [13]:
temp = 0
nextRow = False
lastIndex = user_activity.index[len(user_activity)-1]

20484

The below snippet will add unique columns such as user_id and web_id to the user activiy data frame to show a user ID and web activity

In [15]:
for index, row in user_activity.iterrows():
    if(index <= lastIndex):
        if(user_activity.loc[index,'category'] == "C"):
            tmp = 0
            userId = user_activity.loc[index,'value']
            user_activity.loc[index, 'userId'] = userId
            user_activity.loc[index, 'webId'] = userId
            tmp = userId
            nextRow = True
        elif(user_activity.loc[index,'category'] != "C" and nextRow == True):
            webId = user_activity.loc[index,'value']
            user_activity.loc[index, 'webId'] = webId
            user_activity.loc[index, 'userId'] = tmp
            if(index != lastIndex and user_activity.loc[index+1, 'category'] == "C"):
                nextRow = False
                caseId = 0
            

In [16]:
user_activity = user_activity[user_activity['category']== "V"]
user_activity.head()

Unnamed: 0,category,value,userId,webId
295,V,1038,10001.0,1038.0
296,V,1026,10001.0,1026.0
297,V,1034,10001.0,1034.0
299,V,1008,10002.0,1008.0
300,V,1056,10002.0,1056.0


Above we removed the unwanted rows, ie "C" in the data column

In [17]:
user_activity = user_activity[['userId','webId']]
user_activity_sort = user_activity.sort_values(by='webId', ascending=True)

Next we create a dense binary rating matrix using 

In [18]:
sLength = len(user_activity_sort['webId'])

In [19]:
import numpy as np
user_activity_sort['rating'] = pd.Series(np.ones((sLength,)),index=user_activity.index) 

We now use a Pivot to create a binary rating matrix

In [20]:
rateMat = user_activity_sort.pivot(index='userId', columns='webId', values='rating').fillna(0)

and now we create a dense matrix (dense matrices store everything including zero values)

In [22]:
rateMat = rateMat.to_dense().as_matrix()

We create an item profile from the initial raw data (for an item profile we only need 2 columns so we slice the data frames as webId and desc

In [24]:
items = raw_data.loc[raw_data[0] == "A"]
items.columns = ['record','webId','vote','desc','url']
items = items[['webId','desc']]
items.head()

Unnamed: 0,webId,desc
0,1277,NetShow for PowerPoint
1,1253,MS Word Development
2,1109,TechNet (World Wide Web Edition)
3,1038,SiteBuilder Network Membership
4,1205,Hardware Supprt


In [25]:
items2 = items[items['webId'].isin(user_activity['webId'].tolist())]
items_sort = items2.sort_values(by='webId', ascending=True)
items_sort.head(5)

Unnamed: 0,webId,desc
113,1000,regwiz
40,1001,Support Desktop
278,1002,End User Produced View
102,1003,Knowledge Base
243,1004,Microsoft.com Search


Here we are gonna make use of the SKLearn package (the number of features depends on the dataset and can be learned by the cross validation approach

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer(stop_words = "english", max_features = 100, ngram_range= (0,3),sublinear_tf = True)
x = v.fit_transform(items_sort['desc'])
itemProfile = x.todense()
itemProfile

matrix([[ 1.        ,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.32213709,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.43709646,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ..., 
        [ 0.38159493,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.30073274,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.36402686,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

With the item profile and user activity now we need the product of hte 2 in a new matrix using scipy linalg by doing a cosin similarity between user and item profile

In [34]:
from scipy import linalg, dot
userProfile = dot(rateMat, itemProfile)/linalg.norm(rateMat)/linalg.norm(itemProfile)
userProfile

matrix([[ 0.00062937,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.00089668,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.00144708,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        ..., 
        [ 0.00046412,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.00067229,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ],
        [ 0.00079067,  0.        ,  0.        , ...,  0.        ,
          0.        ,  0.        ]])

user SK learn to compute the cosin calculations between user profile and item profile

In [36]:
import sklearn.metrics as skme
similarityCalc = skme.pairwise.cosine_similarity(userProfile,itemProfile, dense_output= True)
similarityCalc

array([[ 0.54168902,  0.17449812,  0.23677035, ...,  0.20670579,
         0.16290362,  0.19718935],
       [ 0.78844617,  0.25398775,  0.34462703, ...,  0.30086706,
         0.23711158,  0.28701558],
       [ 0.63172381,  0.20350167,  0.27612424, ...,  0.29413451,
         0.18998003,  0.22996444],
       ..., 
       [ 0.56969503,  0.1835199 ,  0.24901168, ...,  0.21739274,
         0.17132595,  0.20738429],
       [ 0.49394733,  0.15911875,  0.21590263, ...,  0.1884878 ,
         0.14854613,  0.17981009],
       [ 0.86518334,  0.27870764,  0.37816858, ...,  0.33014958,
         0.26018896,  0.31494998]])

Next we convert to a binary 0,1 by removing 0 vales from results we have probible items we can recommend to the users

In [37]:
final_pred = np.where(similarityCalc>0.6,1,0)
final_pred[1]

array([1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0])

so for a user (id 213) the recomended items 

In [38]:
indexes_of_user = np.where(final_pred[213] == 1)
indexes_of_user

(array([  9,  37,  68, 152]),)