In [1]:
from collections import Counter, defaultdict
import math
import numpy as np
import os
import pandas as pd
import re
from scipy.sparse import csr_matrix

In [2]:
def tokenize_string(my_string):
    """
    Split a string into lowercase tokens.

    A token is a maximal run of word characters (letters, digits,
    underscore) and/or '&'; every other character acts as a separator.
    This is used in tokenize function.
    Params:
      my_string...The text to split.
    Returns:
      A list of lowercase token strings, in order of appearance.
    >>> tokenize_string("Rock & Roll, R&B!")
    ['rock', '&', 'roll', 'r&b']
    """
    # Raw string: in a plain literal '\w' and '\&' are invalid escape
    # sequences. '&' needs no escaping inside a character class.
    return re.findall(r'[\w&]+', my_string.lower())

In [3]:
def tokenize(songs, column='title'):
    """
    Append a new column to the songs DataFrame with header 'tokens'.
    Each row will contain a list of strings, one per token, extracted
    from the text in `column` of that row with tokenize_string.
    Missing or non-string values produce an empty token list.
    Note: the songs parameter is modified directly; no copy is made.
    Params:
      songs....The songs DataFrame
      column...Name of the text column to tokenize (default 'title').
    Returns:
      The songs DataFrame, augmented to include a new column called 'tokens'.
    >>> songs = pd.DataFrame([[1, 'Linkin Park'], [2, 'Rock & Roll']], columns=['book_id', 'title'])
    >>> songs = tokenize(songs)
    >>> songs['tokens'].tolist()
    [['linkin', 'park'], ['rock', '&', 'roll']]
    """
    songs['tokens'] = [
        # Guard against NaN/None cells, which have no .lower()
        tokenize_string(text) if isinstance(text, str) else []
        for text in songs[column]
    ]
    return songs

In [4]:
def featurize(songs):
    """
    Append a new column to the songs DataFrame with header 'features'.
    Each row will contain a csr_matrix of shape (1, num_features). Each
    entry in this matrix will contain the tf-idf value of the term:
    tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N/df(i))
    where:
    i is a term
    d is a document (song)
    tf(i, d) is the frequency of term i in document d
    max_k tf(k, d) is the maximum frequency of any term in document d
    N is the number of documents (songs)
    df(i) is the number of unique documents containing term i
    Params:
      songs...The songs DataFrame. It must already have a 'tokens' column
              holding one sequence of token strings per row. A bare string
              is treated as a single-token document. Modified in place.
    Returns:
      A tuple containing:
      - The songs DataFrame, which has been modified to include a column named 'features'.
      - The vocab, a dict from term to int, with indices assigned in
        alphabetical order of the terms (e.g., {'aardvark': 0, 'boy': 1, ...})
    """
    # Normalise every document to a list of terms. Iterating a bare string
    # would split it into characters, so wrap it as a one-term document.
    docs = [[doc] if isinstance(doc, str) else list(doc) for doc in songs['tokens']]
    n_docs = len(docs)

    # Term frequencies per document, and document frequency per term,
    # each computed in a single pass.
    term_counts = [Counter(doc) for doc in docs]
    doc_freq = Counter()
    for counts in term_counts:
        doc_freq.update(counts.keys())

    vocab = {term: idx for idx, term in enumerate(sorted(doc_freq))}
    n_features = len(vocab)

    # Object array filled element by element so each cell holds one matrix.
    feature_column = np.empty(n_docs, dtype=object)
    for pos, counts in enumerate(term_counts):
        if not counts:
            # Document without terms: all-zero feature vector.
            feature_column[pos] = csr_matrix((1, n_features), dtype=float)
            continue
        max_tf = max(counts.values())
        cols = [vocab[term] for term in counts]
        data = [
            freq / max_tf * math.log10(n_docs / doc_freq[term])
            for term, freq in counts.items()
        ]
        row = csr_matrix((data, ([0] * len(cols), cols)),
                         shape=(1, n_features), dtype=float)
        # A term present in every document has idf 0; do not store it.
        row.eliminate_zeros()
        feature_column[pos] = row

    songs['features'] = feature_column
    return (songs, vocab)

In [5]:
def train_test_split(ratings):
    """
    Split the ratings DataFrame into a training and a testing set.

    The split is deterministic: the rows at positions 0, 1000, 2000, ...
    form the test set and every other row forms the training set.
    Params:
      ratings...The ratings DataFrame.
    Returns:
      A tuple (train, test) of DataFrames, each in original row order.
    """
    n_rows = len(ratings)
    test_positions = list(range(0, n_rows, 1000))
    held_out = set(test_positions)
    train_positions = [pos for pos in range(n_rows) if pos not in held_out]
    return ratings.iloc[train_positions], ratings.iloc[test_positions]

In [6]:
def cosine_sim(a, b):
    """
    Compute the cosine similarity between two 1-d csr_matrices.
    Each matrix represents a tf-idf feature vector.
    Params:
      a...A csr_matrix with shape (1, number_features)
      b...A csr_matrix with shape (1, number_features)
    Returns:
      The cosine similarity as a float, defined as:
      dot(a, b) / (||a|| * ||b||)
      where ||a|| indicates the Euclidean norm (aka L2 norm) of vector a.
      If either vector has zero norm the similarity is undefined and
      0.0 is returned.
    """
    # Stay sparse: element-wise products touch only the stored entries.
    norm_a = math.sqrt(a.multiply(a).sum())
    norm_b = math.sqrt(b.multiply(b).sum())
    denominator = norm_a * norm_b
    if denominator == 0:
        # Avoid 0/0 for an all-zero vector.
        return 0.0
    return float(a.multiply(b).sum() / denominator)

In [7]:
def make_predictions(songs, ratings_train, ratings_test):
    """
    Using the ratings in ratings_train, predict the ratings for each
    row in ratings_test.

    For a (user, book) pair the prediction is the average of the user's
    training ratings, weighted by the cosine similarity between each
    rated book and the target book; only positive similarities count.
    Fallbacks:
      - no positively similar rated book (or the target book has no
        features): the mean of the user's training ratings;
      - the user has no training ratings: the mean of all training ratings.

    Params:
      songs..........The songs DataFrame, with 'book_id' and 'features' columns.
      ratings_train...The subset of ratings used for making predictions. These are the "historical" data.
                      Columns 'user_id', 'book_id' and 'rating' are read.
      ratings_test....The subset of ratings that need to be predicted. These are the "future" data.
                      Columns 'user_id' and 'book_id' are read.
    Returns:
      A numpy array containing one predicted rating for each element of ratings_test.
    """
    # book_id -> feature vector; the first row wins if an id is repeated.
    features_by_book = {}
    for book_id, features in zip(songs['book_id'], songs['features']):
        features_by_book.setdefault(book_id, features)

    # Gather each test user's (book_id, rating) history once. Keeping the
    # pair together guarantees every similarity is weighted by the rating
    # of the same book.
    test_users = set(ratings_test['user_id'])
    history = ratings_train[ratings_train['user_id'].isin(test_users)]
    rated_by_user = defaultdict(list)
    for user_id, book_id, rating in zip(history['user_id'],
                                        history['book_id'],
                                        history['rating']):
        rated_by_user[user_id].append((book_id, rating))

    global_mean = ratings_train['rating'].mean() if len(ratings_train) else float('nan')

    result = []
    for user_id, book_id in zip(ratings_test['user_id'], ratings_test['book_id']):
        rated = rated_by_user.get(user_id, [])
        target = features_by_book.get(book_id)

        weighted_sum = 0.0
        weight_total = 0.0
        if target is not None:
            for rated_book, rating in rated:
                rated_features = features_by_book.get(rated_book)
                if rated_features is None:
                    continue  # rated book has no feature vector
                similarity = cosine_sim(rated_features, target)
                if similarity > 0:
                    weighted_sum += similarity * rating
                    weight_total += similarity

        if weight_total > 0:
            result.append(weighted_sum / weight_total)
        elif rated:
            result.append(np.mean([rating for _, rating in rated]))
        else:
            result.append(global_mean)
    return np.array(result)

In [8]:
def mean_absolute_error(predictions, ratings_test):
    """
    Compute the mean absolute error of the predictions.

    Params:
      predictions....Array of predicted ratings, one per row of ratings_test.
      ratings_test...Object whose `rating` attribute holds the true ratings.
    Returns:
      The average of |prediction - true rating| over all rows.
    """
    actual = np.array(ratings_test.rating)
    absolute_errors = np.abs(predictions - actual)
    return absolute_errors.mean()

In [10]:
# Load the four CSV tables from the ../data directory (path is relative
# to the notebook's working directory).
# NOTE(review): in the visible cells `book_tags` is never used and `tags`
# is only displayed; only `books` and `ratings` feed the recommender.
books = pd.read_csv("../data/books.csv")
book_tags = pd.read_csv("../data/book_tags.csv")
tags = pd.read_csv("../data/tags.csv")
ratings = pd.read_csv("../data/ratings.csv")

In [11]:
# Remove pandas' row-display limit so DataFrames are rendered in full.
# This is a global setting and affects every later display in the session.
pd.set_option("display.max_rows", None)

In [12]:
# Show the distinct tag names as a one-column table (np.unique returns
# them sorted).
pd.DataFrame(np.unique(tags['tag_name']))

Unnamed: 0,0
0,-
1,--1-
2,--10-
3,--12-
4,--122-
5,--166-
6,--17-
7,--19-
8,--2-
9,--258-


In [13]:
# Preview the first rows of the raw books table.
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [14]:
# Preview the first rows of the ratings table (book_id, user_id, rating).
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [15]:
# Keep only the identifier and the text columns.
# NOTE(review): this drops the `id` column. The ratings preview shows
# book_id 1, while the books preview shows id 1 alongside book_id 2767052;
# confirm that ratings.book_id really joins on books.book_id and not on
# books.id, since make_predictions matches on 'book_id'.
books = books[['book_id','title','authors']]

In [16]:
# Preview the reduced books table.
books.head()

Unnamed: 0,book_id,title,authors
0,2767052,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins
1,3,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré"
2,41865,"Twilight (Twilight, #1)",Stephenie Meyer
3,2657,To Kill a Mockingbird,Harper Lee
4,4671,The Great Gatsby,F. Scott Fitzgerald


In [17]:
# Keep a single row per book_id (drop_duplicates keeps the first
# occurrence by default).
books = books.drop_duplicates(['book_id'])

In [18]:
# Check the (rows, columns) size after de-duplication.
books.shape

(10000, 3)

In [None]:
# Add the 'tokens' column to books (tokenize returns the DataFrame it
# was given) and preview the result.
books = tokenize(books)
books.head()

In [None]:
# Add the 'features' column and obtain the term -> index vocabulary.
books, vocab = featurize(books)

In [None]:
# Preview books with the new 'features' column.
books.head()

In [None]:
# Display the whole vocabulary dict returned by featurize.
vocab

In [None]:
# Print the first 100 (term, index) pairs of the vocabulary, sorted by term.
# NOTE(review): the 'Favorites:' label does not describe what is printed.
print('Favorites:')
print(sorted(vocab.items())[:100])

In [None]:
# Hold out every 1000th rating row for testing; the rest is training data.
ratings_train, ratings_test = train_test_split(ratings)

In [None]:
# Report the split sizes, predict a rating for every held-out row, and
# print the mean absolute error followed by the first 100 predictions.
print('%d training ratings; %d testing ratings' % (len(ratings_train), len(ratings_test)))
predictions = make_predictions(books, ratings_train, ratings_test)
print('error=%f' % mean_absolute_error(predictions, ratings_test))
print(predictions[:100])