

## Assignment 3:  Recommendation systems

### Here we'll implement a content-based recommendation algorithm.
### It will use the list of genres for a movie as the content.
### The data come from the MovieLens project: http://grouplens.org/datasets/movielens/

### Please only use these imports.


In [242]:
# coding: utf-8
from collections import Counter, defaultdict
import math
import numpy as np
import os
import pandas as pd
import re
from scipy.sparse import csr_matrix
import urllib.request
import zipfile

In [243]:
def download_data():
    """ DONE. Download and unzip data.
    """
    url = 'https://www.dropbox.com/s/h9ubx22ftdkyvd5/ml-latest-small.zip?dl=1'
    urllib.request.urlretrieve(url, 'ml-latest-small.zip')
    zfile = zipfile.ZipFile('ml-latest-small.zip')
    zfile.extractall()
    zfile.close()

In [244]:

def tokenize_string(my_string):
    """ DONE. You should use this in your tokenize function.
    """
    return re.findall('[\w\-]+', my_string.lower())

In [245]:

def tokenize(movies):
    """
    Append a new column to the movies DataFrame with header 'tokens'.
    This will contain a list of strings, one per token, extracted
    from the 'genre' field of each movie. Use the tokenize_string method above.

    Note: you may modify the movies parameter directly; no need to make
    a new copy.
    Params:
      movies...The movies DataFrame
    Returns:
      The movies DataFrame, augmented to include a new column called 'tokens'.

    >>> movies = pd.DataFrame([[123, 'Horror|Romance'], [456, 'Sci-Fi']], columns=['movieId', 'genres'])
    >>> movies = tokenize(movies)
    >>> movies['tokens'].tolist()
    [['horror', 'romance'], ['sci-fi']]
    """
    ###TODO
    tokens = [tokenize_string(t) for t in movies['genres'].tolist()]
    movies['tokens']=tokens
    return movies

movies = pd.DataFrame([[123, 'Horror|Horror|Romance'], [456, 'Sci-Fi'], [789, 'Sci-Fi|Drama']], columns=['movieId', 'genres'])
movies = tokenize(movies)
movies['tokens'].tolist()
vocab_keys = sorted(set(sum(movies['tokens'].tolist(),[])))
print (vocab_keys)
vocabulary = defaultdict(lambda: len(vocabulary))
for key in vocab_keys:
    vocabulary[key]
print (dict(vocabulary))
vocab_values = [v for v in range(len(vocab_keys))]
vocab = dict(zip(vocab_keys,vocab_values))
count = Counter(sum(movies['tokens'].tolist(),[]))
for m in movies.iterrows():
    print (m[1]['movieId'])
    c = Counter(m[1]['tokens'])
    print (c,c['horror'])
print (vocab)

In [246]:
def featurize(movies):
    """
    Append a new column to the movies DataFrame with header 'features'.
    Each row will contain a csr_matrix of shape (1, num_features). Each
    entry in this matrix will contain the tf-idf value of the term, as
    defined in class:
    tfidf(i, d) := tf(i, d) / max_k tf(k, d) * log10(N/df(i))
    where:
    i is a term
    d is a document (movie)
    tf(i, d) is the frequency of term i in document d
    max_k tf(k, d) is the maximum frequency of any term in document d
    N is the number of documents (movies)
    df(i) is the number of unique documents containing term i

    Params:
      movies...The movies DataFrame
    Returns:
      A tuple containing:
      - The movies DataFrame, which has been modified to include a column named 'features'.
      - The vocab, a dict from term to int. Make sure the vocab is sorted alphabetically as in a2 (e.g., {'aardvark': 0, 'boy': 1, ...})
    """
    ###TODO
    vocab_keys = sorted(set(sum(movies['tokens'].tolist(),[])))
    # print (vocab_keys)
    
#     vocab_values = [v for v in range(len(vocab_keys))]
#     vocab = dict(zip(vocab_keys,vocab_values))
    vocabulary = defaultdict(lambda: len(vocabulary))
    for key in vocab_keys:
        vocabulary[key]
    vocab = dict(vocabulary)
    num_d = movies.shape[0]
    features = []
    for m in movies.iterrows():
        m_tokens = m[1]['tokens']
        c = Counter(m_tokens)
#         print(c.items())
        tf = c.values()
        max_tf = c.most_common(1)[0][1]
#         print (tf)
        df = len(c.items())
        temp = max_tf*np.log10(num_d/df)
        tfidf = [t/temp for t in tf]
        col_idx = [vocab[f] for f in c.keys()]
        data = np.zeros(len(vocab))
        data[col_idx] = tfidf
        features.extend(csr_matrix(data))
#         print(csr_matrix(data).toarray())
    movies['features'] = features
    return (movies, vocab)
    pass



In [384]:
movies = pd.DataFrame([[123, 'Horror|Horror|Romance'], [456, 'Sci-Fi'], [789, 'Sci-Fi|Drama']], columns=['movieId', 'genres'])
movies = tokenize(movies)
movies,vocab = featurize(movies)
features = movies.loc[0]['features']
f = features[0]
print('features\n',features)
print('value', f)
# print('vocab',vocab)
print('type', type(f))
# cosine_sim(features[2],features[1])

features
   (0, 1)	5.67887358727
  (0, 2)	2.83943679363
value   (0, 1)	5.67887358727
  (0, 2)	2.83943679363
type <class 'scipy.sparse.csr.csr_matrix'>


In [199]:
v1 = [ 0, 5.67887359, 2.83943679, 0]
v2 = [ 5.67887359,0, 0, 5.67887359]
np.dot(v1, v2)

0.0

In [247]:
def train_test_split(ratings):
    """DONE.
    Returns a random split of the ratings matrix into a training and testing set.
    """
    test = set(range(len(ratings))[::1000])
    train = sorted(set(range(len(ratings))) - test)
    test = sorted(test)
    return ratings.iloc[train], ratings.iloc[test]

In [248]:
def cosine_sim(a, b):
    """
    Compute the cosine similarity between two 1-d csr_matrices.
    Each matrix represents the tf-idf feature vector of a movie.
    Params:
      a...A csr_matrix with shape (1, number_features)
      b...A csr_matrix with shape (1, number_features)
    Returns:
      The cosine similarity, defined as: dot(a, b) / ||a|| * ||b||
      where ||a|| indicates the Euclidean norm (aka L2 norm) of vector a.
    """
    ###TODO
    v1 = a.toarray()
    v2 = b.toarray()
    norms = np.linalg.norm(v1)*np.linalg.norm(v2)
    return np.dot(v1[0],v2[0])/norms
    pass


In [239]:
cosine_sim(features[2],features[1])

0.70710678118654746

In [412]:
def make_predictions(movies, ratings_train, ratings_test):
    """
    Using the ratings in ratings_train, predict the ratings for each
    row in ratings_test.

    To predict the rating of user u for movie i: Compute the weighted average
    rating for every other movie that u has rated.  Restrict this weighted
    average to movies that have a positive cosine similarity with movie
    i. The weight for movie m corresponds to the cosine similarity between m
    and i.

    If there are no other movies with positive cosine similarity to use in the
    prediction, use the mean rating of the target user in ratings_train as the
    prediction.

    Params:
      movies..........The movies DataFrame.
      ratings_train...The subset of ratings used for making predictions. These are the "historical" data.
      ratings_test....The subset of ratings that need to predicted. These are the "future" data.
    Returns:
      A numpy array containing one predicted rating for each element of ratings_test.
    """
    ###TODO
    num = ratings_test.shape[0]
    pre_rating = np.zeros(num)
    rate_mean = defaultdict(lambda: -1)
    for i in range(num):
        r_test = ratings_test.iloc[i]
        user = r_test['userId']
        pre_movie = r_test['movieId']
        pre_feature = movies[(movies.movieId == pre_movie)]['features'].item()
#         print('pre',pre_feature.item())
#         print('type',type(pre_feature.item()))
        u_ratings_train = ratings_train[ratings_train.userId == user]
        train_movies = u_ratings_train['movieId']
        if user not in rate_mean:
            avg_rating = u_ratings_train['rating'].mean()
            rate_mean[user]=avg_rating
#         cslist = []
        sum_rating = 0
        sum_cos = 0
        for u_m in train_movies:
            train_feature = movies[(movies.movieId == u_m)]['features'].item()  
            rating = u_ratings_train[u_ratings_train.movieId == u_m]['rating'].item()
            sum_rating += cosine_sim(pre_feature, train_feature)*rating
            sum_cos += cosine_sim(pre_feature, train_feature)
        if sum_cos is 0:
            pre = -1
        else:
            pre = sum_rating/sum_cos
#             print(movies[(movies.movieId == u_m)])
#             train_feature = movies[(movies.movieId == u_m)]['features'].item()       
#             cslist.append((u_m, cosine_sim(pre_feature, train_feature)))
#         pre_m = sorted(cslist, key=lambda x: x[1], reverse=True)[0]
#         print(pre)
        if pre > 0:
            pre_rating[i] =pre
        else:
            pre_rating[i] = rate_mean[user]
#     print('mean of users:',rate_mean.items())
    return pre_rating
    pass  


In [413]:
def mean_absolute_error(predictions, ratings_test):
    """DONE.
    Return the mean absolute error of the predictions.
    """
    return np.abs(predictions - np.array(ratings_test.rating)).mean()


def main():
    download_data()
    path = 'ml-latest-small'
    ratings = pd.read_csv(path + os.path.sep + 'ratings.csv')
    movies = pd.read_csv(path + os.path.sep + 'movies.csv')
    movies = tokenize(movies)
    movies, vocab = featurize(movies)
    print(sorted(vocab.items())[:10])
    ratings_train, ratings_test = train_test_split(ratings)
    print('%d training ratings; %d testing ratings' % (len(ratings_train), len(ratings_test)))
    predictions = make_predictions(movies, ratings_train, ratings_test)
    print('error=%f' % mean_absolute_error(predictions, ratings_test))
    print(predictions[:10])


if __name__ == '__main__':
    main()


[('action', 0), ('adventure', 1), ('animation', 2), ('children', 3), ('comedy', 4), ('crime', 5), ('documentary', 6), ('drama', 7), ('fantasy', 8), ('film-noir', 9)]
99903 training ratings; 101 testing ratings




error=0.789847
[ 2.64185496  2.57464639  2.73540562  4.1070431   3.2113471   4.18485263
  3.94769278  3.95200736  3.30050115  3.54732792]


In [267]:
data = [('a', 0),('b',-1),('c', 1),('d', 0),('e', 0)]
sorted(data, key=lambda x: x[1],reverse = True)[0][0]

'c'

mean of users: dict_items([(1.0, 2.5526315789473686), (516.0, 3.4594594594594597), (262.0, 2.6111111111111112), (520.0, 3.3823529411764706), (266.0, 3.736842105263158), (15.0, 2.622791519434629), (17.0, 3.7430939226519335), (275.0, 4.4601990049751246), (529.0, 3.5290215588723051), (533.0, 3.3518518518518516), (22.0, 3.2739726027397262), (28.0, 4.2653061224489797), (285.0, 3.1638954869358669), (30.0, 3.7658415841584159), (543.0, 4.333333333333333), (547.0, 3.3660527417329424), (292.0, 3.9566787003610107), (550.0, 3.5820895522388061), (41.0, 3.8661616161616164), (301.0, 3.3409090909090908), (559.0, 4.4140625), (306.0, 3.408385093167702), (564.0, 3.5535905680600215), (311.0, 3.0068762278978389), (58.0, 3.3888888888888888), (315.0, 2.4814814814814814), (575.0, 3.3919413919413919), (608.0, 3.9491525423728815), (580.0, 3.271986970684039), (71.0, 4.2727272727272725), (73.0, 3.3738346799254195), (330.0, 3.4680851063829787), (75.0, 3.2777777777777777), (592.0, 3.8787878787878789), (83.0, 3.9281250000000001), (597.0, 4.0199004975124382), (345.0, 3.73943661971831), (91.0, 4.1677852348993287), (570.0, 3.6574803149606301), (367.0, 3.3425925925925926), (353.0, 2.5942307692307693), (102.0, 3.9748892171344163), (358.0, 3.1866883116883118), (97.0, 3.0275590551181102), (617.0, 3.1216216216216215), (111.0, 3.5073529411764706), (50.0, 3.2888888888888888), (119.0, 3.5015624999999999), (378.0, 3.2328767123287672), (380.0, 3.365819209039548), (637.0, 4.166666666666667), (127.0, 4.0499999999999998), (384.0, 3.2334710743801653), (388.0, 3.6472819216182049), (133.0, 2.3785310734463279), (649.0, 3.50561797752809), (394.0, 2.7965686274509802), (654.0, 4.0712000000000002), (664.0, 3.7953667953667956), (146.0, 3.5555555555555554), (405.0, 3.6545012165450124), (152.0, 3.4308755760368665), (411.0, 3.7307692307692308), (671.0, 3.9166666666666665), (624.0, 2.8958453548759375), (162.0, 3.4137931034482758), (422.0, 3.9761904761904763), (170.0, 2.54), (427.0, 3.8890784982935154), (433.0, 3.8299492385786804), (182.0, 3.8153846153846156), (585.0, 4.2474916387959869), (442.0, 4.2321428571428568), (189.0, 2.6914285714285713), (452.0, 3.1893203883495147), (198.0, 3.7905405405405403), (457.0, 2.5028089887640448), (460.0, 3.7155477031802122), (205.0, 3.4121951219512194), (463.0, 3.3589211618257262), (212.0, 3.1080000000000001), (213.0, 2.6617161716171616), (470.0, 3.3014705882352939), (472.0, 3.7882991556091676), (220.0, 3.5555555555555554), (479.0, 4.0510204081632653), (483.0, 3.5847457627118646), (294.0, 3.5845665961945032), (232.0, 3.9662261380323054), (468.0, 2.9666666666666668), (239.0, 3.7068965517241379), (497.0, 3.8076923076923075), (243.0, 3.3954248366013071), (505.0, 3.2715736040609138), (250.0, 4.2727272727272725), (509.0, 3.3671366594360088), (605.0, 3.0458715596330275)])

In [256]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [366]:
frame[frame.year== 2000]['pop'].values

array([ 1.5])

In [303]:
for f in frame.iterater:
    print(f['pop'])

SyntaxError: invalid syntax (<ipython-input-303-0cb9db621b20>, line 1)