# Chapter 19
## Recommender systems

In [1]:
import os
import zipfile
import codecs
import collections
import numpy as np
import pandas as pd
from urllib import request
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

### Getting ranking data

In [2]:
DirName = 'ml-1m'
Filename = 'ml-1m.zip'
if not os.path.exists(DirName):
    url = "http://files.grouplens.org/datasets/movielens/"
    request.urlretrieve(url + Filename, Filename)
    with zipfile.ZipFile(Filename, 'r') as zip_ref:
        zip_ref.extractall()

In [3]:
users = pd.read_table('ml-1m/users.dat', sep='::', header=None, names=['user_id', 'gender', 'age', 'occupation', 'zip'], engine='python')
ratings = pd.read_table('ml-1m/ratings.dat', sep='::', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'], engine='python')
movies = pd.read_table('ml-1m/movies.dat', sep='::', header=None, names=['movie_id', 'title', 'genres'], engine='python', encoding='latin1')
movieLens = pd.merge(pd.merge(ratings, users), movies)

In [4]:
print(movieLens.set_index(['title', 'rating']).groupby(level='rating').count()['user_id'])

rating
1     56174
2    107557
3    261197
4    348971
5    226310
Name: user_id, dtype: int64


In [5]:
indUser = movieLens.set_index(['movie_id', 'user_id']).groupby(level='user_id').count()['title']
indMovie = movieLens.set_index(['user_id', 'title']).groupby(level='title').count()['movie_id']

print(f"Average movie number of reviews per person: {indUser.mean():.4}")
print('\nNumber of reviews per movie\n' + str(indMovie))

Average movie number of reviews per person: 165.6

Number of reviews per movie
title
$1,000,000 Duck (1971)                         37
'Night Mother (1986)                           70
'Til There Was You (1997)                      52
'burbs, The (1989)                            303
...And Justice for All (1979)                 199
                                             ... 
Zed & Two Noughts, A (1985)                    29
Zero Effect (1998)                            301
Zero Kelvin (Kjærlighetens kjøtere) (1995)      2
Zeus and Roxanne (1997)                        23
eXistenZ (1999)                               410
Name: movie_id, Length: 3706, dtype: int64


In [6]:
movie = 260
selMovie = movieLens[movieLens['movie_id'] == movie]
print(f"{selMovie.size} users gave {selMovie['title'].iloc[0]} an average rating of {selMovie['rating'].mean():.4}")

29910 users gave Star Wars: Episode IV - A New Hope (1977) an average rating of 4.454


In [7]:
file = codecs.open('msweb', 'r')

attribute = collections.namedtuple('page', ['id', 'description', 'url'])
attributes = {}

current_user_id = None
current_user_ids = []
user_visits = {}
page_visits = {}

for line in file:
    chunks = line.split(',')
    entry_type = chunks[0]
    if entry_type == 'A':
        _, id, _, description, url = chunks
        attributes[int(id)] = attribute(id=int(id), description=description, url=url)
    
    if entry_type == 'C':
        if current_user_id is not None:
            user_visits[current_user_id] = set(current_user_ids)
            current_user_ids = []
        current_user_id = int(chunks[2])
    
    if entry_type == 'V':
        page_id = int(chunks[1])
        current_user_ids.append(page_id)
        page_visits.setdefault(page_id, [])
        page_visits[page_id].append(current_user_id)

print('Total number of attributes: ', len(attributes.keys()))
print('Total number of Users: ', len(user_visits.keys()))
print('Total number of VRoots: ', len(page_visits.keys()))

Total number of attributes:  294
Total number of Users:  32710
Total number of VRoots:  285


### Data preparation

In [8]:
reduced_movies = movieLens[movieLens['rating'] >= 3.0].drop(columns=['movie_id', 'timestamp', 'genres', 'gender', 'age', 'occupation', 'zip'])

display(reduced_movies.head())
print(f"\nOriginal shape: {movieLens.shape}\nNew shape: {reduced_movies.shape}")

Unnamed: 0,user_id,rating,title
0,1,5,One Flew Over the Cuckoo's Nest (1975)
1,2,5,One Flew Over the Cuckoo's Nest (1975)
2,12,4,One Flew Over the Cuckoo's Nest (1975)
3,15,4,One Flew Over the Cuckoo's Nest (1975)
4,17,5,One Flew Over the Cuckoo's Nest (1975)



Original shape: (1000209, 10)
New shape: (836478, 3)


In [9]:
reduced_movies = reduced_movies[reduced_movies.groupby('title')['rating'].transform('size') > 1000]

print(reduced_movies.groupby('title')['rating'].count().sort_values().head())
print('\nNew shape: ', reduced_movies.shape)

title
Few Good Men, A (1992)    1003
My Cousin Vinny (1992)    1003
Boogie Nights (1997)      1004
Witness (1985)            1009
Sneakers (1992)           1009
Name: rating, dtype: int64

New shape:  (237212, 3)


### Collaborative filtering

In [10]:
user_rating = pd.pivot_table(reduced_movies, index='user_id', columns='title', values='rating')
display(user_rating.head())

title,2001: A Space Odyssey (1968),"Abyss, The (1989)","African Queen, The (1951)",Airplane! (1980),Aladdin (1992),Alien (1979),Aliens (1986),Amadeus (1984),American Beauty (1999),American Pie (1999),...,Twelve Monkeys (1995),"Untouchables, The (1987)","Usual Suspects, The (1995)",When Harry Met Sally... (1989),Who Framed Roger Rabbit? (1988),Willy Wonka and the Chocolate Factory (1971),Witness (1985),"Wizard of Oz, The (1939)",X-Men (2000),Young Frankenstein (1974)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,4.0,4.0,,,,,,...,,,,,,,,4.0,,
2,,,,,,,,5.0,4.0,,...,,4.0,,,,,,,,
3,,,,,,,,,4.0,,...,,,,,,,,,,
4,,,,,,4.0,,,,,...,,,,,,,,,,
5,,,,,,,,,4.0,,...,4.0,,5.0,,4.0,,,4.0,,


In [11]:
YF_ratings = user_rating['Young Frankenstein (1974)']
print(YF_ratings.sort_values(ascending=False).head())

user_id
10      5.0
2537    5.0
4904    5.0
4901    5.0
2486    5.0
Name: Young Frankenstein (1974), dtype: float64


In [12]:
print(user_rating.corrwith(YF_ratings).sort_values(ascending=False).head())

title
Young Frankenstein (1974)                       1.000000
Blazing Saddles (1974)                          0.412395
Alien (1979)                                    0.297567
Willy Wonka and the Chocolate Factory (1971)    0.272574
M*A*S*H (1970)                                  0.259304
dtype: float64


### Integrating text and behaviour

In [13]:
for i, (k, v), in enumerate(attributes.items()):
    if i > 9:
        for _ in range(3): print('.')
        break
    print(f"{v.id:4} {v.description:30.30} {v.url}", end='')

1287 "International AutoRoute"      "/autoroute"
1288 "library"                      "/library"
1289 "Master Chef Product Informati "/masterchef"
1297 "Central America"              "/centroam"
1215 "For Developers Only Info"     "/developer"
1279 "Multimedia Golf"              "/msgolf"
1239 "Microsoft Consulting"         "/msconsult"
1282 "home"                         "/home"
1251 "Reference Support"            "/referencesupport"
1121 "Microsoft Magazine"           "/magazine"
.
.
.


In [14]:
nbr_visits = list(map(len, user_visits.values()))
average_visits = sum(nbr_visits) / len(nbr_visits)
one_visit = sum(x == 1 for x in nbr_visits)

print('Number of user visits: ', sum(nbr_visits))
print('Average number of visits: {:.3}'.format(average_visits))
print('Users woth just one visit: ', one_visit)

Number of user visits:  98653
Average number of visits: 3.02
Users woth just one visit:  9994


In [15]:
ratings_mtx_df = movieLens.pivot_table(values='rating', index='user_id', columns='title', fill_value=0)
movie_index = ratings_mtx_df.columns

In [16]:
recom = TruncatedSVD(n_components=15, random_state=101)
R = recom.fit_transform(ratings_mtx_df.values.T)

In [17]:
movie = 'Star Wars: Episode V - The Empire Strikes Back (1980)'
movie_idx = list(movie_index).index(movie)

print(f"Movie index: {movie_idx}")
print(R[movie_idx])

Movie index: 3154
[184.72254552 -17.77612872  47.33450866  51.4664494   47.92058216
  17.65033116  14.3574635  -12.82219207  17.51347857   5.46888807
   7.5430805   -0.57117869 -30.74032355   2.4088565  -22.50368497]


In [18]:
correlation_matrix = np.corrcoef(R)
P = correlation_matrix[movie_idx]

print(list(movie_index[(P > 0.985) & (P < 1)]))

['Star Wars: Episode IV - A New Hope (1977)', 'Star Wars: Episode VI - Return of the Jedi (1983)']
