# Truncated SVD Demo

Performs linear dimensionality reduction by means of truncated singular value decomposition (SVD).  
This estimator does not center the data before computing the singular value decomposition. This means it can work with sparse matrices efficiently.

Missing values mean that the matrix cannot be decomposed directly.
We therefore use an iterative machine-learning approach to fill in the missing values.
The filled-in values are predicted ratings: movies whose predicted rating for a user is high can be recommended to that user.

A full SVD, like PCA, works on a completely filled matrix. Here we are going to use truncated SVD instead.

In [1]:
import pandas as pd 
import numpy as np
from scipy.sparse import csr_matrix as sparse_matrix
import os
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the ratings table from CSV and preview its first rows.
movie_ratings = pd.read_csv('data/ratings.csv')
movie_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
import pandas as pd 
# Movie metadata used to look up titles; the first CSV column becomes the
# index so rows can be selected with movie_info.loc[<id>, 'title'].
movie_info = pd.read_csv('data/movies.csv', index_col=0)
movie_info.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
def get_stats(ratings, item_key="item", user_key="user"):
    """
    Print summary statistics for a ratings table and return its dimensions.

    Parameters:
    -----------
    ratings: table with a "rating" column plus the item and user id columns
    item_key: name of the column that holds the item ids
    user_key: name of the column that holds the user ids

    Returns: (n, d)
    --------
    n: the number of distinct items
    d: the number of distinct users
    """
    total = len(ratings)
    print("Number of ratings:", total)
    print("The average rating:", np.mean(ratings["rating"]))

    n_items = len(set(ratings[item_key]))
    n_users = len(set(ratings[user_key]))
    # Only the user count is printed; the item count is just returned.
    print("Number of users:", n_users)

    # Number of cells a dense items-by-users matrix would have.
    cells = n_items * n_users
    print("Fraction nonzero:", total / cells)
    # 8 bytes per cell, reported in units of 1e9 bytes.
    print("Size of full X matrix (GB):", cells * 8 / 1e9)

    return n_items, n_users

In [5]:
def create_X(ratings, n, d, user_key="user", item_key="item"):
    """
    Creates a sparse item-by-user ratings matrix (scipy.sparse.csr_matrix)
    and the mappers that relate matrix indexes to the original ids.

    Parameters:
    -----------
    ratings: the ratings to be stored in the matrix (needs a "rating" column);
    n: the number of items (rows of X)
    d: the number of users (columns of X)
    user_key: the column in the pandas dataframe that contains the users id
    item_key: the column in the pandas dataframe that contains the items id

    Returns: (X, user_mapper, item_mapper, user_inverse_mapper, item_inverse_mapper, user_ind, item_ind)
    --------
    X: the sparse matrix containing the ratings. Note that rows are items and
       columns are users (shape (n, d)); csr_matrix sums the values of
       repeated (item, user) pairs;
    user_mapper: stores the indexes of the users - the user_id is the key;
    item_mapper: stores the indexes of the items - the item_id is the key;
    user_inverse_mapper: stores the user id - the user index is the key;
    item_inverse_mapper: stores the item id - the item index is the key;
    user_ind: indexes of the users, one per rating (list);
    item_ind: indexes of the items, one per rating (list);
    """
    # Sorted distinct ids, computed once per column and shared by both the
    # forward and the inverse mapper.
    unique_users = np.unique(ratings[user_key])
    unique_items = np.unique(ratings[item_key])

    # Index i is assigned to the i-th smallest id. zip() stops at the shorter
    # input, so n / d smaller than the number of distinct ids leaves some ids
    # unmapped (and the lookups below then raise KeyError).
    user_mapper = dict(zip(unique_users, range(d)))
    item_mapper = dict(zip(unique_items, range(n)))

    user_inverse_mapper = dict(zip(range(d), unique_users))
    item_inverse_mapper = dict(zip(range(n), unique_items))

    # Per-rating column (user) and row (item) positions in X.
    user_ind = [user_mapper[i] for i in ratings[user_key]]
    item_ind = [item_mapper[i] for i in ratings[item_key]]

    X = sparse_matrix((ratings["rating"], (item_ind, user_ind)), shape=(n, d))

    return X, user_mapper, item_mapper, user_inverse_mapper, item_inverse_mapper, user_ind, item_ind

In [6]:
def find_nearestneighbour(model, X, query_ind):
    """
    Fit `model` on X and return the neighbour indexes of row `query_ind`,
    with the query index itself removed from the result.

    Parameters:
    -----------
    model: object providing fit(X) and kneighbors(rows, return_distance=False)
    X: the data, one sample per row (indexable by row)
    query_ind: the row index of the query sample

    Returns:
    --------
    1-D array of neighbour indexes, excluding every entry equal to query_ind
    """
    model.fit(X)

    query = X[query_ind]
    if query.ndim != 2:
        # A 1-D row gets a leading axis so it is passed as a single 2-D sample.
        query = query[None]

    found = model.kneighbors(query, return_distance=False).ravel()
    return found[found != query_ind]

In [7]:
def print_movie_pop(nn, n_show=5):
    """
    Print the title and total stars of the first `n_show` movies in `nn`.

    Relies on the notebook-level globals movie_inverse_mapper (row index ->
    movie id), movie_X (item-by-user ratings matrix) and movie_info (movie
    metadata indexed by movie id).

    Parameters:
    -----------
    nn: row indexes into movie_X, e.g. the output of find_nearestneighbour
    n_show: maximum number of movies to print (default 5); fewer are printed
            when `nn` holds fewer entries instead of raising IndexError
    """
    # Restrict the work to the rows that are actually printed.
    shown = nn[:n_show]
    movies = [movie_inverse_mapper[z] for z in shown]
    # Row sums = total stars each movie received over all users; indexed as
    # pop[i,0] below, i.e. expects the 2-D column that summing a csr_matrix
    # along axis=1 returns.
    pop = np.sum(movie_X[shown], axis=1)
    for i, movie_id in enumerate(movies):
        print(f"\t{movie_info.loc[movie_id,'title']:50} Total stars: {pop[i,0]}")

In [8]:
# Print summary statistics and keep the matrix dimensions:
# movie_n = number of distinct movieId values, movie_d = number of distinct userId values.
movie_n, movie_d = get_stats(movie_ratings, user_key="userId", item_key="movieId")

Number of ratings: 100836
The average rating: 3.501556983616962
Number of users: 610
Fraction nonzero: 0.016999683055613623
Size of full X matrix (GB): 0.04745312


In [9]:
# Build the sparse movie-by-user ratings matrix together with the
# id <-> index lookup tables and the per-rating index lists.
(
    movie_X,
    user_mapper,
    movie_mapper,
    user_inverse_mapper,
    movie_inverse_mapper,
    user_ind,
    movie_ind,
) = create_X(
    movie_ratings,
    movie_n,
    movie_d,
    user_key="userId",
    item_key="movieId",
)


In [10]:
# Neighbours of row 0 of movie_X in the raw (unreduced) rating space; this
# result is printed under the "Euclidean" heading further down.
# find_nearestneighbour removes the query index from the result, so asking
# for 6 neighbours leaves 5 when the query row is among its own neighbours.
nn_euc = find_nearestneighbour(NearestNeighbors(n_neighbors=6), movie_X, 0)
nn_euc

array([2353,  546,  615, 1756,  622], dtype=int64)

In [11]:
# Row index of the query movie in movie_X. Presumably Toy Story: create_X gives
# index 0 to the smallest movie id, and movieId 1 is "Toy Story (1995)" in
# movie_info — verify with movie_inverse_mapper[0].
toy_story_ind = 0
# That movie's row of ratings across all users (a single-row slice of movie_X).
toy_story_vec = movie_X[toy_story_ind]

In [12]:
# Compare the query movie's nearest neighbours in SVD-reduced spaces of
# increasing rank k with its neighbours in the raw rating space.
for k in [10, 100, 500]:
    print("\n\n")
    # TruncatedSVD uses a randomized solver by default; fixing the seed keeps
    # the printed neighbours reproducible from run to run.
    movie_svd = TruncatedSVD(n_components=k, random_state=0)
    # Z has one k-dimensional row per movie (same row order as movie_X).
    Z = movie_svd.fit_transform(movie_X)
    # Query with the named index instead of repeating the literal 0.
    nn_svd = find_nearestneighbour(NearestNeighbors(n_neighbors=50), Z, toy_story_ind)

    print(f"SVD(k={k}): ")
    print_movie_pop(nn_svd)

print("\n\n Euclidean:")
print_movie_pop(nn_euc)




SVD(k=10): 
	Independence Day (a.k.a. ID4) (1996)               Total stars: 696.0
	Back to the Future (1985)                          Total stars: 690.5
	Jurassic Park (1993)                               Total stars: 892.5
	Aladdin (1992)                                     Total stars: 694.0
	Mission: Impossible (1996)                         Total stars: 573.0



SVD(k=100): 
	Toy Story 2 (1999)                                 Total stars: 374.5
	Mission: Impossible (1996)                         Total stars: 573.0
	Independence Day (a.k.a. ID4) (1996)               Total stars: 696.0
	Willy Wonka & the Chocolate Factory (1971)         Total stars: 461.0
	Twister (1996)                                     Total stars: 408.5



SVD(k=500): 
	Toy Story 2 (1999)                                 Total stars: 374.5
	Mission: Impossible (1996)                         Total stars: 573.0
	Independence Day (a.k.a. ID4) (1996)               Total stars: 696.0
	Bug's Life, A (1998)         