# Product Recommendation

In this blog post, I will walk through how you can manually make a product recommendation system for movies.

## Loading in data:

In [66]:
# Importing libraries:
import scipy.io as sio
import numpy as np
import pandas as pd
from scipy.optimize import minimize, rosen, rosen_der

ratings = pd.read_csv(filepath_or_buffer="../data/ratings.csv")
ratings = ratings.drop(labels='timestamp', axis=1)

In [111]:
movies = pd.read_csv(filepath_or_buffer="../data/movies.csv")
movies = movies.drop('genres', axis=1)
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [26]:
ratings_spread = ratings.pivot(index='movieId', columns='userId', values='rating')

In [56]:
ratings_spread.head().drop(ratings_spread.index[1-1])

userId,1,2,3,4,5,6,7,8,9,10,...,138484,138485,138486,138487,138488,138489,138490,138491,138492,138493
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,3.5,,,,3.0,,,,,,...,3.0,,,,3.0,,,,,4.0
3,,4.0,,,,3.0,3.0,5.0,,,...,4.0,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [67]:
# Get only first ten thousand movies and first one thousand users
ratings_sub = ratings_spread.iloc[0:10000, 0:1000] 

In [147]:
# Droping all un-informed data:
ratings_sub = ratings_sub.dropna(axis=0, how='all') # drop all movies with no ratings
ratings_sub = ratings_sub.dropna(axis=1, how='all') # drop all users who didn't rate

# Check which users rated which movies
rated = pd.isnull(ratings_sub)

# Get subset of movie titles
movies_sub = movies[movies['movieId'].isin(ratings_sub.index)]

In [148]:
def compute_cost(X_theta, y, rated, reg_coeff, num_features):
    # Get dimensions
    num_users = y.shape[1]
    num_movies = y.shape[0]
    
    # Reconstructing X:
    X = X_theta[0:num_movies*num_features]
    X = X.reshape((num_movies, num_features))
    
    # Reconstructing theta:
    theta = X_theta[num_movies*num_features:]
    theta = theta.reshape((num_users, num_features))
    
    # Calculating estimate:
    y_hat = np.dot(X, theta.T)
    
    # Calculating error:
    error = np.multiply((y_hat - y), rated)
    sq_error = error**2
    
    # Calculating cost:
    theta_regularization = (reg_coeff/2)*(np.sum(theta**2))
    X_regularization = (reg_coeff/2)*(np.sum(X**2))                
    J =  (1/2)*np.sum(sq_error) + theta_regularization + X_regularization
    
    # Calculating gradients:
    theta_gradient = np.dot(error.T,X) + reg_coeff*theta
    X_gradient = np.dot(error,theta) + reg_coeff*X 
    X_theta_gradient = np.append(np.ravel(X_gradient), np.ravel(theta_gradient))

    return(J, X_theta_gradient)

In [141]:
# Reading in movie titles:
movie_titles = []
f = open("../data/movie_ids.txt", encoding='ISO-8859-1')
for line in f:
    title_list = line.split(" ")[1:-1]
    movie_titles.append(" ".join(title_list))

In [204]:
# Getting data
ratings = movie_info['R']
y = movie_info['Y']
X = user_info['X']
theta = user_info['Theta']

# getting dimensions:
num_features = 10
num_movies = y.shape[0]
num_users = y.shape[1]

# Making new user:
new_user = np.zeros(num_movies)
rated = np.zeros(num_movies)

# Entering user preferences:
new_user[0] = 4  
new_user[6] = 3  
new_user[11] = 5  
new_user[53] = 4  
new_user[63] = 5  
new_user[65] = 3  
new_user[68] = 5  
new_user[97] = 2  
new_user[182] = 4  
new_user[225] = 5  
new_user[354] = 5

# Marking which movies the user has rated:
for i,r in enumerate(new_user):
    if r != 0:
        rated[i] = 1
ratings = np.vstack((ratings.T, rated)).T

# Printing out rated movies:
for i, movie_title in enumerate(movie_titles):
    if rated[i] == 1:
        print(movie_title)

# adding user preferences to database
y = np.vstack((y.T, new_user)).T

# getting dimensions:
num_features = 10
num_movies = y.shape[0]
num_users = y.shape[1]

# making random x values - this corresponds to initializing user preferences
X = np.random.rand(num_users, num_features)

# making random theta values - this corresponds to initializing movie attributes
theta = np.random.rand(num_movies, num_features)

# Combining x and theta:
X_theta = np.append(np.ravel(X), np.ravel(theta))

# Normalizing y:
y_mean = np.mean(y,axis=1)
y_norm = y.T - y_mean.T
y_norm = y_norm.T

Toy Story
Twelve Monkeys
Usual Suspects, The
Outbreak
Shawshank Redemption, The
While You Were Sleeping
Forrest Gump
Silence of the Lambs, The
Alien
Die Hard 2
Sphere


## Training:

In [210]:
reg_coeff = 50

min_results = minimize(fun=compute_cost,
                       x0=X_theta, 
                       method='CG',         
                       jac=True,
                       args=(y_norm, ratings, reg_coeff, num_features),
                       options={'maxiter':1000})      
        
min_results

     fun: 172340.13706469146
     jac: array([  1.08754225e-06,   9.84602966e-07,   1.05363709e-06, ...,
         1.95317808e-07,   1.70600618e-08,  -7.97723487e-09])
 message: 'Desired error not necessarily achieved due to precision loss.'
    nfev: 865
     nit: 558
    njev: 864
  status: 2
 success: False
       x: array([ 0.08971245,  0.56593131,  0.62924807, ...,  0.22001374,
        0.21241491,  0.19224528])

## Prediction:

In [211]:
X_theta_pred = min_results['x']

# Reconstructing X:
X_pred = X_theta_pred[0:num_movies*num_features]
X_pred = X_pred.reshape((num_movies, num_features))

# Reconstructing theta:
theta_pred = X_theta_pred[num_movies*num_features:]
theta_pred = theta_pred.reshape((num_users, num_features))

# Predicting new_user:
predictions = np.dot(X_pred, theta_pred.T)
test = np.vstack((range(0,num_movies), predictions[:,-1].T)).T

new_user_df = pd.DataFrame(test)
new_user_df.columns = ["movie_id", "predicted_rating"]
new_user_df['predicted_rating'] = new_user_df['predicted_rating'] + y_mean
new_user_df = new_user_df.sort_values(by='predicted_rating', ascending=False)

movie = []

for i, movie_id in enumerate(new_user_df['movie_id']):
    movie.append(movie_titles[int(movie_id)])

new_user_df['movie'] = movie

new_user_df

Unnamed: 0,movie_id,predicted_rating,movie
49,49.0,3.401562,Star Wars
99,99.0,2.991046,Fargo
180,180.0,2.948426,Return of the Jedi
173,173.0,2.834402,Raiders of the Lost Ark
257,257.0,2.829887,Contact
126,126.0,2.813705,"Godfather, The"
97,97.0,2.719508,"Silence of the Lambs, The"
312,312.0,2.716535,Titanic
0,0.0,2.698417,Toy Story
285,285.0,2.659803,"English Patient, The"


In [54]:
X.shape

(1683, 10)