In [1]:
%%time
# Sample ratings that will be doubled one element at a time with a plain Python loop.
ratings = [5, 2, 3, 3, 4, 5, 5, 1, 5, 1, 3, 4]

# Walk the list by position, reporting progress before overwriting each slot.
for position in range(len(ratings)):
    print("Updating rating {}".format(position))
    ratings[position] = ratings[position] * 2

print(ratings)

Updating rating 0
Updating rating 1
Updating rating 2
Updating rating 3
Updating rating 4
Updating rating 5
Updating rating 6
Updating rating 7
Updating rating 8
Updating rating 9
Updating rating 10
Updating rating 11
[10, 4, 6, 6, 8, 10, 10, 2, 10, 2, 6, 8]
Wall time: 0 ns


In [2]:
%%time
import numpy as np

# Ratings held in a NumPy array so the doubling is a single vectorized operation.
ratings = np.array([5, 2, 3, 3, 4, 5, 5, 1, 5, 1, 3, 4])

# Element-wise multiplication of the whole array; no explicit Python loop is needed.
ratings = np.multiply(ratings, 2)

print(ratings)


[10  4  6  6  8 10 10  2 10  2  6  8]
Wall time: 395 ms


In [12]:
import pandas
import webbrowser
import os

# Read the dataset into a data table using Pandas.
# A bare relative filename is used instead of a ".\"-prefixed one: inside a
# normal string literal "\m" is an invalid escape sequence, and the backslash
# form only resolves on Windows. This names the same file in the working directory.
data_table = pandas.read_csv("movie_ratings_data_set.csv", index_col="movie_id")

# Create a web page view of the first 100 rows for easy viewing
html = data_table.head(100).to_html()

# Save the html to a temporary file
with open("data.html", "w") as f:
    f.write(html)

# Open the web page in our web browser.
# The call is the cell's last expression, so its return value is what the cell displays.
full_filename = os.path.abspath("data.html")
webbrowser.open("file://{}".format(full_filename))

True

In [13]:
import pandas as pd
import numpy as np
import os
import webbrowser

# Read the dataset into a data table using Pandas.
# A bare relative filename avoids the invalid "\m" escape sequence of a
# ".\"-prefixed literal and works on every platform, not just Windows.
df = pd.read_csv("movie_ratings_data_set.csv")

# Convert the running list of user ratings into a matrix using the 'pivot table' function:
# one row per user_id and one column per movie_id.
# aggfunc decides which value is kept when a user has rated the same movie more than once:
#   "max"  keeps the highest of the duplicate ratings
#   "mean" would average them instead
# The string form selects the same aggregation as passing np.max.
ratings_df = pd.pivot_table(df, index='user_id', columns='movie_id', aggfunc="max")

# Create a web page view of the data for easy viewing (missing ratings are left blank)
html = ratings_df.to_html(na_rep="")

# Save the html to a temporary file
with open("review_matrix.html", "w") as f:
    f.write(html)

# Open the web page in our web browser.
# The call is the cell's last expression, so its return value is what the cell displays.
full_filename = os.path.abspath("review_matrix.html")
webbrowser.open("file://{}".format(full_filename))

True

In [14]:
import pandas as pd
import numpy as np

# Read the dataset into a data table using Pandas.
# A bare relative filename avoids the invalid "\m" escape sequence of a
# ".\"-prefixed literal and works on every platform, not just Windows.
df = pd.read_csv("movie_ratings_data_set.csv")

# Convert the running list of user ratings into a matrix using the 'pivot table' function.
# "max" keeps the highest rating when a user rated the same movie more than once
# (same aggregation as passing np.max).
ratings_df = pd.pivot_table(df, index='user_id', columns='movie_id', aggfunc="max")

# Create a csv file of the data for easy viewing (missing ratings are written as empty fields)
ratings_df.to_csv("review_matrix.csv", na_rep="")

In [15]:
import numpy as np
from scipy.optimize import fmin_cg


def normalize_ratings(ratings):
    """
    Center every product's ratings on that product's own average.

    The average is taken down each column (axis 0) and ignores NaN entries,
    so missing ratings do not influence it and stay NaN in the result.

    :param ratings: 2d array of user ratings (rows are users, columns are products)
    :return: (normalized ratings array, the calculated means)
    """
    column_means = np.nanmean(ratings, axis=0)
    centered = ratings - column_means
    return centered, column_means


def cost(X, *args):
    """
    Regularized squared-error cost for low rank matrix factorization.

    :param X: The matrices being factored (P and Q) rolled up as a contiguous array
    :param args: Array containing (num_users, num_products, num_features, ratings, mask, regularization_amount)
    :return: The cost with the current P and Q matrices
    """
    num_users, num_products, num_features, ratings, mask, regularization_amount = args

    # Split the flat parameter vector back into the two factor matrices:
    # the first num_users * num_features values are P, the rest are Q (one row per product).
    split_at = num_users * num_features
    P = X[:split_at].reshape(num_users, num_features)
    Q_rows = X[split_at:].reshape(num_products, num_features)

    # Squared reconstruction error, counted only where the mask is set
    masked_error = mask * (np.dot(P, Q_rows.T) - ratings)
    error_term = np.sum(np.square(masked_error)) / 2

    # L2 penalty on both factor matrices
    half_regularization = regularization_amount / 2.0
    q_penalty = half_regularization * np.sum(np.square(Q_rows))
    p_penalty = half_regularization * np.sum(np.square(P))

    return error_term + q_penalty + p_penalty


def gradient(X, *args):
    """
    Calculate the cost gradients with the current P and Q.

    :param X: The matrices being factored (P and Q) rolled up as a contiguous array
    :param args: Array containing (num_users, num_products, num_features, ratings, mask, regularization_amount)
    :return: The gradient with the current X, rolled up in the same layout as X
             (all of P's gradient first, then Q's gradient with one row per product)
    """
    num_users, num_products, num_features, ratings, mask, regularization_amount = args

    # Unroll P and Q
    split_at = num_users * num_features
    P = X[:split_at].reshape(num_users, num_features)
    Q_rows = X[split_at:].reshape(num_products, num_features)

    # The masked prediction error feeds both gradients, so compute it (and the
    # matrix product behind it) once instead of once per gradient.
    masked_error = mask * (np.dot(P, Q_rows.T) - ratings)

    # Calculate the current gradients for both P and Q
    P_grad = np.dot(masked_error, Q_rows) + (regularization_amount * P)
    Q_grad = np.dot(masked_error.T, P) + (regularization_amount * Q_rows)

    # Return the gradients as one rolled-up array as expected by fmin_cg
    return np.append(P_grad.ravel(), Q_grad.ravel())


def low_rank_matrix_factorization(ratings, mask=None, num_features=15, regularization_amount=0.01,
                                  maxiter=3000, random_seed=0):
    """
    Factor a ratings array into two latent feature arrays (user features and product features)

    :param ratings: Matrix with user ratings to factor
    :param mask: A binary mask of which ratings are present in the ratings array to factor
    :param num_features: Number of latent features to generate for users and products
    :param regularization_amount: How much regularization to apply
    :param maxiter: Maximum number of iterations fmin_cg may perform
    :param random_seed: Seed for the random starting point; the default of 0 gives the same
                        starting values as seeding the global NumPy generator with 0
    :return: (P, Q) - the factored latent feature arrays. P has shape
             (num_users, num_features) and Q has shape (num_features, num_products),
             so np.matmul(P, Q) has the same shape as ratings.
    """
    num_users, num_products = ratings.shape

    # If no mask is provided, consider all 'NaN' elements as missing and create a mask.
    if mask is None:
        mask = ~np.isnan(ratings)

    # Replace NaN values with zero
    ratings = np.nan_to_num(ratings)

    # Create P and Q and fill with random numbers to start.
    # A local generator is used so that calling this function does not reseed
    # NumPy's global random state as a side effect.
    rng = np.random.RandomState(random_seed)
    P = rng.randn(num_users, num_features)
    Q = rng.randn(num_products, num_features)

    # Roll up P and Q into a contiguous array as fmin_cg expects
    initial = np.append(P.ravel(), Q.ravel())

    # Create an args array as fmin_cg expects
    args = (num_users, num_products, num_features, ratings, mask, regularization_amount)

    # Call fmin_cg to minimize the cost function and thus find the best values for P and Q
    X = fmin_cg(cost, initial, fprime=gradient, args=args, maxiter=maxiter)

    # Unroll the new P and new Q arrays out of the contiguous array returned by fmin_cg
    split_at = num_users * num_features
    nP = X[:split_at].reshape(num_users, num_features)
    nQ = X[split_at:].reshape(num_products, num_features)

    # Q is returned transposed so the two results can be multiplied directly
    return nP, nQ.T


def RMSE(real, predicted):
    """
    Root mean squared error between real and predicted ratings.

    Positions where the difference is NaN (for example a missing real rating)
    are left out of the mean.

    :param real: A matrix containing the real ratings (with 'NaN' for any missing elements)
    :param predicted: A matrix of predictions
    :return: The RMSE as a float
    """
    residuals = real - predicted
    mean_squared_error = np.nanmean(np.square(residuals))
    return np.sqrt(mean_squared_error)

In [16]:
import numpy as np
import pandas as pd

# Load user ratings.
# A bare relative filename avoids the invalid "\m" escape sequence of a
# ".\"-prefixed literal and works on every platform, not just Windows.
raw_dataset_df = pd.read_csv('movie_ratings_data_set.csv')

# Convert the running list of user ratings into a matrix
# ("max" keeps the highest rating when a user rated the same movie more than once)
ratings_df = pd.pivot_table(raw_dataset_df, index='user_id', columns='movie_id', aggfunc='max')

# Apply matrix factorization to find the latent features.
# .values gives the underlying NumPy array; DataFrame.as_matrix() is deprecated
# and no longer exists in pandas 1.0+.
U, M = low_rank_matrix_factorization(ratings_df.values, num_features=15, regularization_amount=0.1)

# Find all predicted ratings by multiplying the U by M
predicted_ratings = np.matmul(U, M)

# Save all the ratings to a csv file, labelled with the same users/movies as the input matrix
predicted_ratings_df = pd.DataFrame(index=ratings_df.index,
                                    columns=ratings_df.columns,
                                    data=predicted_ratings)
predicted_ratings_df.to_csv("predicted_ratings.csv")

  # This is added back by InteractiveShellApp.init_path()


         Current function value: 32.504368
         Iterations: 3000
         Function evaluations: 4478
         Gradient evaluations: 4478


In [17]:
# Preview only the first rows instead of rendering the whole ratings table
raw_dataset_df.head(10)

Unnamed: 0,user_id,movie_id,value
0,1,28,4
1,1,26,4
2,1,9,4
3,1,1,4
4,1,14,4
5,1,13,5
6,2,2,5
7,2,15,4
8,2,1,5
9,2,21,5


In [18]:
# Preview only the first users of the user x movie matrix instead of rendering all of it
ratings_df.head(10)

Unnamed: 0_level_0,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value,value
movie_id,1,2,3,4,5,6,7,8,9,10,...,25,26,27,28,29,30,31,32,33,34
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.0,,,,,,,,4.0,,...,,4.0,,4.0,,,,,,
2,5.0,5.0,,,,,,,,,...,,,,,,,3.0,,,4.0
3,4.0,4.0,5.0,,,,,,,,...,,,,,,,,,,
4,5.0,5.0,,5.0,5.0,,,,,,...,,,,,,,,,,
5,5.0,,,,,,,,5.0,,...,,,,,3.0,,3.0,2.0,5.0,5.0
6,5.0,5.0,,,,,,,,,...,,,,,,,2.0,5.0,4.0,3.0
7,5.0,,,2.0,,,,,,,...,,,,,,,,,,4.0
8,4.0,,5.0,,,,,,,5.0,...,,,5.0,5.0,,,,,,
9,5.0,,5.0,,,,,,,,...,,,,5.0,4.0,,,,,
10,4.0,,4.0,,,,,4.0,,,...,,,,,5.0,,,,,


In [19]:
import numpy as np
import pandas as pd

# Load user ratings.
# Bare relative filenames avoid the invalid "\m" escape sequence of a
# ".\"-prefixed literal and work on every platform, not just Windows.
df = pd.read_csv('movie_ratings_data_set.csv')

# Load movie titles
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

# Convert the running list of user ratings into a matrix
# ("max" keeps the highest rating when a user rated the same movie more than once)
ratings_df = pd.pivot_table(df, index='user_id', columns='movie_id', aggfunc='max')

# Apply matrix factorization to find the latent features.
# .values gives the underlying NumPy array; DataFrame.as_matrix() is deprecated
# and no longer exists in pandas 1.0+.
U, M = low_rank_matrix_factorization(ratings_df.values, num_features=15, regularization_amount=1.0)

# Swap the rows and columns of product_features just so it's easier to work with
# (after this, each row of M holds one movie's features)
M = np.transpose(M)


# Choose a movie to find similar movies to. Let's find movies similar to movie #5:
movie_id = 5

# Get the chosen movie's name and genre (looked up by movie_id, not a hard-coded number)
movie_information = movies_df.loc[movie_id]

print("We are finding movies similar to this movie:")
print("Movie title: {}".format(movie_information.title))
print("Genre: {}".format(movie_information.genre))

# Get the features for the chosen movie that we found via matrix factorization.
# NOTE(review): row (movie_id - 1) is only the right row if the movie ids in the
# ratings matrix are the contiguous integers 1..N - confirm against the data set.
current_movie_features = M[movie_id - 1]

print("The attributes for this movie are:")
print(current_movie_features)

# The main logic for finding similar movies:

# 1. Subtract the current movie's features from every other movie's features
difference = M - current_movie_features

# 2. Take the absolute value of that difference (so all numbers are positive)
absolute_difference = np.abs(difference)

# 3. Each movie has 15 features. Sum those 15 features to get a total 'difference score' for each movie
# (axis=1 sums across each row, giving one score per movie)
total_difference = np.sum(absolute_difference, axis=1)

# 4. Create a new column in the movie list with the difference score for each movie.
# NOTE(review): the scores are assigned by position, so this presumes movies.csv lists
# the movies in the same order as the ratings matrix columns - confirm.
movies_df['difference_score'] = total_difference

# 5. Sort the movie list by difference score, from least different to most different
sorted_movie_list = movies_df.sort_values('difference_score')

# 6. Print the result, showing the 5 most similar movies to the chosen movie
# (the chosen movie itself is always first, with a score of 0)
print("The five most similar movies are:")
print(sorted_movie_list[['title', 'difference_score']][0:5])


  


Optimization terminated successfully.
         Current function value: 312.762757
         Iterations: 1611
         Function evaluations: 2402
         Gradient evaluations: 2402
We are finding movies similar to this movie:
Movie title: The Big City Judge 2
Genre: legal drama
The attributes for this movie are:
[ 0.66560733 -0.82883687 -0.72671107  0.52255651 -0.8484791  -1.8418182
 -0.78713896  0.25989426 -0.11901175  0.11403235 -0.15083588 -0.17655374
 -0.23276161 -0.81279556  1.08268687]
The five most similar movies are:
                            title  difference_score
movie_id                                           
5            The Big City Judge 2          0.000000
10        Surrounded by Zombies 1          1.871728
9                     Biker Gangs          2.600086
3                   The Sheriff 2          2.696042
24           The Big City Judge 3          2.788129


In [20]:
import numpy as np
import pandas as pd

# Load user ratings.
# Bare relative filenames avoid the invalid "\m" escape sequence of a
# ".\"-prefixed literal and work on every platform, not just Windows.
raw_dataset_df = pd.read_csv('movie_ratings_data_set.csv')

# Load movie titles
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

# Convert the running list of user ratings into a matrix
# ("max" keeps the highest rating when a user rated the same movie more than once)
ratings_df = pd.pivot_table(raw_dataset_df, index='user_id',
                            columns='movie_id',
                            aggfunc='max')

# Apply matrix factorization to find the latent features.
# .values gives the underlying NumPy array; DataFrame.as_matrix() is deprecated
# and no longer exists in pandas 1.0+.
U, M = low_rank_matrix_factorization(ratings_df.values, num_features=15, regularization_amount=0.1)

# Find all predicted ratings by multiplying U and M matrices
predicted_ratings = np.matmul(U, M)

print("Enter a user_id to get recommendations (Between 1 and 100):")
user_id_to_search = int(input())

print("Movies previously reviewed by user_id {}:".format(user_id_to_search))

# Keep only this user's rows and attach each movie's title/genre for display
reviewed_movies_df = raw_dataset_df[raw_dataset_df['user_id'] == user_id_to_search]
reviewed_movies_df = reviewed_movies_df.join(movies_df, on='movie_id')

print(reviewed_movies_df[['title', 'genre', 'value']])

input("Press enter to continue.")

print("Movies we will recommend:")

# NOTE(review): row (user_id - 1) is only the right row if the user ids in the
# ratings matrix are the contiguous integers 1..N - confirm against the data set.
user_ratings = predicted_ratings[user_id_to_search - 1]
movies_df['rating'] = user_ratings

# Leave out movies the user has already reviewed, then rank the rest by predicted rating
already_reviewed = reviewed_movies_df['movie_id']
recommended_df = movies_df[~movies_df.index.isin(already_reviewed)]
recommended_df = recommended_df.sort_values(by=['rating'], ascending=False)

print(recommended_df[['title', 'genre', 'rating']].head(5))



  app.launch_new_instance()


         Current function value: 32.504368
         Iterations: 3000
         Function evaluations: 4478
         Gradient evaluations: 4478
Enter a user_id to get recommendations (Between 1 and 100):
20
Movies previously reviewed by user_id 20:
                     title            genre  value
128           Sports Nerds           comedy      4
129   The Big City Judge 3      legal drama      5
130  The Serious Detective  detective drama      5
131   The Big City Judge 1      legal drama      5
132   The Big City Judge 2      legal drama      5
133         The Spy Family        spy drama      4
134       Trapped in Space  sci-fi, mystery      4
135  My Complicated Family     comedy-drama      3
Press enter to continue.
Movies we will recommend:
                            title                     genre    rating
movie_id                                                             
21                Political Gaffs  comedy, political satire  5.186300
10        Surrounded by Zombies 1 

In [22]:
import numpy as np
import pandas as pd

# Load user ratings.
# Bare relative filenames avoid the invalid "\m" escape sequence of a
# ".\"-prefixed literal and work on every platform, not just Windows.
raw_training_dataset_df = pd.read_csv('movie_ratings_data_set_training.csv')
raw_testing_dataset_df = pd.read_csv('movie_ratings_data_set_testing.csv')

# Convert the running list of user ratings into a matrix
# ("max" keeps the highest rating when a user rated the same movie more than once)
ratings_training_df = pd.pivot_table(raw_training_dataset_df, index='user_id', columns='movie_id', aggfunc='max')
ratings_testing_df = pd.pivot_table(raw_testing_dataset_df, index='user_id', columns='movie_id', aggfunc='max')

# The predictions below have the training matrix's shape, and RMSE subtracts
# element-wise, so lay the testing ratings out on the same users/movies grid.
# This is a no-op when both sets already cover the same users and movies.
ratings_testing_df = ratings_testing_df.reindex(index=ratings_training_df.index,
                                                columns=ratings_training_df.columns)

# Apply matrix factorization to find the latent features.
# .values gives the underlying NumPy array; DataFrame.as_matrix() is deprecated
# and no longer exists in pandas 1.0+.
U, M = low_rank_matrix_factorization(ratings_training_df.values, num_features=15, regularization_amount=0.01)

# Find all predicted ratings by multiplying U and M
predicted_ratings = np.matmul(U, M)

# Measure RMSE against the ratings the model was fit on and against held-out ratings
rmse_training = RMSE(ratings_training_df.values, predicted_ratings)
rmse_testing = RMSE(ratings_testing_df.values, predicted_ratings)

print("Training RMSE: {}".format(rmse_training))
print("Testing RMSE: {}".format(rmse_testing))


  del sys.path[0]


         Current function value: 3.013030
         Iterations: 3000
         Function evaluations: 4520
         Gradient evaluations: 4520
Training RMSE: 0.002447044124838704
Testing RMSE: 1.2306591968130036




In [23]:
# Preview only the first rows instead of rendering the whole testing table
raw_testing_dataset_df.head(10)

Unnamed: 0,user_id,movie_id,value
0,32,13,5
1,26,8,5
2,70,30,5
3,66,19,4
4,38,3,5
5,26,27,5
6,92,34,5
7,60,29,2
8,58,20,5
9,53,21,5


In [24]:
import numpy as np
import pandas as pd
import pickle

# Load user ratings.
# A bare relative filename avoids the invalid "\m" escape sequence of a
# ".\"-prefixed literal and works on every platform, not just Windows.
raw_dataset_df = pd.read_csv('movie_ratings_data_set.csv')

# Convert the running list of user ratings into a matrix
# ("max" keeps the highest rating when a user rated the same movie more than once)
ratings_df = pd.pivot_table(raw_dataset_df, index='user_id', columns='movie_id', aggfunc='max')

# Apply matrix factorization to find the latent features.
# .values gives the underlying NumPy array; DataFrame.as_matrix() is deprecated
# and no longer exists in pandas 1.0+.
U, M = low_rank_matrix_factorization(ratings_df.values, num_features=15, regularization_amount=0.1)

# Find all predicted ratings by multiplying U and M
predicted_ratings = np.matmul(U, M)

# Save features and predicted ratings to files for later use.
# Each file is opened in a with-block so it is flushed and closed right after writing.
for filename, payload in (("user_features.dat", U),
                          ("product_features.dat", M),
                          ("predicted_ratings.dat", predicted_ratings)):
    with open(filename, "wb") as f:
        pickle.dump(payload, f)

  if sys.path[0] == '':


         Current function value: 32.504368
         Iterations: 3000
         Function evaluations: 4478
         Gradient evaluations: 4478


In [25]:
import pickle
import pandas as pd

# Load prediction rules from data files.
# Each file is opened in a with-block so the handle is closed after reading.
# NOTE: unpickling can run arbitrary code - only load .dat files written by this notebook.
with open("user_features.dat", "rb") as f:
    U = pickle.load(f)
with open("product_features.dat", "rb") as f:
    M = pickle.load(f)
with open("predicted_ratings.dat", "rb") as f:
    predicted_ratings = pickle.load(f)

# Load movie titles.
# A bare relative filename avoids the invalid "\m" escape sequence of a
# ".\"-prefixed literal and works on every platform, not just Windows.
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

print("Enter a user_id to get recommendations (Between 1 and 100):")
user_id_to_search = int(input())

print("Movies we will recommend:")

# NOTE(review): row (user_id - 1) is only the right row if the user ids behind the
# saved predictions are the contiguous integers 1..N - confirm against the data set.
user_ratings = predicted_ratings[user_id_to_search - 1]
movies_df['rating'] = user_ratings
movies_df = movies_df.sort_values(by=['rating'], ascending=False)

print(movies_df[['title', 'genre', 'rating']].head(5))

Enter a user_id to get recommendations (Between 1 and 100):
20
Movies we will recommend:
                            title                     genre    rating
movie_id                                                             
21                Political Gaffs  comedy, political satire  5.186300
34          The Serious Detective           detective drama  4.990897
5            The Big City Judge 2               legal drama  4.989093
10        Surrounded by Zombies 1    horror, zombie fiction  4.982623
24           The Big City Judge 3               legal drama  4.976658


In [26]:
import numpy as np
import pandas as pd
import pickle

# Load user ratings.
# A bare relative filename avoids the invalid "\m" escape sequence of a
# ".\"-prefixed literal and works on every platform, not just Windows.
raw_dataset_df = pd.read_csv('movie_ratings_data_set.csv')

# Convert the running list of user ratings into a matrix
# ("max" keeps the highest rating when a user rated the same movie more than once)
ratings_df = pd.pivot_table(raw_dataset_df, index='user_id', columns='movie_id', aggfunc='max')

# Normalize the ratings (center them around their mean).
# .values gives the underlying NumPy array; DataFrame.as_matrix() is deprecated
# and no longer exists in pandas 1.0+.
normalized_ratings, means = normalize_ratings(ratings_df.values)

# Apply matrix factorization to find the latent features
U, M = low_rank_matrix_factorization(normalized_ratings, num_features=11, regularization_amount=1.1)

# Find all predicted ratings by multiplying U and M
predicted_ratings = np.matmul(U, M)

# Add back in the mean ratings for each product to de-normalize the predicted results
predicted_ratings = predicted_ratings + means

# Save features and predicted ratings to files for later use.
# Each file is opened in a with-block so it is flushed and closed right after writing.
for filename, payload in (("user_features.dat", U),
                          ("product_features.dat", M),
                          ("predicted_ratings.dat", predicted_ratings),
                          ("means.dat", means)):
    with open(filename, "wb") as f:
        pickle.dump(payload, f)


Optimization terminated successfully.
         Current function value: 105.620378
         Iterations: 460
         Function evaluations: 685
         Gradient evaluations: 685


  if sys.path[0] == '':


In [27]:
import pickle
import pandas as pd

# Load prediction rules from data files.
# The file is opened in a with-block so the handle is closed after reading.
# NOTE: unpickling can run arbitrary code - only load .dat files written by this notebook.
with open("means.dat", "rb") as f:
    means = pickle.load(f)

# Load movie titles.
# A bare relative filename avoids the invalid "\m" escape sequence of a
# ".\"-prefixed literal and works on every platform, not just Windows.
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

# Just use the average movie ratings directly as the user's predicted ratings
user_ratings = means

print("Movies we will recommend:")

movies_df['rating'] = user_ratings
movies_df = movies_df.sort_values(by=['rating'], ascending=False)

print(movies_df[['title', 'genre', 'rating']].head(5))

Movies we will recommend:
                            title                   genre    rating
movie_id                                                           
6               Attack on Earth 1          sci-fi, action  4.900000
10        Surrounded by Zombies 1  horror, zombie fiction  4.882353
3                   The Sheriff 2    crime drama, western  4.818182
12                     Horrorfest                  horror  4.800000
5            The Big City Judge 2             legal drama  4.785714


In [28]:
import pickle
import pandas as pd
import numpy as np

# Load prediction rules from data files.
# The file is opened in a with-block so the handle is closed after reading.
# NOTE: unpickling can run arbitrary code - only load .dat files written by this notebook.
with open("product_features.dat", "rb") as f:
    M = pickle.load(f)

# Swap the rows and columns of product_features just so it's easier to work with
# (after this, each row of M holds one movie's features)
M = np.transpose(M)

# Load movie titles.
# A bare relative filename avoids the invalid "\m" escape sequence of a
# ".\"-prefixed literal and works on every platform, not just Windows.
movies_df = pd.read_csv('movies.csv', index_col='movie_id')

# Choose a movie to find similar movies to. Let's find movies similar to movie #5:
movie_id = 5

# Get the chosen movie's name and genre
movie_information = movies_df.loc[movie_id]

print("We are finding movies similar to this movie:")
print("Movie title: {}".format(movie_information.title))
print("Genre: {}".format(movie_information.genre))

# Get the features for the chosen movie that we found via matrix factorization.
# NOTE(review): row (movie_id - 1) is only the right row if the movie ids behind the
# saved features are the contiguous integers 1..N - confirm against the data set.
current_movie_features = M[movie_id - 1]

print("The attributes for this movie are:")
print(current_movie_features)

# The main logic for finding similar movies:

# 1. Subtract the current movie's features from every other movie's features
difference = M - current_movie_features

# 2. Take the absolute value of that difference (so all numbers are positive)
absolute_difference = np.abs(difference)

# 3. Each movie has several features. Sum those features to get a total 'difference score' for each movie
# (axis=1 sums across each row, giving one score per movie)
total_difference = np.sum(absolute_difference, axis=1)

# 4. Create a new column in the movie list with the difference score for each movie.
# NOTE(review): the scores are assigned by position, so this presumes movies.csv lists
# the movies in the same order as the rows of M - confirm.
movies_df['difference_score'] = total_difference

# 5. Sort the movie list by difference score, from least different to most different
sorted_movie_list = movies_df.sort_values('difference_score')

# 6. Print the result, showing the 5 most similar movies to the chosen movie
# (the chosen movie itself is always first, with a score of 0)
print("The five most similar movies are:")
print(sorted_movie_list[['title', 'difference_score']][0:5])


We are finding movies similar to this movie:
Movie title: The Big City Judge 2
Genre: legal drama
The attributes for this movie are:
[ 0.46009037 -0.31911741  0.22649527 -0.08152693  0.34194861  0.40706964
  0.11944108 -0.26751621  0.13222478 -0.06424029  0.06898328]
The five most similar movies are:
                             title  difference_score
movie_id                                            
5             The Big City Judge 2          0.000000
8         Sci-Fi Murder Detectives          1.501073
11               Inspector Jackson          2.053002
24            The Big City Judge 3          2.220825
26               Mafia Underground          2.450669


In [29]:
# Preview only the first movies (with their difference scores) instead of rendering the whole table
movies_df.head(10)

Unnamed: 0_level_0,title,genre,difference_score
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,The Sheriff 1,"crime drama, western",2.898476
2,The Big City Judge 1,legal drama,2.691092
3,The Sheriff 2,"crime drama, western",3.589147
4,Just a Regular Family,reality,3.184083
5,The Big City Judge 2,legal drama,0.0
6,Attack on Earth 1,"sci-fi, action",2.775943
7,The Secret Box,"sci-fi, mystery, fantasy",4.676647
8,Sci-Fi Murder Detectives,"supernatural, mystery",1.501073
9,Biker Gangs,"crime drama, action",2.46761
10,Surrounded by Zombies 1,"horror, zombie fiction",2.835494
