# Movie Recommender System with SVD 
## 1. Loading the Data

In [1]:
import pandas as pd
import numpy as np

def _read_dat(path, encoding=None):
    """Read a '::'-delimited .dat file into a list of string-field lists.

    Parameters
    ----------
    path : str
        Location of the .dat file.
    encoding : str, optional
        Text encoding forwarded to ``open``; ``None`` keeps the platform default.

    Returns
    -------
    list of list of str
        One inner list per line: the line is stripped, then split on "::".
    """
    # The context manager guarantees the handle is closed, even if parsing fails.
    with open(path, 'r', encoding=encoding) as handle:
        return [line.strip().split("::") for line in handle]


ratings_list = _read_dat('./ml-1m/ratings.dat')
users_list = _read_dat('./ml-1m/users.dat')
# movies.dat is decoded as ISO-8859-1 (presumably because some titles are not plain ASCII).
movies_list = _read_dat('./ml-1m/movies.dat', encoding="ISO-8859-1")

# Show the first parsed record of each file as a quick sanity check.
print(ratings_list[0])
print(users_list[0])
print(movies_list[0])

['1', '1193', '5', '978300760']
['1', 'F', '1', '10', '48067']
['1', 'Toy Story (1995)', "Animation|Children's|Comedy"]


## Convert the data to NumPy arrays and pandas DataFrames

In [2]:
# Turn each parsed list of records into a NumPy array (all fields are still strings here).
ratings, users, movies = map(np.array, (ratings_list, users_list, movies_list))

In [3]:
# Ratings: the parsed rows are lists of strings and every field is an integer,
# so cast the whole frame explicitly instead of relying on the constructor's
# `dtype` argument to coerce strings.
ratings_df = pd.DataFrame(
    ratings_list, columns=['UserID', 'MovieID', 'Rating', 'Timestamp']
).astype(int)

# Movies: only MovieID is numeric. pd.to_numeric is applied to the whole column
# at once (vectorized) rather than element by element, and the frame is built
# in a single expression so re-running the cell always starts from movies_list.
movies_df = (
    pd.DataFrame(movies_list, columns=['MovieID', 'Title', 'Genres'])
    .assign(MovieID=lambda frame: pd.to_numeric(frame['MovieID']))
)

### Check the DataFrames

In [4]:
# Preview the first rows of the movie table to confirm the columns parsed as expected.
movies_df.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Preview the first rows of the ratings table (one row per user/movie rating).
ratings_df.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## 2. Make pivot table
* 피봇 테이블(pivot table) : 데이터 열 중에서 두 개를 키(key)로 사용하여 데이터를 선택하는 방법을 말한다.
* 첫번째 인수로는 행 인덱스로 사용할 열 이름, 두번째 인수로는 열 인덱스로 사용할 열 이름, 그리고 마지막으로 데이터로 사용할 열 이름을 넣는다.
* fillna: Fill NA/NaN values using the specified method

In [6]:
"""
=====================
Fill in the cell !
=====================

* Useful functions:
    - DataFrame.pivot(index, columns, values)
    - DataFrame.fillna(value)
    
* Step by step
    1. Make pivot table "R_df" with rating DataFrame
    2. fill the NaN values
"""
# Users become rows, movies become columns, and each cell holds that user's
# rating of that movie; user/movie pairs without a rating are filled with 0.
R_df = (
    ratings_df
    .pivot(index='UserID', columns='MovieID', values='Rating')
    .fillna(0)
)
R_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 3. Convert the DataFrame to a NumPy array and normalize by each user's mean

In [7]:
# DataFrame type to matrix.
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# `.values` is the documented replacement and works on old and new versions.
R = R_df.values

"""
=====================
Fill in the cell !
=====================

* Useful functions:
    - np.mean(a, axis), reshape(-1,1) 
    
* Step by step
    1. make "user_ratings_mean" variable 
        : compute rating mean of each user
        (caution! matrix shape : [number of users, 1] , matrix dim : 2 dim)
    2. make "R_normalized" variable
        : subtract that matrix from "R"
        
"""
# Mean of each row (one value per user), taken across every movie column of R.
user_ratings_mean = np.mean(R, axis = 1)
# reshape(-1, 1) turns the means into a column vector so that broadcasting
# subtracts each user's mean from every entry in that user's row.
R_normalized = R - user_ratings_mean.reshape(-1, 1)

## 4. Singular Value Decomposition

In [8]:
from scipy.sparse.linalg import svds
"""
=====================
Fill in the cell !
=====================

* Useful functions:
    - svds(A, k) : return U, sigma, Vt
    - np.diag(v) : Extract a diagonal matrix
    
* Step by step
    1. make "U, sigma, Vt" variables for svds funtion's output (k = 50)
    2. Diagonalize the sigma value 

"""
# Truncated SVD of the mean-centred rating matrix, keeping k = 50 components.
U, sigma, Vt = svds(R_normalized, k = 50)
# Rebind sigma to a diagonal matrix so it can be used directly in the
# U . sigma . Vt matrix product when reconstructing the ratings.
sigma = np.diag(sigma)

## 5. Making Predictions 

* multiply $U$, $\Sigma$, and $V^{T}$ back to get the rank $k=50$ approximation of $R$.

* add the user means back to get the actual star ratings prediction.

In [9]:
"""
=====================
Fill in the cell !
=====================

* Useful functions:
    - np.dot
* Step by step
    1. make "all_user_predicted_ratings" variable for predictions
        1) multiply each variables 
        2) add user ratings mean again
"""
# Multiply the three factors back together (left to right) to get the low-rank
# approximation, then add each user's mean back as a column vector so the
# values are on the original rating scale again.
all_user_predicted_ratings = (U @ sigma @ Vt) + user_ratings_mean[:, np.newaxis]

## 6. Making Movie Recommendations

In [16]:
# Wrap the prediction matrix in a DataFrame whose columns reuse R_df's MovieID labels.
# No index is passed, so rows are labelled by position (0, 1, 2, ...) rather than by UserID.
preds_df = pd.DataFrame(all_user_predicted_ratings, columns = R_df.columns)
preds_df.head()

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,4.288861,0.143055,-0.19508,-0.018843,0.012232,-0.176604,-0.07412,0.141358,-0.059553,-0.19595,...,0.027807,0.00164,0.026395,-0.022024,-0.085415,0.403529,0.105579,0.031912,0.05045,0.08891
1,0.744716,0.169659,0.335418,0.000758,0.022475,1.35305,0.051426,0.071258,0.161601,1.567246,...,-0.056502,-0.013733,-0.01058,0.062576,-0.016248,0.15579,-0.418737,-0.101102,-0.054098,-0.140188
2,1.818824,0.456136,0.090978,-0.043037,-0.025694,-0.158617,-0.131778,0.098977,0.030551,0.73547,...,0.040481,-0.005301,0.012832,0.029349,0.020866,0.121532,0.076205,0.012345,0.015148,-0.109956
3,0.408057,-0.07296,0.039642,0.089363,0.04195,0.237753,-0.049426,0.009467,0.045469,-0.11137,...,0.008571,-0.005425,-0.0085,-0.003417,-0.083982,0.094512,0.057557,-0.02605,0.014841,-0.034224
4,1.574272,0.021239,-0.0513,0.246884,-0.032406,1.552281,-0.19963,-0.01492,-0.060498,0.450512,...,0.110151,0.04601,0.006934,-0.01594,-0.05008,-0.052539,0.507189,0.03383,0.125706,0.199244


In [36]:
def recommend_movies(predictions_df, userID, movies_df, original_ratings_df, num_recommendations=5):
    """Recommend the highest-predicted movies that a user has not rated yet.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Predicted ratings with one row per user and one column per MovieID.
        Rows are looked up by position: row ``userID - 1`` is treated as the
        row of ``userID`` (i.e. UserIDs are assumed to start at 1 and to be
        contiguous in row order).
    userID : int
        1-based ID of the user to recommend for.
    movies_df : pandas.DataFrame
        Movie information; must contain a 'MovieID' column.
    original_ratings_df : pandas.DataFrame
        Observed ratings; must contain 'UserID', 'MovieID' and 'Rating' columns.
    num_recommendations : int, default 5
        Number of movies to return.

    Returns
    -------
    user_full : pandas.DataFrame
        The user's existing ratings joined with the movie information,
        sorted by 'Rating' in descending order.
    recommendations : pandas.DataFrame
        Rows of ``movies_df`` for the not-yet-rated movies with the highest
        predicted ratings (the prediction column itself is not included).

    Raises
    ------
    IndexError
        If ``userID`` does not map to a row of ``predictions_df``.
    """
    user_row_number = userID - 1  # UserID starts at 1, not 0

    # Without this guard a userID <= 0 would yield a negative position, which
    # iloc silently resolves from the end of the frame (another user's row).
    if not 0 <= user_row_number < len(predictions_df):
        raise IndexError(
            'userID {0} is out of range for a predictions table with {1} rows.'.format(
                userID, len(predictions_df)
            )
        )

    # One row of predictions as a two-column frame: MovieID, Predictions.
    # Naming both columns explicitly avoids depending on the columns' name or
    # on the row label of predictions_df. No sorting is needed here because
    # the merged result is sorted below.
    user_predictions = (
        predictions_df.iloc[user_row_number]
        .rename_axis('MovieID')
        .reset_index(name='Predictions')
    )

    # Get the user's data and merge in the movie information.
    # how='left' keeps every rating of the user, like a SQL left outer join.
    user_data = original_ratings_df[original_ratings_df.UserID == userID]
    user_full = (
        user_data
        .merge(movies_df, how='left', on='MovieID')
        .sort_values(['Rating'], ascending=False)
    )

    print('User {0} has already rated {1} movies.'.format(userID, user_full.shape[0]))
    print('Recommending highest {0} predicted ratings movies not already rated.'.format(num_recommendations))

    # Keep only the movies the user has not rated yet.
    unseen_movies = movies_df[~movies_df['MovieID'].isin(user_full['MovieID'])]

    # Attach the predicted rating, rank by it, keep the top rows, and drop the
    # trailing 'Predictions' column so only the movie information is returned.
    recommendations = (
        unseen_movies
        .merge(user_predictions, how='left', on='MovieID')
        .sort_values(['Predictions'], ascending=False)
        .iloc[:num_recommendations, :-1]
    )

    return user_full, recommendations


In [37]:
# Ask for the 10 best unrated movies for user 837; also returns that user's
# existing ratings joined with the movie information.
already_rated, predictions = recommend_movies(preds_df, 837, movies_df, ratings_df, 10)

User 837 has already rated 69 movies.
Recommending highest 10 predicted ratings movies not already rated.


In [38]:
# The user's ten highest-rated movies (the frame is sorted by Rating, descending).
already_rated.head(10)

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres
36,837,858,5,975360036,"Godfather, The (1972)",Action|Crime|Drama
35,837,1387,5,975360036,Jaws (1975),Action|Horror
65,837,2028,5,975360089,Saving Private Ryan (1998),Action|Drama|War
63,837,1221,5,975360036,"Godfather: Part II, The (1974)",Action|Crime|Drama
11,837,913,5,975359921,"Maltese Falcon, The (1941)",Film-Noir|Mystery
20,837,3417,5,975360893,"Crimson Pirate, The (1952)",Adventure|Comedy|Sci-Fi
34,837,2186,4,975359955,Strangers on a Train (1951),Film-Noir|Thriller
55,837,2791,4,975360893,Airplane! (1980),Comedy
31,837,1188,4,975360920,Strictly Ballroom (1992),Comedy|Romance
28,837,1304,4,975360058,Butch Cassidy and the Sundance Kid (1969),Action|Comedy|Western


In [39]:
# The recommended movies, ordered from highest to lowest predicted rating.
predictions

Unnamed: 0,MovieID,Title,Genres
516,527,Schindler's List (1993),Drama|War
1848,1953,"French Connection, The (1971)",Action|Crime|Drama|Thriller
596,608,Fargo (1996),Crime|Drama|Thriller
1235,1284,"Big Sleep, The (1946)",Film-Noir|Mystery
2085,2194,"Untouchables, The (1987)",Action|Crime|Drama
1188,1230,Annie Hall (1977),Comedy|Romance
1198,1242,Glory (1989),Action|Drama|War
897,922,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Film-Noir
1849,1954,Rocky (1976),Action|Drama
581,593,"Silence of the Lambs, The (1991)",Drama|Thriller
