# Authors: Wenbo Fu (679744457), Bingyan Liu(668046518)
Both authors contribute equally

In [594]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

# System I: Recommendation based on Genres

In [26]:
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', engine = 'python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
movies = pd.read_csv('ml-1m/movies.dat', sep='::', engine = 'python',
                     encoding="ISO-8859-1", header = None)
movies.columns = ['MovieID', 'Title', 'Genres']

In [59]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [60]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   UserID     1000209 non-null  int64
 1   MovieID    1000209 non-null  int64
 2   Rating     1000209 non-null  int64
 3   Timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


In [561]:
#group by MovieID, find number of rating users and average rating
grouped_ratings = ratings.groupby('MovieID').agg({'UserID': 'nunique', 'Rating': 'mean'}).reset_index()
grouped_ratings.rename(columns={'UserID': 'num_users', 'Rating': 'average_rating'},inplace = True)

In [562]:
grouped_ratings.head()

Unnamed: 0,MovieID,num_users,average_rating
0,1,2077,4.1468464
1,2,701,3.2011412
2,3,478,3.0167364
3,4,170,2.7294118
4,5,296,3.0067568


In [62]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3883 entries, 0 to 3882
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   MovieID  3883 non-null   int64 
 1   Title    3883 non-null   object
 2   Genres   3883 non-null   object
 3   genre    3883 non-null   object
dtypes: int64(1), object(3)
memory usage: 121.5+ KB


In [63]:
#explode by genre
movies['genre'] = movies['Genres'].str.split('|')
exploded_genres_df = movies.explode('genre')

In [64]:
#merge the two tables on MovieID
grouped_genre = pd.merge(grouped_ratings, exploded_genres_df, on='MovieID', how = 'left')

In [65]:
grouped_genre

Unnamed: 0,MovieID,num_users,average_rating,Title,Genres,genre
0,1,2077,4.146846,Toy Story (1995),Animation|Children's|Comedy,Animation
1,1,2077,4.146846,Toy Story (1995),Animation|Children's|Comedy,Children's
2,1,2077,4.146846,Toy Story (1995),Animation|Children's|Comedy,Comedy
3,2,701,3.201141,Jumanji (1995),Adventure|Children's|Fantasy,Adventure
4,2,701,3.201141,Jumanji (1995),Adventure|Children's|Fantasy,Children's
...,...,...,...,...,...,...
6187,3949,304,4.115132,Requiem for a Dream (2000),Drama,Drama
6188,3950,54,3.666667,Tigerland (2000),Drama,Drama
6189,3951,40,3.900000,Two Family House (2000),Drama,Drama
6190,3952,388,3.780928,"Contender, The (2000)",Drama|Thriller,Drama


In [66]:
#average_num_users group by genre
average_users_per_genre = grouped_genre.groupby('genre')['num_users'].mean().reset_index()
average_users_per_genre = average_users_per_genre.rename(columns={'num_users': 'average_num_users'})

In [67]:
average_users_per_genre

Unnamed: 0,genre,average_num_users
0,Action,520.115152
1,Adventure,476.701068
2,Animation,412.314286
3,Children's,288.744
4,Comedy,306.603611
5,Crime,395.726368
6,Documentary,71.909091
7,Drama,237.460817
8,Fantasy,533.838235
9,Film-Noir,415.022727


In [69]:
grouped_genre = pd.merge(grouped_genre, average_users_per_genre, on='genre')

In [71]:
#use average_num_users as lower bounder users for highly rated movies
grouped_genre['lower_bound_users'] = grouped_genre['average_num_users'].astype(int)

In [72]:
grouped_genre

Unnamed: 0,MovieID,num_users,average_rating,Title,Genres,genre,average_num_users,lower_bound_users
0,1,2077,4.146846,Toy Story (1995),Animation|Children's|Comedy,Animation,412.314286,412
1,13,99,3.262626,Balto (1995),Animation|Children's,Animation,412.314286,412
2,48,382,2.976440,Pocahontas (1995),Animation|Children's|Musical|Romance,Animation,412.314286,412
3,239,168,2.875000,"Goofy Movie, A (1995)",Animation|Children's|Comedy|Romance,Animation,412.314286,412
4,244,10,1.900000,Gumby: The Movie (1995),Animation|Children's,Animation,412.314286,412
...,...,...,...,...,...,...,...,...
6187,3737,50,4.000000,Lonely Are the Brave (1962),Drama|Western,Western,308.701493,308
6188,3792,72,3.555556,Duel in the Sun (1946),Western,Western,308.701493,308
6189,3806,58,3.258621,MacKenna's Gold (1969),Western,Western,308.701493,308
6190,3871,305,3.839344,Shane (1953),Drama|Western,Western,308.701493,308


In [73]:
# The two functions find top frequent rated/top rating movies group by genre
def get_top_freq_movies(group):
    top_movies = group.nlargest(10, 'num_users')['Title']
    return '|'.join(top_movies.astype(str))


def get_top_rated_movies(group):
    qualified_movies = group[group['num_users'] >= group['lower_bound_users']]
    top_movies = qualified_movies.nlargest(10, 'average_rating')['Title']
    return '|'.join(top_movies.astype(str))

In [75]:
top_freq_bygenre = grouped_genre.groupby('genre').apply(get_top_freq_movies).reset_index(name='top_movies')

In [77]:
top_rate_bygenre = grouped_genre.groupby('genre').apply(get_top_rated_movies).reset_index(name='top_movies')

In [81]:
top_freq_bygenre

Unnamed: 0,genre,top_movies
0,Action,Star Wars: Episode IV - A New Hope (1977)|Star...
1,Adventure,Star Wars: Episode IV - A New Hope (1977)|Star...
2,Animation,Toy Story (1995)|Who Framed Roger Rabbit? (198...
3,Children's,E.T. the Extra-Terrestrial (1982)|Toy Story (1...
4,Comedy,American Beauty (1999)|Back to the Future (198...
5,Crime,Fargo (1996)|L.A. Confidential (1997)|Godfathe...
6,Documentary,Roger & Me (1989)|Hoop Dreams (1994)|Crumb (19...
7,Drama,American Beauty (1999)|Star Wars: Episode V - ...
8,Fantasy,Star Wars: Episode IV - A New Hope (1977)|E.T....
9,Film-Noir,L.A. Confidential (1997)|Blade Runner (1982)|W...


In [559]:
# The recommend function recommend based on genre, has two methods: by frequncy or rating
def recommendI(genre, method = 'Freq'):
    if method == 'Freq':
        return top_freq_bygenre[top_freq_bygenre['genre']==genre]['top_movies'].iloc[0].split('|')
    if method == 'Rate':
        return top_rate_bygenre[top_rate_bygenre['genre']==genre]['top_movies'].iloc[0].split('|')

In [560]:
recommendI('Action')

['Star Wars: Episode IV - A New Hope (1977)',
 'Star Wars: Episode V - The Empire Strikes Back (1980)',
 'Star Wars: Episode VI - Return of the Jedi (1983)',
 'Jurassic Park (1993)',
 'Saving Private Ryan (1998)',
 'Terminator 2: Judgment Day (1991)',
 'Matrix, The (1999)',
 'Men in Black (1997)',
 'Raiders of the Lost Ark (1981)',
 'Braveheart (1995)']

# System II : Recommendation based on IBCF

In [152]:
#pivot the rating table and return the user-movie matrix
rating_matrix = ratings.pivot(index='UserID', columns='MovieID', values='Rating')

In [153]:
rating_matrix

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,2.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,2.0,,3.0,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [188]:
# subtract mean from each row
row_means = rating_matrix.apply(lambda row: row.mean(skipna=True), axis=1)

centered_rating_matrix = rating_matrix.sub(row_means, axis=0)

In [201]:
# define similarity function for two vectors
def compute_similarity(arr1, arr2):
    nan_mask = (~np.isnan(arr1))&(~np.isnan(arr2))
    arr1_m = arr1[nan_mask]
    arr2_m = arr2[nan_mask]
    n, = arr1_m.shape
    if n <= 2:
        return np.nan
    else:
        return 0.5+0.5*np.dot(arr1_m, arr2_m) / (np.linalg.norm(arr1_m) * np.linalg.norm(arr2_m))

In [204]:
# compute the similarity matrix
S = np.zeros([3706,3706])

crm = centered_rating_matrix.to_numpy()

S = np.zeros([3706,3706])
for i in range(3706):
    for j in range(i,3706):
        arr1 = crm[:,i]
        arr2 = crm[:,j]
        S[i][j] = S[j][i] = compute_similarity(arr1, arr2)    
        
for i in range(3706):
    S[i][i] = np.nan

In [563]:
S_df = pd.DataFrame(S, index = centered_rating_matrix.columns, columns = centered_rating_matrix.columns)
S_df.to_csv('S_df.csv')

In [564]:
pd.set_option('display.precision', 7)
lst = [1,10,100,1510,260,3212]
S_df.loc[lst,lst]

MovieID,1,10,100,1510,260,3212
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,,0.5121055,0.3919999,,0.7411482,
10,0.5121055,,0.5474583,,0.5343338,
100,0.3919999,0.5474583,,,0.3296943,
1510,,,,,,
260,0.7411482,0.5343338,0.3296943,,,
3212,,,,,,


In [421]:
# Select top 30 ratings in each row and set the remaining to be nan
S_0 = np.nan_to_num(S, nan=0)
top_30_indices = np.argsort(S_0, axis=1)[:, -30:]
S_top = np.full_like(S_0, fill_value=np.nan, dtype=np.float64)

for i in range(S_0.shape[0]):
    S_top[i, top_30_indices[i]] = S_0[i, top_30_indices[i]]
S_top[S_top == 0] = np.nan

In [565]:
S_top_df = pd.DataFrame(S_top, index = centered_rating_matrix.columns, columns = centered_rating_matrix.columns)
S_top_df.loc[lst,lst]

MovieID,1,10,100,1510,260,3212
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,,,,,,
10,,,,,,
100,,,,,,
1510,,,,,,
260,,,,,,
3212,,,,,,


In [588]:
#define top 10 frequent rated movies
top10 = grouped_ratings.nlargest(10,'num_users')[['MovieID','average_rating']]\
.rename(columns={'average_rating':'rating'})
top10 = top10.merge(movies, on='MovieID',how = 'left')[['MovieID','Title','Genres','rating']]

In [589]:
top10

Unnamed: 0,MovieID,Title,Genres,rating
0,2858,American Beauty (1999),Comedy|Drama,4.3173862
1,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Fantasy|Sci-Fi,4.4536944
2,1196,Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Drama|Sci-Fi|War,4.2929766
3,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Romance|Sci-Fi|War,4.0228928
4,480,Jurassic Park (1993),Action|Adventure|Sci-Fi,3.7638473
5,2028,Saving Private Ryan (1998),Action|Drama|War,4.3373539
6,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller,4.0585126
7,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,4.3158301
8,1270,Back to the Future (1985),Comedy|Sci-Fi,3.9903213
9,593,"Silence of the Lambs, The (1991)",Drama|Thriller,4.3518231


In [590]:
#recommend base on top 10 similar ratings, where ratings are computed by the IBCF formula. Be sure to not include
#existing ratings from the input. recommendII and recommendIII would return the same, the difference is that
#recommendII raise warnings caused by 0/0. If there are less than 10 non-nan values in ratings, append the movies
#from top frequent rated movies.
def recommendII(w,S_top_df):
    w1 = np.nan_to_num(w, nan = 0)
    mask = ~np.isnan(w)
    w2 = mask.astype(int)
    S = np.nan_to_num(S_top_df.to_numpy(), nan = 0)
    rating = S@w1/(S@w2)
    rating[~np.isnan(w)] = np.nan
    df = pd.DataFrame(np.nan_to_num(rating, nan = 0), index = rating_matrix.columns)\
    .reset_index().rename(columns = {0:'rating'})
    df_candidate = df.nlargest(10,'rating').reset_index().drop('index',axis = 1)
    df_candidate = df_candidate[df_candidate['rating']>0]
    df_candidate = df_candidate.merge(movies, on='MovieID',how = 'left')[['MovieID','Title','Genres','rating']]
    l = len(df_candidate)
    if l < 10:
        return pd.concat([df_candidate, top10.iloc[:10-l,:]],axis=0)
    else:
        return df_candidate

In [554]:
def recommendIII(w,S_top_df):
    r = np.zeros(3706)
    S = S_top_df.to_numpy()
    mask_w = ~np.isnan(w)
    for l in range(3706):
        if mask_w[l] == True:
            r[l] = np.nan
            continue
        s = S[l][:]
        mask_s = ~np.isnan(s)
        mask = mask_w & mask_s
        if len(s[mask])==0 or len(w[mask])==0:
            r[l] = np.nan
        else:
            r[l] = np.dot(s[mask],w[mask])/np.sum(s[mask])
    df = pd.DataFrame(np.nan_to_num(r, nan = 0), index = rating_matrix.columns)\
    .reset_index().rename(columns = {0:'rating'})
    df_candidate = df.nlargest(10,'rating').reset_index().drop('index',axis = 1)
    df_candidate = df_candidate[df_candidate['rating']>0]
    df_candidate = df_candidate.merge(movies, on='MovieID',how = 'left')[['MovieID','Title','Genres','rating']]
    l = len(df_candidate)
    if l < 10:
        return pd.concat([df_candidate, top10.iloc[:10-l,:]],axis=0)
    else:
        return df_candidate

In [591]:
movieid = 1181
w = rating_matrix.loc[1181,:].to_numpy()
recommendII(w,S_top_df)

Unnamed: 0,MovieID,Title,Genres,rating
0,3732,"Fury, The (1978)",Horror,5.0
1,749,"Man from Down Under, The (1943)",Drama,4.5265592
2,3899,Circus (2000),Comedy,4.526066
3,249,Immortal Beloved (1994),Drama|Romance,4.0
4,337,What's Eating Gilbert Grape (1993),Drama,4.0
5,427,Boxing Helena (1993),Mystery|Romance|Thriller,4.0
6,504,No Escape (1994),Action|Sci-Fi,4.0
7,1039,Synthetic Pleasures (1995),Documentary,4.0
8,1235,Harold and Maude (1971),Comedy,4.0
9,1253,"Day the Earth Stood Still, The (1951)",Drama|Sci-Fi,4.0


In [592]:
movieid = 1351
w = rating_matrix.loc[1181,:].to_numpy()
recommendII(w,S_top_df)

Unnamed: 0,MovieID,Title,Genres,rating
0,3732,"Fury, The (1978)",Horror,5.0
1,749,"Man from Down Under, The (1943)",Drama,4.5265592
2,3899,Circus (2000),Comedy,4.526066
3,249,Immortal Beloved (1994),Drama|Romance,4.0
4,337,What's Eating Gilbert Grape (1993),Drama,4.0
5,427,Boxing Helena (1993),Mystery|Romance|Thriller,4.0
6,504,No Escape (1994),Action|Sci-Fi,4.0
7,1039,Synthetic Pleasures (1995),Documentary,4.0
8,1235,Harold and Maude (1971),Comedy,4.0
9,1253,"Day the Earth Stood Still, The (1951)",Drama|Sci-Fi,4.0


In [593]:
w = rating_matrix.loc[1,:]*np.nan
w[1613] = 5
w[1755] = 4
recommendII(w.to_numpy(),S_top_df)

Unnamed: 0,MovieID,Title,Genres,rating
0,46,How to Make an American Quilt (1995),Drama|Romance,5.0
1,74,Bed of Roses (1996),Drama|Romance,5.0
2,340,"War, The (1994)",Adventure|Drama,5.0
3,592,Batman (1989),Action|Adventure|Crime|Drama,5.0
4,765,Jack (1996),Comedy|Drama,5.0
5,1017,Swiss Family Robinson (1960),Adventure|Children's,5.0
6,3269,Forever Young (1992),Adventure|Romance|Sci-Fi,5.0
7,2,Jumanji (1995),Adventure|Children's|Fantasy,5.0
8,158,Casper (1995),Adventure|Children's,5.0
9,207,"Walk in the Clouds, A (1995)",Drama|Romance,5.0
