Aleksei Skopintsev

aleksei5

MCS-DS

No other contributors.

In [1]:
import pandas as pd
import numpy as np

## Logic:

### Scenario I:

For Scenario I, a simple logic of averaging ratings across all users is used and top 10 is returned. 

### Scenario II:

If there are fewer than 10 recommendations, the top n movies from the Comedy genre are appended, where n = max(0, 10 - #recommendations).
If there are no movies rated, then user will receive random movies as a suggestion.

In [2]:
def read_data(data_dir='./ml-1m'):
    """Load the MovieLens ratings and movies tables.

    Parameters
    ----------
    data_dir : str or path-like, default './ml-1m'
        Directory that contains ``ratings.dat`` and ``movies.dat``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``ratings`` with columns UserID, MovieID, Rating, Timestamp and
        ``movies`` with columns MovieID, Title, Genres.
    """
    import os  # local import keeps the function self-contained

    # '::' is a multi-character separator, which requires the python parser engine.
    ratings = pd.read_csv(os.path.join(data_dir, 'ratings.dat'), sep='::',
                          engine='python', header=None,
                          names=['UserID', 'MovieID', 'Rating', 'Timestamp'])

    # Titles contain non-ASCII characters, hence the explicit encoding.
    movies = pd.read_csv(os.path.join(data_dir, 'movies.dat'), sep='::',
                         engine='python', encoding="ISO-8859-1", header=None,
                         names=['MovieID', 'Title', 'Genres'])

    return ratings, movies

def get_movies_by_genre(genre:str,movies:pd.DataFrame):
    """Return the rows of ``movies`` whose ``Genres`` value equals ``genre``.

    The comparison is an exact string match on the whole ``Genres`` field, so a
    combined label such as "Comedy|Drama" is not returned for ``genre="Comedy"``.
    """
    is_requested_genre = movies['Genres'].eq(genre)
    return movies.loc[is_requested_genre]

def get_top_movies_by_rating(genre:str,ratings:pd.DataFrame,movies:pd.DataFrame):
    """Return the 10 movies of ``genre`` with the highest mean rating.

    Movies are selected with ``get_movies_by_genre``, joined to their ratings
    on ``MovieID`` and ranked by the plain mean of ``Rating`` (no minimum
    number of ratings is required). The result has the columns Title, MovieID,
    Genres and Rating, best movie first.
    """
    genre_movies = get_movies_by_genre(genre, movies)

    # Inner join: movies without any rating drop out here.
    rated = genre_movies[['MovieID', 'Title', 'Genres']].merge(
        ratings[['MovieID', 'Rating']], on='MovieID'
    )

    mean_rating = rated.groupby(['Title', 'MovieID', 'Genres'])['Rating'].mean()
    ranked = mean_rating.sort_values(ascending=False)

    return ranked.head(10).to_frame().reset_index()

def normalize_ratings(ratings):
    """Build the user x movie rating matrix and centre every user's ratings.

    ``ratings`` needs the columns UserID, MovieID and Rating. The result is
    indexed by UserID with one column per MovieID; each present rating has the
    user's mean rating subtracted and unrated movies stay NaN.
    """
    rating_matrix = ratings.pivot_table(index='UserID', columns='MovieID',
                                        values='Rating')

    # Per-user mean over the rated movies only (NaNs are skipped by default).
    user_means = rating_matrix.mean(axis=1)

    # Row-wise subtraction: align the means with the UserID index.
    return rating_matrix.subtract(user_means, axis='index')

In [3]:
# Load the ratings and movies tables used by every later cell.
ratings,movies=read_data()

In [4]:
# Preview the first rows to confirm the four columns were parsed as expected.
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


Make a DF with top ratings by genre and save it

In [5]:
# One entry per distinct value of the Genres column; a combined label such as
# "Animation|Children's|Comedy" counts as its own genre.
distinct_genre=movies['Genres'].unique()

# Collect the per-genre top-10 tables first and concatenate once: calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration.
top_frames=[get_top_movies_by_rating(genre,ratings,movies) for genre in distinct_genre]
top_movies_by_genre=pd.concat(top_frames) if top_frames else pd.DataFrame()

top_movies_by_genre.to_csv('top_movies_by_genre.csv',index=False)


Read back top_movies_by_genre to use later in the code (and check if everything is working)

In [6]:
# Reload the table from disk so later cells work with exactly what was saved,
# then preview the first rows as a sanity check.
top_movies_by_genre=pd.read_csv('./top_movies_by_genre.csv')
top_movies_by_genre.head()


Unnamed: 0,Title,MovieID,Genres,Rating
0,Toy Story 2 (1999),3114,Animation|Children's|Comedy,4.218927
1,Toy Story (1995),1,Animation|Children's|Comedy,4.146846
2,Chicken Run (2000),3751,Animation|Children's|Comedy,3.879609
3,"Bug's Life, A (1998)",2355,Animation|Children's|Comedy,3.854375
4,"American Tail, An (1986)",2141,Animation|Children's|Comedy,3.428218


In [7]:
# User x movie matrix of mean-centred ratings (NaN where a user has not rated a movie).
normalized_ratings=normalize_ratings(ratings)

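Computing the required cosine similarity is significantly faster in R than in Python, so the next cells pass the normalized rating matrix to R (via rpy2), compute the similarity there, and bring the result back into Python.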

In [8]:
%load_ext rpy2.ipython

In [9]:
%%R
# install.packages("coop")
library('coop')

In [10]:
# Push the Python variable normalized_ratings into the R session (rpy2 `-i` = input).
%R -i normalized_ratings

In [11]:
%%R
# Cosine similarity between the columns of the rating matrix. The columns are
# MovieIDs, so the result is a movie x movie matrix.
# use='pairwise.complete.obs' is presumably what restricts each pair to the
# users who rated both movies - confirm against the coop documentation.
cosine_similarity <- cosine(as.matrix(normalized_ratings), use='pairwise.complete.obs')


In [12]:
# Pull the similarity matrix computed in R back into the Python namespace
# (rpy2 `-o` = output).

%R -o cosine_similarity

In [13]:
# Rescale cosine similarity from [-1, 1] to [0, 1]; NaN entries stay NaN.
normalized_cosine_similarity = (cosine_similarity + 1) / 2

  normalized_cosine_similarity = (cosine_similarity + 1) / 2


In [14]:
# Indicator matrix: 1 where the user rated the movie, 0 where the rating is missing.
user_movie_matrix = normalized_ratings.notna().astype(int)
user_movie_matrix

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
UserID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6037,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6038,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6039,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Entry (i, j) counts the users who rated both movie i and movie j, so the
# diagonal holds the number of users who rated each movie.
users_count=np.dot(user_movie_matrix.T, user_movie_matrix)

In [16]:
# Create a mask: True for movie pairs that at least 3 users rated in common.
valid_pairs_mask = users_count >= 3

In [17]:
# Apply the mask to the cosine similarity matrix: pairs that fail the
# co-rating threshold become NaN. Rows and columns are both labelled by MovieID.
cosine_similarity_df = pd.DataFrame(np.where(valid_pairs_mask, normalized_cosine_similarity, np.nan),
                                    index=normalized_ratings.columns, columns=normalized_ratings.columns)


In [18]:
# Display the masked similarity matrix (the diagonal is still 1 at this point).
cosine_similarity_df

MovieID,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
MovieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.442636,0.410766,0.404020,0.362457,0.579168,0.475257,0.303379,0.188966,0.512106,...,0.381834,0.089488,0.227803,0.270004,0.649773,0.548122,0.702972,0.555231,0.703109,0.600869
2,0.442636,1.000000,0.546743,0.460495,0.648002,0.412014,0.571488,0.488686,0.689619,0.541504,...,0.398421,,0.441406,0.685420,0.261454,0.553910,0.323460,0.316563,0.223420,0.421348
3,0.410766,0.546743,1.000000,0.640756,0.677383,0.437453,0.547960,0.669407,0.587030,0.568835,...,0.439756,0.458236,0.385446,0.703825,0.503654,0.528168,0.284015,0.515971,0.490683,0.427316
4,0.404020,0.460495,0.640756,1.000000,0.710145,0.318943,0.513088,0.638414,0.494078,0.369146,...,0.764084,,,0.907509,0.348647,0.457428,0.168765,0.621891,0.660954,0.449424
5,0.362457,0.648002,0.677383,0.710145,1.000000,0.388942,0.587943,0.608642,0.758438,0.503592,...,0.131195,0.509458,0.601690,0.855912,0.644852,0.590165,0.239982,0.517104,,0.347755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.548122,0.553910,0.528168,0.457428,0.590165,0.534482,0.477317,0.323261,0.441275,0.574412,...,0.465711,0.536067,0.448532,0.515220,0.369859,1.000000,0.491852,0.425333,0.307334,0.506530
3949,0.702972,0.323460,0.284015,0.168765,0.239982,0.615329,0.355397,0.110612,0.213767,0.439866,...,0.563115,0.301218,0.252580,0.295247,0.657607,0.491852,1.000000,0.594070,0.656516,0.566232
3950,0.555231,0.316563,0.515971,0.621891,0.517104,0.419051,0.649063,0.796771,,0.589969,...,0.480608,0.233652,0.020660,0.502494,0.532813,0.425333,0.594070,1.000000,0.662502,0.573728
3951,0.703109,0.223420,0.490683,0.660954,,0.380572,0.676682,0.131729,,0.022272,...,0.426416,0.387007,0.173395,0.398000,,0.307334,0.656516,0.662502,1.000000,0.690165


In [19]:
# Set the diagonal to NaN so a movie is never ranked as its own neighbour.
# DataFrame.mask builds a new frame instead of writing through `.values`,
# which is not guaranteed to be a writable view of the frame's data (it is
# read-only under pandas Copy-on-Write).
cosine_similarity_df = cosine_similarity_df.mask(
    np.eye(len(cosine_similarity_df), dtype=bool)
)

In [20]:
# Relabel both axes from the numeric MovieID to the string form 'm<MovieID>'.
# Re-running this cell would add a second 'm' prefix.
cosine_similarity_df.columns=[f'm{movie_id}' for movie_id in cosine_similarity_df.columns]
cosine_similarity_df.index=[f'm{movie_id}' for movie_id in cosine_similarity_df.index]
cosine_similarity_df

Unnamed: 0,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,...,m3943,m3944,m3945,m3946,m3947,m3948,m3949,m3950,m3951,m3952
m1,,0.442636,0.410766,0.404020,0.362457,0.579168,0.475257,0.303379,0.188966,0.512106,...,0.381834,0.089488,0.227803,0.270004,0.649773,0.548122,0.702972,0.555231,0.703109,0.600869
m2,0.442636,,0.546743,0.460495,0.648002,0.412014,0.571488,0.488686,0.689619,0.541504,...,0.398421,,0.441406,0.685420,0.261454,0.553910,0.323460,0.316563,0.223420,0.421348
m3,0.410766,0.546743,,0.640756,0.677383,0.437453,0.547960,0.669407,0.587030,0.568835,...,0.439756,0.458236,0.385446,0.703825,0.503654,0.528168,0.284015,0.515971,0.490683,0.427316
m4,0.404020,0.460495,0.640756,,0.710145,0.318943,0.513088,0.638414,0.494078,0.369146,...,0.764084,,,0.907509,0.348647,0.457428,0.168765,0.621891,0.660954,0.449424
m5,0.362457,0.648002,0.677383,0.710145,,0.388942,0.587943,0.608642,0.758438,0.503592,...,0.131195,0.509458,0.601690,0.855912,0.644852,0.590165,0.239982,0.517104,,0.347755
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
m3948,0.548122,0.553910,0.528168,0.457428,0.590165,0.534482,0.477317,0.323261,0.441275,0.574412,...,0.465711,0.536067,0.448532,0.515220,0.369859,,0.491852,0.425333,0.307334,0.506530
m3949,0.702972,0.323460,0.284015,0.168765,0.239982,0.615329,0.355397,0.110612,0.213767,0.439866,...,0.563115,0.301218,0.252580,0.295247,0.657607,0.491852,,0.594070,0.656516,0.566232
m3950,0.555231,0.316563,0.515971,0.621891,0.517104,0.419051,0.649063,0.796771,,0.589969,...,0.480608,0.233652,0.020660,0.502494,0.532813,0.425333,0.594070,,0.662502,0.573728
m3951,0.703109,0.223420,0.490683,0.660954,,0.380572,0.676682,0.131729,,0.022272,...,0.426416,0.387007,0.173395,0.398000,,0.307334,0.656516,0.662502,,0.690165


In [21]:
def display_pairwise_similarity(S, movie_ids):
    """Return the similarities among ``movie_ids`` rounded to 7 decimals.

    ``movie_ids`` are row/column labels of ``S`` (looked up with ``.loc``); the
    result keeps them in the given order on both axes.
    """
    selected = S.loc[movie_ids, movie_ids]
    ordered = selected.reindex(index=movie_ids, columns=movie_ids)
    return ordered.round(7)

# Movies are looked up by their 'm<MovieID>' labels, not by position in the matrix.
movie_ids = ['m1', 'm10', 'm100', 'm1510', 'm260', 'm3212']
pairwise_similarities = display_pairwise_similarity(cosine_similarity_df, movie_ids)
print(pairwise_similarities)

             m1       m10      m100  m1510      m260  m3212
m1          NaN  0.512105  0.392000    NaN  0.741148    NaN
m10    0.512105       NaN  0.547458    NaN  0.534334    NaN
m100   0.392000  0.547458       NaN    NaN  0.329694    NaN
m1510       NaN       NaN       NaN    NaN       NaN    NaN
m260   0.741148  0.534334  0.329694    NaN       NaN    NaN
m3212       NaN       NaN       NaN    NaN       NaN    NaN


In [22]:
def top_30_similarities(row):
    """Keep the 30 largest similarities of ``row`` and set the rest to NaN.

    A new Series is returned; ``row`` itself is left untouched. NaN entries
    rank last, so a row with fewer than 30 values keeps all of them.
    """
    # Labels ordered from most to least similar; NaNs are pushed to the end.
    ranked_labels = row.fillna(-np.inf).sort_values(ascending=False).index
    keep = row.index.isin(ranked_labels[:30])

    # where() keeps the flagged entries and fills every other position with NaN.
    return row.where(keep)

# Apply the filter row by row (axis=1): every movie keeps only its 30 most similar movies.
top_30_similarity_matrix = cosine_similarity_df.apply(top_30_similarities, axis=1)


In [23]:
# Save matrix. index=False leaves the row labels out of the file, so only the
# 'm<MovieID>' column header is stored; rows are in the same movie order as the columns.
top_30_similarity_matrix.to_csv('S.csv',index=False)

In [24]:
def make_user_rating(movie_ratings,R):
    """Align one user's ratings with the columns of ``R``.

    ``R`` has columns labelled 'm<MovieID>' and ``movie_ratings`` has the
    columns MovieID and Rating. The result is an array with one entry per
    column of ``R``, in column order, holding NaN for movies the user did not rate.
    """
    # Strip the leading 'm' to recover the numeric MovieID of every column.
    column_ids = pd.DataFrame({'MovieID': [int(label[1:]) for label in R.columns]})

    # A left join keeps every column of R, so unrated movies come out as NaN.
    aligned = column_ids.merge(movie_ratings, how='left')

    return aligned['Rating'].values

def make_new_user_rating(new_user_ratings:dict, n_movies:int=3706, columns=None):
    """Build the rating vector of a new user from a ``{movie: rating}`` dict.

    Parameters
    ----------
    new_user_ratings : dict
        Maps a movie to its rating. Without ``columns`` the keys are integer
        positions in the vector; with ``columns`` they are column labels.
    n_movies : int, default 3706
        Length of the vector when ``columns`` is not given.
    columns : sequence, optional
        Column labels of the similarity matrix (e.g. ``S.columns``). When
        given, the vector has one entry per label and keys are looked up by label.

    Returns
    -------
    numpy.ndarray
        Float vector holding the given ratings and NaN everywhere else.

    Raises
    ------
    KeyError
        If ``columns`` is given and a key is not one of its labels.
    """
    if columns is not None:
        position_of = {label: pos for pos, label in enumerate(columns)}
        hypthetical_ratings = np.full(len(position_of), np.nan)
        for label, rating in new_user_ratings.items():
            hypthetical_ratings[position_of[label]] = rating
        return hypthetical_ratings

    hypthetical_ratings = np.full(n_movies, np.nan)
    for position, rating in new_user_ratings.items():
        hypthetical_ratings[position] = rating

    # The vector has to be returned; without this the caller receives None.
    return hypthetical_ratings
    


In [25]:

def myICBF(user,S,top_movies_by_genre,print_prediction=False):
    """Recommend movies for one user with item-based collaborative filtering.

    For every movie i the predicted rating is the similarity-weighted average
    of the user's own ratings, sum_j S[i, j] * r_j / sum_j S[i, j], where j
    runs over the movies the user rated and for which S[i, j] is not NaN.

    Parameters
    ----------
    user : numpy.ndarray
        1-D float vector with one entry per column of ``S``, in column order;
        NaN marks a movie the user has not rated.
    S : pandas.DataFrame
        Square similarity matrix with columns labelled 'm<MovieID>' and rows in
        the same movie order as the columns.
    top_movies_by_genre : pandas.DataFrame
        Table with at least the columns Genres and MovieID; the rows whose
        Genres equals 'Comedy' supply the fallback recommendations.
    print_prediction : bool, default False
        When True, print the 10 highest predicted ratings.

    Returns
    -------
    list of int
        MovieIDs of unrated movies ordered by decreasing predicted rating, at
        most 10. When fewer than 10 predictions exist, the list is topped up
        with the leading 'Comedy' rows of ``top_movies_by_genre``; in
        particular a user without any rating receives only those fallback
        movies. NOTE: the fallback picks are not checked against the movies the
        user already rated.
    """
    n_wanted = 10
    unrated = np.isnan(user)

    # Row i: sum over j of S[i, j] * r_j (NaN products are skipped by sum()).
    numerator = (S * user).sum(axis=1)

    # Same weights with every non-zero rating replaced by 1.
    rated_indicator = user.copy()
    rated_indicator[(rated_indicator != 0) & ~unrated] = 1
    denominator = (S * rated_indicator).sum(axis=1)

    # Movies without any rated neighbour give 0 / 0 = NaN and are dropped below.
    pred = numerator / denominator
    pred.index = S.columns

    # Only unrated movies are candidates; best prediction first.
    pred = pred[unrated].sort_values(ascending=False).dropna()

    if print_prediction:
        print(pred[:n_wanted])

    # Strip the leading 'm' to turn the labels back into numeric MovieIDs.
    recommended = [int(label[1:]) for label in pred.index[:n_wanted]]

    if len(recommended) < n_wanted:
        # Too few predictions: top up with the best-rated Comedy movies.
        is_comedy = top_movies_by_genre['Genres'] == 'Comedy'
        fallback = top_movies_by_genre.loc[is_comedy, 'MovieID']
        recommended += fallback.iloc[:n_wanted - len(recommended)].tolist()

    return recommended


In [26]:
# Reload the saved similarity matrix. The file was written without row labels,
# so rows get a default integer index while the columns keep their 'm<MovieID>' labels.
S=pd.read_csv('./S.csv')

In [28]:
# Recommendations for existing user "u1181" from the rating matrix R
test_1181=(
    ratings.loc[ratings['UserID']==1181, ['MovieID','Rating']]
    .sort_values('MovieID')
)
ratings_1181=make_user_rating(test_1181,S)

suggestion=myICBF(ratings_1181,S,top_movies_by_genre,print_prediction=True)
print('\nBelow are the movie names:\n')
# Look up and print the title of every recommended MovieID, in ranking order.
for movie_id in suggestion:
    print(movies.loc[movies['MovieID']==movie_id,'Title'].tolist())

m3732    5.000000
m749     4.526559
m3899    4.526066
m1235    4.000000
m2082    4.000000
m2793    4.000000
m1914    4.000000
m1253    4.000000
m504     4.000000
m3789    4.000000
dtype: float64

Below are the movie names:

['Fury, The (1978)']
['Man from Down Under, The (1943)']
['Circus (2000)']
['Harold and Maude (1971)']
['Mighty Ducks, The (1992)']
['American Werewolf in Paris, An (1997)']
['Smoke Signals (1998)']
['Day the Earth Stood Still, The (1951)']
['No Escape (1994)']
['Pawnbroker, The (1965)']


In [29]:
# Recommendations for existing user "u1351" from the rating matrix R
test_1351=(
    ratings.loc[ratings['UserID']==1351, ['MovieID','Rating']]
    .sort_values('MovieID')
)
ratings_1351=make_user_rating(test_1351,S)

suggestion=myICBF(ratings_1351,S,top_movies_by_genre,print_prediction=True)
print('\nBelow are the movie names:\n')
# Look up and print the title of every recommended MovieID, in ranking order.
for movie_id in suggestion:
    print(movies.loc[movies['MovieID']==movie_id,'Title'].tolist())



m2127    5.0
m1514    5.0
m2061    5.0
m1871    5.0
m1901    5.0
m3887    5.0
m1102    5.0
m2156    5.0
m3232    5.0
m2063    5.0
dtype: float64

Below are the movie names:

['First Love, Last Rites (1997)']
['Temptress Moon (Feng Yue) (1996)']
['Full Tilt Boogie (1997)']
['Friend of the Deceased, A (1997)']
['Dear Jesse (1997)']
['Went to Coney Island on a Mission From God... Be Back by Five (1998)']
['American Strays (1996)']
['Best Man, The (Il Testimone dello sposo) (1997)']
['Seven Chances (1925)']
['Seventh Heaven (Le Septième ciel) (1997)']


In [30]:
# A hypothetical user who rates movie "m1613" with 5 and movie "m1755" with 4;
# every other movie stays unrated (NaN).
hypthetical_ratings=np.full(S.columns.shape[0],np.nan)
hypthetical_ratings[S.columns=='m1613']=5
hypthetical_ratings[S.columns=='m1755']=4

suggestion=myICBF(hypthetical_ratings,S,top_movies_by_genre,print_prediction=True)
print('\nBelow are the movie names:\n')
# Look up and print the title of every recommended MovieID, in ranking order.
for movie_id in suggestion:
    print(movies.loc[movies['MovieID']==movie_id,'Title'].tolist())


m2805    5.0
m46      5.0
m592     5.0
m1017    5.0
m691     5.0
m3269    5.0
m2548    5.0
m2771    5.0
m2750    5.0
m2718    5.0
dtype: float64

Below are the movie names:

['Mickey Blue Eyes (1999)']
['How to Make an American Quilt (1995)']
['Batman (1989)']
['Swiss Family Robinson (1960)']
['Mrs. Winterbourne (1996)']
['Forever Young (1992)']
['Rage: Carrie 2, The (1999)']
['Brokedown Palace (1999)']
['Radio Days (1987)']
['Drop Dead Gorgeous (1999)']
