Ammar Alzureiqi - ammar3@illinois.edu Ahmed Elfarra - ahmedse2@illinois.edu

Website link: https://movie-recommender-aa.streamlit.app/

In [3]:
import pandas as pd
import numpy as np
import requests
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
import heapq

# Helper Functions

Not all used in this file, but used in the streamlit web app

In [1]:
def get_displayed_movies():
    return movies.head(50)

def get_recommended_movies(new_user_ratings):
    return movies.head(10)

def get_popular_movies(genre: str):
    if genre == genres[1]:
        return movies.head(10)
    else: 
        return movies[10:20]

def get_movie_image_url(movie_id):
    img = Image.open(f"movies_folder/{movie_id}.jpg")
    return img

# Function to get top movies for the selected genre
def get_top_movies(genre, movie_stats, movies, top_n=10):
    # Filter movies by the selected genre
    genre_movies = movies[movies['Genres'].str.contains(genre)]
    
    # Join with the movie stats
    genre_movies_stats = genre_movies.join(movie_stats, on='MovieID')
    
    # Filter movies with more than a threshold of ratings to avoid movies with few high ratings
    popular_movies = genre_movies_stats[genre_movies_stats['count'] > 100]
    
    # Get the top N movies by average rating
    top_movies = popular_movies.sort_values(by='mean', ascending=False).head(top_n)
    return top_movies

def rating_to_stars(rating):
    full_stars = int(rating)
    half_star = "⭐" if rating - full_stars >= 0.5 else ""
    return '⭐' * full_stars + half_star

def myIBCF(newuser):
    S_top30 = pd.read_csv('modified_similarity_matrix.csv')
    S_top30 = S_top30.iloc[:,1:]
    pred = pd.DataFrame(index=range(3706))
    
    pred = []
    for i in range(3706):
        Sl = S_top30.iloc[:, i]
        if sum(np.isfinite(Sl) & np.isfinite(newuser)) == 1:
            result = Sl[(np.isfinite(Sl) & np.isfinite(newuser))] * newuser[(np.isfinite(Sl) & np.isfinite(newuser))]
            result = (1 / np.nansum(Sl[np.isfinite(newuser)])) * result
            pred.append(result.iloc[0])
        elif sum(np.isfinite(Sl) & np.isfinite(newuser)) == 0:
            result = 0
            pred.append(result)
        else:
            result = np.nansum(np.multiply(Sl, newuser))
            result = (1 / np.nansum(Sl[np.isfinite(newuser)])) * result
            pred.append(result)
    print(pred)
    nan_positions = np.isnan(newuser)
    
    # Replace NaN values in 'w' with corresponding values from 'pred'
    newuser = [pred_value if is_nan else w_value for w_value, pred_value, is_nan in zip(newuser, pred, nan_positions)]
    
    top_10_indices_and_values = heapq.nlargest(10, enumerate(newuser), key=lambda x: x[1])
    
    # Unpack the result into separate lists of indices and values
    top_10_indices, top_10_values = zip(*top_10_indices_and_values)

    return S_top30.columns[list(top_10_indices)]

# System 1

Here we select the top 10 movies of each genre based on a weighted average of the ratings via:

(mean_ratings * count) / (count + median)

where:

- mean_ratings is the mean of the ratings for the movie
- count is the number of reviews for this movie
- median is the median number of reviews for movies in this genre
  
This weighted rating is how we deal with movies that receive only one 5-point review be considered highly rated and such.

(notes: the get_top_movies function was altered from the one above used in streamlit in order for it to work in the notebook)

In [3]:
def get_top_movies(genre, movie_stats, movies):
    genre_movies = movies[movies['Genres'].str.contains(genre)]
    merged_data = pd.merge(genre_movies, movie_stats, left_on='MovieID', right_index=True)
    top_movies = merged_data.sort_values(by='weighted_mean', ascending=False).head(10)
    return top_movies

movies_columns = ['MovieID', 'Title', 'Genres']
movies = pd.read_csv('data/movies.dat', sep='::', engine='python', names=movies_columns, encoding='ISO-8859-1')

ratings_columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
ratings = pd.read_csv('data/ratings.dat', sep='::', engine='python', names=ratings_columns, encoding='ISO-8859-1')

genre = 'Sci-Fi'  # Replace this with your preferred genre

# Load and preprocess the data
data = pd.merge(ratings, movies, on='MovieID')
movie_stats = data.groupby('MovieID').agg({'Rating': ['mean', 'count']})
movie_stats.columns = ['mean', 'count']
genre_median_ratings = data.groupby('Genres')['Rating'].count().median()
movie_stats['weighted_mean'] = (
    (movie_stats['mean'] * movie_stats['count']) / (movie_stats['count'] + genre_median_ratings)
)

top_movies = get_top_movies(genre, movie_stats, movies)

print(f"Top 10 Highly-Rated Movies in {genre}")
for _, row in top_movies.iterrows():
    movie_id = row['MovieID']
    title = row['Title']
    rating = row['weighted_mean']
    image_url = get_movie_image_url(movie_id)

    print(f"{title}")
    print(f"Rating: {rating_to_stars(rating)} ({rating:.1f})")
    print(image_url)
    print("\n")

Top 10 Highly-Rated Movies in Sci-Fi
Star Wars: Episode IV - A New Hope (1977)
Rating: ⭐⭐⭐ (3.4)
<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=185x278 at 0x1578EDDD0>


Star Wars: Episode V - The Empire Strikes Back (1980)
Rating: ⭐⭐⭐ (3.2)
<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=185x278 at 0x157811CD0>


Matrix, The (1999)
Rating: ⭐⭐⭐ (3.1)
<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=185x278 at 0x1578FFE50>


Star Wars: Episode VI - Return of the Jedi (1983)
Rating: ⭐⭐⭐ (3.0)
<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=185x278 at 0x1578AB150>


Terminator 2: Judgment Day (1991)
Rating: ⭐⭐⭐ (3.0)
<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=185x278 at 0x1578A9E10>


Back to the Future (1985)
Rating: ⭐⭐⭐ (2.9)
<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=185x278 at 0x1578FD490>


Terminator, The (1984)
Rating: ⭐⭐⭐ (2.8)
<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=185x278 at 0x1578AA890>


Alien (1979)
Rating: ⭐

# System 2

In [7]:
# Step 1: Load and Normalize the rating matrix
R = pd.read_csv('ratings.csv', index_col=0)
# R_normalized = R.sub(R.mean(axis=1), axis=0).fillna(0)
R_centered = R.sub(R.mean(axis=1), axis=0)

FileNotFoundError: [Errno 2] No such file or directory: 'ratings.csv'

This code creates the cosine similarity and writes it to a separate file

In [None]:
S = pd.DataFrame(index=R_centered.columns, columns=R_centered.columns)
num_movies = len(R_centered.columns)

for i in range(num_movies):
    for j in range(i, num_movies):  # Note: Only compute upper triangle to avoid redundancy
        users_ij = R_centered.index[(R_centered.iloc[:, i].notna()) & (R_centered.iloc[:, j].notna())]

        if len(users_ij) > 2:
            # Extract columns as 1D arrays
            movie_i_ratings = R_centered.loc[users_ij, R_centered.columns[i]].values.reshape(1, -1)
            movie_j_ratings = R_centered.loc[users_ij, R_centered.columns[j]].values.reshape(1, -1)

            cos_sim = cosine_similarity(movie_i_ratings, movie_j_ratings)
            similarity = (1 + cos_sim[0][0]) / 2
            S.at[R_centered.columns[i], R_centered.columns[j]] = similarity
            S.at[R_centered.columns[j], R_centered.columns[i]] = similarity

# Save the similarity matrix to a CSV file
S.to_csv('cosine_similarity_matrix.csv')

In [5]:
# S.to_csv('similarity_matrix.csv')
S = pd.read_csv('cosine_similarity_matrix.csv', index_col=0)

In [6]:
np.fill_diagonal(S.values, np.nan)

In [7]:
# Sort each row, keeping the top 30 values and setting the rest to NA
S_top30 = pd.DataFrame(np.nan, index=S.index, columns=S.columns)
for i in S.index:
    row = S.loc[i, :].copy()
    row_sorted = row.sort_values(ascending=False)
    S_top30.loc[i, row_sorted.index[:30]] = row_sorted[:30]

# Save the modified similarity matrix to a CSV file
S_top30.to_csv('modified_similarity_matrix.csv')

In [8]:
movies_to_display = ["m1", "m10", "m100", "m1510", "m260", "m3212"]
pairwise_similarities = S.loc[movies_to_display, movies_to_display]
print(pairwise_similarities.round(7))

             m1       m10      m100  m1510      m260  m3212
m1          NaN  0.512105  0.392000    NaN  0.741148    NaN
m10    0.512105       NaN  0.547458    NaN  0.534334    NaN
m100   0.392000  0.547458       NaN    NaN  0.329694    NaN
m1510       NaN       NaN       NaN    NaN       NaN    NaN
m260   0.741148  0.534334  0.329694    NaN       NaN    NaN
m3212       NaN       NaN       NaN    NaN       NaN    NaN


In [9]:
S_top30 = pd.read_csv('modified_similarity_matrix.csv')
S_top30 = S_top30.iloc[:,1:]

In [10]:
user_hypothetical = (R.loc['u1351', :])
user_hypothetical['m1613'] = 5
user_hypothetical['m1755'] = 4

In [2]:
print(myIBCF(R.loc['u1181'].values))
print(myIBCF(R.loc['u1351'].values))
print(myIBCF(user_hypothetical))

NameError: name 'R' is not defined

### This last segment is just to imitate what would happen in the web app

In [12]:
temp = [np.nan]*50
full_list = temp + [np.nan] * (3706 - len(temp))
full_list[2] = 3
full_list = pd.Series(full_list)

In [13]:
templist = myIBCF(full_list)
cleaned_list = [int(index[1:]) for index in templist]
cleaned_list

[2626, 2994, 691, 100, 1085, 113, 1572, 1582, 1585, 1622]

In [14]:
movies_columns = ['movie_id', 'title', 'genres']
movies = pd.read_csv('data/movies.dat', sep='::', engine='python', names=movies_columns, encoding='ISO-8859-1')
movies['movie_id'] = movies['movie_id'].astype(int)

In [15]:
tempmovies = movies[movies['movie_id'].isin(cleaned_list)]

# Reorder the rows in the tempmovies DataFrame based on the shuffled list
tempmovies.loc[:, 'movie_id'] = pd.Categorical(tempmovies['movie_id'], categories=cleaned_list, ordered=True)
tempmovies = tempmovies.sort_values(by='movie_id').reset_index(drop=True)
tempmovies

Unnamed: 0,movie_id,title,genres
0,2626,Edge of Seventeen (1998),Comedy|Drama|Romance
1,2994,"City, The (1998)",Drama
2,691,Mrs. Winterbourne (1996),Comedy|Romance
3,100,City Hall (1996),Drama|Thriller
4,1085,"Old Man and the Sea, The (1958)",Adventure|Drama
5,113,Before and After (1996),Drama|Mystery
6,1572,Contempt (Le Mépris) (1963),Drama
7,1582,Wild America (1997),Adventure|Children's
8,1585,Love Serenade (1996),Comedy
9,1622,Kicked in the Head (1997),Comedy|Drama
