In [1]:
# Import necessary libraries
import re
import unicodedata

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ipywidgets as widgets
from IPython.display import display

In [2]:
# Locations of the movie and ratings CSV files
movies_file_path = "C:/Users/16474/Desktop/ml-latest/movies.csv"
ratings_file_path = "C:/Users/16474/Desktop/ml-latest/ratings.csv"

# Load the movies table as-is
movies_df = pd.read_csv(movies_file_path)

# Load the ratings table and discard its 'timestamp' column in the same step
ratings_df = pd.read_csv(ratings_file_path).drop(columns='timestamp')

In [22]:


# Create 'clean_title' by stripping the trailing release year,
# e.g. "Toy Story (1995)" -> "Toy Story".
# The pattern is anchored to the end of the title and requires exactly four
# digits, so a parenthesised number that is part of the name is kept:
# "(500) Days of Summer (2009)" -> "(500) Days of Summer".
movies_df['clean_title'] = movies_df['title'].str.replace(r'\s*\(\d{4}\)\s*$', '', regex=True)

# Split the pipe-separated 'genres' column into one 0/1 indicator column per genre
genres_split = movies_df['genres'].str.get_dummies('|')

# Replace the raw 'genres' column with the indicator columns
movies_with_genres_df = pd.concat([movies_df.drop(columns=['genres']), genres_split], axis=1)

# Display the first rows of the result
movies_with_genres_df.head()

Unnamed: 0,movieId,title,clean_title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Toy Story,0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Jumanji,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Grumpier Old Men,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Waiting to Exhale,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Father of the Bride Part II,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:

# Summary statistics (count, mean, std, min, quartiles, max) of the 'rating' column
ratings_statistics = ratings_df['rating'].describe()

# Print a heading followed by the statistics
print("Ratings Statistics:", ratings_statistics, sep="\n")


Ratings Statistics:
count    3.383216e+07
mean     3.542540e+00
std      1.063959e+00
min      5.000000e-01
25%      3.000000e+00
50%      4.000000e+00
75%      4.000000e+00
max      5.000000e+00
Name: rating, dtype: float64


In [4]:
# Number of rating rows per movieId
movie_rating_counts = (
    ratings_df
    .groupby('movieId')
    .size()
)

# Print a heading followed by the per-movie counts
print("Number of times each movie has been rated:", movie_rating_counts, sep="\n")


Number of times each movie has been rated:
movieId
1         76813
2         30209
3         15820
4          3028
5         15801
          ...  
288967        1
288971        1
288975        1
288977        1
288983        1
Length: 83239, dtype: int64


In [5]:
# Rating count per movieId; value_counts() orders the result by descending count
movie_rating_counts = ratings_df['movieId'].value_counts()

# The first ten entries are therefore the ten most-rated movies
top_10_most_rated_movies = movie_rating_counts.iloc[:10]

# Print a heading followed by the top ten
print("Top 10 most rated movies:", top_10_most_rated_movies, sep="\n")


Top 10 most rated movies:
318     122296
356     113581
296     108756
2571    107056
593     101802
260      97202
2959     86207
527      84232
480      83026
1196     80200
Name: movieId, dtype: int64


In [6]:

# Rating count per movie as a two-column frame: movieId, rating_count
# (value_counts() orders the rows by descending count)
movie_rating_counts = (
    ratings_df['movieId']
    .value_counts()
    .rename_axis('movieId')
    .reset_index(name='rating_count')
)

# Keep the ten most-rated movies
top_10_most_rated_movies = movie_rating_counts.iloc[:10]

# Attach the movie metadata (including the title) by joining on movieId
top_10_most_rated_movies_with_names = top_10_most_rated_movies.merge(movies_df, on='movieId')

# Print a heading followed by id, title and rating count
print(
    "Top 10 most rated movies:",
    top_10_most_rated_movies_with_names[['movieId', 'title', 'rating_count']],
    sep="\n",
)


Top 10 most rated movies:
   movieId                                              title  rating_count
0      318                   Shawshank Redemption, The (1994)        122296
1      356                                Forrest Gump (1994)        113581
2      296                                Pulp Fiction (1994)        108756
3     2571                                 Matrix, The (1999)        107056
4      593                   Silence of the Lambs, The (1991)        101802
5      260          Star Wars: Episode IV - A New Hope (1977)         97202
6     2959                                  Fight Club (1999)         86207
7      527                            Schindler's List (1993)         84232
8      480                               Jurassic Park (1993)         83026
9     1196  Star Wars: Episode V - The Empire Strikes Back...         80200


In [7]:
# Mean rating per movieId
average_ratings = (
    ratings_df
    .groupby('movieId')['rating']
    .mean()
)

# Order from highest to lowest mean and keep the first ten
top_10_avg_ratings = average_ratings.sort_values(ascending=False).iloc[:10]

# Print a heading followed by the ten highest averages
print("Top 10 movies with the highest average ratings:", top_10_avg_ratings, sep="\n")

Top 10 movies with the highest average ratings:
movieId
267038    5.0
282157    5.0
281624    5.0
251222    5.0
246624    5.0
216789    5.0
200088    5.0
200086    5.0
268808    5.0
200016    5.0
Name: rating, dtype: float64


In [8]:
# Mean rating per movie as a two-column frame: movieId, avg_rating
average_ratings = (
    ratings_df
    .groupby('movieId')['rating']
    .mean()
    .reset_index(name='avg_rating')
)

# Order from highest to lowest mean and keep the first ten
top_10_avg_ratings = average_ratings.sort_values(by='avg_rating', ascending=False).iloc[:10]

# Attach the movie metadata (including the title) by joining on movieId
top_10_avg_ratings_with_names = top_10_avg_ratings.merge(movies_df, on='movieId')

# Print a heading followed by id, title and average rating
print(
    "Top 10 movies with the highest average ratings:",
    top_10_avg_ratings_with_names[['movieId', 'title', 'avg_rating']],
    sep="\n",
)


Top 10 movies with the highest average ratings:
   movieId                                       title  avg_rating
0   267038             Christmas in the Rockies (2021)         5.0
1   282157              A Royal Corgi Christmas (2022)         5.0
2   281624                               Hidden (2020)         5.0
3   251222            İstanbul Beneath My Wings (1996)         5.0
4   246624                      Two for the Win (2021)         5.0
5   216789  Destination: Pluto Beyond the Flyby (2016)         5.0
6   200088                       Little Loopers (2015)         5.0
7   200086                    2BPerfectlyHonest (2004)         5.0
8   268808                     A Ballad of Love (1970)         5.0
9   200016                     The Nagano Tapes (2018)         5.0


In [9]:
# Function to clean movie titles
def clean_title(title):
    """Normalise a movie title (or a search query) for text matching.

    Accented letters are first folded to their ASCII base letter, then every
    character that is not an ASCII letter, a digit or a space is removed.
    Without the folding step an accented letter would be deleted outright
    (e.g. "Amélie" would become "Amlie" instead of "Amelie").

    Args:
        title (str): Raw title or user-typed query.

    Returns:
        str: ``title`` reduced to the characters ``a-z``, ``A-Z``, ``0-9``
        and space. Pure-ASCII input yields the same result as before.
    """
    # NFKD decomposes an accented letter into base letter + combining mark;
    # the substitution below then drops only the mark, not the whole letter.
    title = unicodedata.normalize("NFKD", title)
    title = re.sub("[^a-zA-Z0-9 ]", "", title)
    return title


In [10]:
# Run every title through clean_title and store the result in 'clean_title'
# (this overwrites any existing 'clean_title' column on movies_df)
movies_df["clean_title"] = movies_df["title"].map(clean_title)

In [11]:
# TF-IDF model over the cleaned titles, using single words and word pairs
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
)

# Fit on the cleaned titles and keep the resulting matrix (one row per movie)
tfidf = vectorizer.fit_transform(movies_df.loc[:, "clean_title"])

In [12]:
# Function to search for movies based on input title
def search(title, top_n=5):
    """Return the movies whose cleaned titles best match ``title``.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` using cosine
    similarity.

    Args:
        title (str): Free-text movie title to look up.
        top_n (int): Maximum number of matches to return (default 5).

    Returns:
        pandas.DataFrame: Up to ``top_n`` rows of ``movies_df`` ordered from
        most to least similar, so ``.iloc[0]`` is the best match.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises otherwise.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies_df.iloc[0:0]

    # argpartition only guarantees that the k largest scores occupy the last
    # k slots; their relative order is undefined. Sort that small slice
    # explicitly so the result is ranked by descending similarity.
    candidates = np.argpartition(similarity, -k)[-k:]
    indices = candidates[np.argsort(similarity[candidates])[::-1]]
    results = movies_df.iloc[indices]

    return results

In [13]:
# Search-as-you-type UI: a text box plus an output area for the matches
movie_input = widgets.Text(description='Movie Title:', disabled=False)
movie_list = widgets.Output()


def on_type(data):
    """Refresh the search results when the text box value changes.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry is read as the current text.
    """
    with movie_list:
        movie_list.clear_output()
        query = data["new"]
        # Queries of five characters or fewer are ignored
        if len(query) <= 5:
            return
        display(search(query))


# Call on_type whenever the widget's 'value' trait changes
movie_input.observe(on_type, names='value')

# Render the text box followed by the results area
display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

In [14]:
def find_similar_movies(movie_id):
    """Recommend movies that fans of ``movie_id`` like unusually often.

    A rating counts as a "like" when it is strictly greater than 4.

    Steps:
      1. Collect the users who liked ``movie_id``.
      2. For each movie, compute the share of those users who liked it and
         keep the movies liked by more than 10 % of them ("similar").
      3. For the kept movies, compute the share among all users who liked at
         least one of them ("all").
      4. Score each movie as ``similar / all`` and keep the ten highest.

    Args:
        movie_id: ``movieId`` value of the reference movie.

    Returns:
        pandas.DataFrame: Columns ``score``, ``title`` and ``genres`` for the
        top-scoring movies, sorted by descending score.
    """
    # The same "liked" mask is needed by every filter below
    liked = ratings_df["rating"] > 4

    # Step 1: users who liked the reference movie
    is_reference_movie = ratings_df["movieId"] == movie_id
    similar_users = ratings_df.loc[is_reference_movie & liked, "userId"].unique()

    # Step 2: movies those users liked, as a fraction of those users
    by_similar_user = ratings_df["userId"].isin(similar_users)
    liked_by_similar = ratings_df.loc[by_similar_user & liked, "movieId"]
    similar_user_recs = liked_by_similar.value_counts() / len(similar_users)
    # Keep only movies liked by more than 10 % of the similar users
    similar_user_recs = similar_user_recs[similar_user_recs > .10]

    # Step 3: how popular the kept movies are among everyone who liked any of them
    is_candidate = ratings_df["movieId"].isin(similar_user_recs.index)
    all_users = ratings_df[is_candidate & liked]
    distinct_user_count = len(all_users["userId"].unique())
    all_user_recs = all_users["movieId"].value_counts() / distinct_user_count

    # Step 4: put both fractions side by side; a high similar/all ratio means
    # the movie is liked far more by the similar users than by users in general
    rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

    # Highest score first, then attach title and genres for the top ten
    top_recs = rec_percentages.sort_values("score", ascending=False).head(10)
    with_titles = top_recs.merge(movies_df, left_index=True, right_on="movieId")
    return with_titles[["score", "title", "genres"]]

In [15]:
# Recommendation UI: a text box plus an output area for the recommended movies
movie_name_input = widgets.Text(value='', description='Movie Title:', disabled=False)
recommendation_list = widgets.Output()


def on_type(data):
    """Refresh the recommendations when the text box value changes.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry is read as the current text.
    """
    with recommendation_list:
        recommendation_list.clear_output()
        query = data["new"]
        # Queries of five characters or fewer are ignored
        if len(query) <= 5:
            return
        # The first search result is used as the reference movie
        results = search(query)
        first_match = results.iloc[0]
        movie_id = first_match["movieId"]
        display(find_similar_movies(movie_id))


# Call on_type whenever the widget's 'value' trait changes
movie_name_input.observe(on_type, names='value')

# Render the text box followed by the recommendations area
display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title:')

Output()