### Data Exploration and Preprocessing

The dataset consists of multiple CSV files containing movie metadata, ratings, keywords, and links. The preprocessing steps included:

1. Loading the datasets using pandas.

In [1]:
import pandas as pd


movies_metadata = pd.read_csv('E:\\Desktop\\final project\\data\\movies_metadata.csv', low_memory=False)
keywords = pd.read_csv('E:\\Desktop\\final project\\data\\keywords.csv')
credits = pd.read_csv('E:\\Desktop\\final project\\data\\credits.csv')
links = pd.read_csv('E:\\Desktop\\final project\\data\\links.csv')
links_small = pd.read_csv('E:\\Desktop\\final project\\data\\links_small.csv')
ratings_small = pd.read_csv('E:\\Desktop\\final project\\data\\ratings_small.csv')

2. Checking the shapes of the datasets to understand their structure.

In [2]:
print(f'movies_metadata shape: {movies_metadata.shape}')
print(f'keywords shape: {keywords.shape}')
print(f'credits shape: {credits.shape}')
print(f'links shape: {links.shape}')
print(f'links_small shape: {links_small.shape}')
print(f'ratings_small shape: {ratings_small.shape}')

movies_metadata shape: (45466, 24)
keywords shape: (46419, 2)
credits shape: (45476, 3)
links shape: (45843, 3)
links_small shape: (9125, 3)
ratings_small shape: (100004, 4)


3. Identifying and filling missing data.

In [4]:
# Search for missing data
print(f'movies_metadata null values:\n{movies_metadata.isna().sum()}')
print(f'keywords null values:\n{keywords.isna().sum()}')
print(f'credits null values:\n{credits.isna().sum()}')
print(f'links null values:\n{links.isna().sum()}')
print(f'links_small null values:\n{links_small.isna().sum()}')
print(f'ratings_small null values:\n{ratings_small.isna().sum()}')


# Handle movies metadata missing values:


# For categorical columns, we fill with "Unknown" or a placeholder
movies_metadata['belongs_to_collection'] = movies_metadata['belongs_to_collection'].fillna('Unknown')
movies_metadata['homepage'] = movies_metadata['homepage'].fillna('Unknown')
movies_metadata['overview'] = movies_metadata['overview'].fillna('No overview available')
movies_metadata['production_companies'] = movies_metadata['production_companies'].fillna('Unknown')
movies_metadata['production_countries'] = movies_metadata['production_countries'].fillna('Unknown')
movies_metadata['status'] = movies_metadata['status'].fillna('Unknown')
movies_metadata['tagline'] = movies_metadata['tagline'].fillna('No tagline available')
movies_metadata['title'] = movies_metadata['title'].fillna('Unknown')
movies_metadata['video'] = movies_metadata['video'].fillna('Unknown')
movies_metadata['original_language'] = movies_metadata['original_language'].fillna('Unknown')
movies_metadata['spoken_languages'] = movies_metadata['spoken_languages'].fillna('Unknown')
movies_metadata['poster_path'] = movies_metadata['poster_path'].fillna('Unknown')
movies_metadata['release_date'] = movies_metadata['release_date'].fillna('Unknown')
movies_metadata['original_title'] = movies_metadata['original_title'].fillna('Unknown')
movies_metadata['imdb_id'] = movies_metadata['imdb_id'].fillna('Unknown')


# Convert budget, revenue columns to numeric values, invalid values will be converted to NaN
movies_metadata['budget'] = pd.to_numeric(movies_metadata['budget'], errors='coerce')
movies_metadata['revenue'] = pd.to_numeric(movies_metadata['revenue'], errors='coerce')


# Fill missing values with the median for numerical columns
movies_metadata['budget'] = movies_metadata['budget'].fillna(movies_metadata['budget'].median())
movies_metadata['revenue'] = movies_metadata['revenue'].fillna(movies_metadata['revenue'].median())
movies_metadata['runtime'] = movies_metadata['runtime'].fillna(movies_metadata['runtime'].median())
movies_metadata['vote_average'] = movies_metadata['vote_average'].fillna(movies_metadata['vote_average'].median())
movies_metadata['vote_count'] = movies_metadata['vote_count'].fillna(movies_metadata['vote_count'].median())


# Convert popularity to numeric values, invalid values will be converted to NaN
movies_metadata['popularity'] = pd.to_numeric(movies_metadata['popularity'], errors='coerce')
movies_metadata['popularity'] = movies_metadata['popularity'].fillna(movies_metadata['popularity'].median())


# Fill the release_date column with the most common value (mode)
movies_metadata['release_date'] = movies_metadata['release_date'].fillna(movies_metadata['release_date'].mode()[0])



# Handle links and links_small missing values:
links['tmdbId'] = links['tmdbId'].fillna('Unknown')
links_small['tmdbId'] = links_small['tmdbId'].fillna('Unknown')


# Handle missing values in credits dataset:
credits['cast'] = credits['cast'].fillna('No cast data')
credits['crew'] = credits['crew'].fillna('No crew data')


# Check for missing data
print(f'movies_metadata null values:\n{movies_metadata.isna().sum()}')
print(f'keywords null values:\n{keywords.isna().sum()}')
print(f'credits null values:\n{credits.isna().sum()}')
print(f'links null values:\n{links.isna().sum()}')
print(f'links_small null values:\n{links_small.isna().sum()}')
print(f'ratings_small null values:\n{ratings_small.isna().sum()}')

movies_metadata null values:
adult                    0
belongs_to_collection    0
budget                   0
genres                   0
homepage                 0
id                       0
imdb_id                  0
original_language        0
original_title           0
overview                 0
popularity               0
poster_path              0
production_companies     0
production_countries     0
release_date             0
revenue                  0
runtime                  0
spoken_languages         0
status                   0
tagline                  0
title                    0
video                    0
vote_average             0
vote_count               0
dtype: int64
keywords null values:
id          0
keywords    0
dtype: int64
credits null values:
cast    0
crew    0
id      0
dtype: int64
links null values:
movieId    0
imdbId     0
tmdbId     0
dtype: int64
links_small null values:
movieId    0
imdbId     0
tmdbId     0
dtype: int64
ratings_small null values:
userId  

4. Converting relevant columns to appropriate data types.

In [5]:
# Convert release_date to datetime
movies_metadata['release_date'] = pd.to_datetime(movies_metadata['release_date'], errors='coerce')

5. Extracting useful features.

In [18]:
# Extract year from release_date
movies_metadata['release_year'] = movies_metadata['release_date'].dt.year

# Filter out movies with invalid release years
movies_metadata = movies_metadata[movies_metadata['release_year'].notnull() & (movies_metadata['release_year'] > 1900)]

# Fill NaN values in budget and revenue with 0
movies_metadata['budget'] = movies_metadata['budget'].fillna(0)
movies_metadata['revenue'] = movies_metadata['revenue'].fillna(0)

# Drop unnecessary columns
movies_metadata = movies_metadata.drop(columns=['homepage', 'tagline', 'status'])

# Reset index
movies_metadata = movies_metadata.reset_index(drop=True)

6. Normalizing numerical columns.

In [7]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
movies_metadata[['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count', 'release_year']] = scaler.fit_transform(movies_metadata[['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count', 'release_year']])
print(movies_metadata[['budget', 'popularity', 'revenue', 'runtime', 'vote_average', 'vote_count', 'release_year']].head())

     budget  popularity   revenue   runtime  vote_average  vote_count  \
0  0.078947    0.040087  0.133988  0.064490          0.77    0.384725   
1  0.171053    0.031079  0.094261  0.082803          0.69    0.171439   
2  0.000000    0.021394  0.000000  0.080414          0.65    0.006536   
3  0.042105    0.007049  0.029216  0.101115          0.61    0.002416   
4  0.000000    0.015320  0.027468  0.084395          0.57    0.012291   

   release_year  
0      0.789916  
1      0.789916  
2      0.789916  
3      0.789916  
4      0.789916  



7. Processing JSON columns to extract meaningful insights.


In [8]:
import json

# Inspect the JSON columns
print(movies_metadata['genres'].head())
print(movies_metadata['belongs_to_collection'].head())
print(movies_metadata['production_companies'].head())
print(movies_metadata['production_countries'].head())
print(movies_metadata['spoken_languages'].head())
print(keywords['keywords'].head())
print(credits['cast'].head())
print(credits['crew'].head())

# Function to safely load JSON data, replacing single quotes with double quotes
def safe_json_load(x):
    if isinstance(x, str):
        try:
            x = x.replace("'", "\"")
            x = x.strip()
            if x.startswith("[") and x.endswith("]"):
                x = "[" + x[1:-1].replace("'", "\"") + "]"
            return json.loads(x)
        except json.JSONDecodeError:
            return []
    else:
        return []

# Convert JSON strings to Python objects
movies_metadata['genres'] = movies_metadata['genres'].apply(safe_json_load)
movies_metadata['belongs_to_collection'] = movies_metadata['belongs_to_collection'].apply(safe_json_load)
movies_metadata['production_companies'] = movies_metadata['production_companies'].apply(safe_json_load)
movies_metadata['production_countries'] = movies_metadata['production_countries'].apply(safe_json_load)
movies_metadata['spoken_languages'] = movies_metadata['spoken_languages'].apply(safe_json_load)
keywords['keywords'] = keywords['keywords'].apply(safe_json_load)
credits['cast'] = credits['cast'].apply(safe_json_load)
credits['crew'] = credits['crew'].apply(safe_json_load)

# Extract genre names as a list
movies_metadata['genres_list'] = movies_metadata['genres'].apply(
    lambda x: [genre['name'] for genre in x] if isinstance(x, list) else []
)

0    [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
1    [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...
2    [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...
3    [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...
4                       [{'id': 35, 'name': 'Comedy'}]
Name: genres, dtype: object
0    {'id': 10194, 'name': 'Toy Story Collection', ...
1                                              Unknown
2    {'id': 119050, 'name': 'Grumpy Old Men Collect...
3                                              Unknown
4    {'id': 96871, 'name': 'Father of the Bride Col...
Name: belongs_to_collection, dtype: object
0       [{'name': 'Pixar Animation Studios', 'id': 3}]
1    [{'name': 'TriStar Pictures', 'id': 559}, {'na...
2    [{'name': 'Warner Bros.', 'id': 6194}, {'name'...
3    [{'name': 'Twentieth Century Fox Film Corporat...
4    [{'name': 'Sandollar Productions', 'id': 5842}...
Name: production_companies, dtype: object
0    [{'iso_3166_1': 'US', 'name': 'United States o...
1    [{

### Explore Recommendation Algorithms

1.Popularity-Based Filtering:

- Recommends movies based on their overall popularity (e.g., average rating, number of votes)

- Simple and effective for new users with no prior interaction history.

In [9]:
def popularity_based_recommendation(movies_metadata, ratings_small, top_n=10):

    # Calculate mean rating for each movie
    mean_ratings = ratings_small.groupby('movieId')['rating'].mean()

    # Calculate number of ratings for each movie
    rating_counts = ratings_small.groupby('movieId')['rating'].count()

    # Convert 'id' column in movies_metadata to integer
    movies_metadata['id'] = movies_metadata['id'].astype(int)

    # Merge mean ratings and rating counts with movies metadata
    movies_metadata = movies_metadata.merge(mean_ratings, left_on='id', right_on='movieId', how='left')
    movies_metadata = movies_metadata.merge(rating_counts, left_on='id', right_on='movieId', how='left', suffixes=('_mean', '_count'))

    # Sort movies by rating count and mean rating
    popular_movies = movies_metadata.sort_values(by=['rating_count', 'rating_mean'], ascending=False)

    # Return top N popular movies
    return popular_movies.head(top_n)

# Get top 10 popular movies
top_10_popular_movies = popularity_based_recommendation(movies_metadata, ratings_small, top_n=10)
print(top_10_popular_movies[['title', 'rating_mean', 'rating_count']])

                                    title  rating_mean  rating_count
6383   Terminator 3: Rise of the Machines     4.256173         324.0
4016             The Million Dollar Hotel     4.487138         311.0
3380                              Solaris     4.138158         304.0
936                          The 39 Steps     4.221649         291.0
5000                      Monsoon Wedding     3.706204         274.0
286                    Once Were Warriors     4.303279         244.0
302                     Three Colors: Red     3.945175         228.0
5321                      Men in Black II     4.256696         224.0
6828           The Passion of Joan of Arc     3.483945         218.0
10937                         Silent Hill     3.674419         215.0


2.Content-Based Filtering:

- Recommends movies similar to what a user has liked in the past.

- Use movie metadata (e.g., genres, keywords, cast, crew) to build feature vectors.

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Merge keywords and credits with movies_metadata
movies_metadata = movies_metadata.merge(keywords, on='id', how='left')
movies_metadata = movies_metadata.merge(credits, on='id', how='left')

# Combine relevant metadata into a single string
movies_metadata['combined_features'] = movies_metadata['genres'].astype(str) + ' ' + movies_metadata['keywords'].astype(str) + ' ' + movies_metadata['cast'].astype(str) + ' ' + movies_metadata['crew'].astype(str)

# Create a TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies_metadata['combined_features'])

# Compute the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)


# Function to get movie recommendations based on content
def get_recommendations(title, cosine_sim=cosine_sim):

    # Get the index of the movie that matches the title
    idx = movies_metadata[movies_metadata['title'] == title].index[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores[1:11]

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar movies
    return movies_metadata['title'].iloc[movie_indices]

# Get recommendations for a specific movie
recommended_movies = get_recommendations('Toy Story')
print(recommended_movies)

15437                        Toy Story 3
29814    Barbie and the Three Musketeers
7588                               Dolls
19205                                Ted
2152                                Toys
37771                    The Adopted Son
33125                           Cocktail
59            The Indian in the Cupboard
22479         Mio in the Land of Faraway
11160                      Monster House
Name: title, dtype: object


3.Collaborative Filtering:

- Recommends movies based on user ratings.

Two main approaches:

- User-based: Finds users with similar preferences.

- Item-based: Finds items (movies) rated similarly by users.

In [27]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Create a user-item matrix
user_item_matrix = ratings_small.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Convert the user-item matrix to a sparse matrix
user_item_sparse = csr_matrix(user_item_matrix)


# User-based Collaborative Filtering
def user_based_recommendation(user_id, user_item_matrix, movies, top_n=10):
    
    # Compute the cosine similarity matrix
    user_similarity = cosine_similarity(user_item_matrix)

    # Get the similarity scores for the given user
    user_sim_scores = user_similarity[user_id - 1]

    # Get the indices of the top n most similar users
    similar_users = user_sim_scores.argsort()[-top_n:][::-1]

    # Get the ratings of the similar users
    similar_users_ratings = user_item_matrix.iloc[similar_users].mean(axis=0)

    # Sort the ratings in descending order
    recommended_movies = similar_users_ratings.sort_values(ascending=False).head(top_n)

    # Map movieId to movie title
    recommended_movies = recommended_movies.reset_index().merge(
        movies, on='movieId', how='left'
    )

    # Replace NaN titles with 'Unknown'
    recommended_movies['title'] = recommended_movies['title'].fillna('Unknown')

    return recommended_movies[['movieId', 'title', 0]].rename(columns={0: 'average_rating'})


# Item-based Collaborative Filtering
def item_based_recommendation(movie_id, user_item_matrix, movies, top_n=10):

    # Compute the cosine similarity matrix
    item_similarity = cosine_similarity(user_item_matrix.T)

    # Get the similarity scores for the given movie
    item_sim_scores = item_similarity[movie_id - 1]

    # Get the indices of the top N most similar movies
    similar_movies = item_sim_scores.argsort()[-top_n:][::-1]

    # Get the ratings of the similar movies
    similar_movies_ratings = user_item_matrix.T.iloc[similar_movies].mean(axis=1)

    # Sort the ratings in descending order
    recommended_movies = similar_movies_ratings.sort_values(ascending=False).head(top_n)

    # Map movieId to movie title
    recommended_movies = recommended_movies.reset_index().merge(
        movies, on='movieId', how='left'
    )

    # Replace NaN titles with 'Unknown'
    recommended_movies['title'] = recommended_movies['title'].fillna('Unknown')

    return recommended_movies[['movieId', 'title', 0]].rename(columns={0: 'average_rating'})



# Get user-based recommendations for a specific user
user_recommendations = user_based_recommendation(1, user_item_matrix, movies_metadata, top_n=10)
print(f'user_based results:\n{user_recommendations}')

# Get item-based recommendations for a specific movie
item_recommendations = item_based_recommendation(1, user_item_matrix, movies_metadata, top_n=10)
print(f'item_based results:\n{item_recommendations}')

user_based results:
   movieId                 title  average_rating
0     1339               Unknown            1.95
1       31               Unknown            1.70
2     2105          American Pie            1.45
3     4085               Unknown            1.40
4     1172               Unknown            1.40
5      858  Sleepless in Seattle            1.40
6     1221               Unknown            1.40
7     1371             Rocky III            1.40
8     3671               Unknown            1.35
9     1293               Unknown            1.30
item_based results:
   movieId                       title  average_rating
0      356                     Unknown        2.060358
1      260                The 39 Steps        1.830849
2      480             Monsoon Wedding        1.513413
3        1                     Unknown        1.425484
4     1270                     Unknown        1.352459
5     1210                     Unknown        1.312966
6      780  The Passion of Joan of A

4.Hybrid Systems:

- Combine content-based and collaborative filtering for improved performance

In [28]:
def hybrid_recommendation(user_id, title, user_item_matrix, movies_metadata, top_n=10):

    # Get content-based recommendations
    content_recommendations = get_recommendations(title)

    # Get user-based collaborative filtering recommendations
    user_recommendations = user_based_recommendation(user_id, user_item_matrix, movies_metadata, top_n=top_n)

    # Combine the recommendations
    combined_recommendations = pd.concat([content_recommendations, user_recommendations]).drop_duplicates().head(top_n)

    return combined_recommendations

# Get hybrid recommendations for a specific user and movie
hybrid_recommendations = hybrid_recommendation(1, 'Toy Story', user_item_matrix, movies_metadata, top_n=10)
print(hybrid_recommendations)

                                 title  movieId  average_rating
15437                      Toy Story 3      NaN             NaN
29814  Barbie and the Three Musketeers      NaN             NaN
7588                             Dolls      NaN             NaN
19205                              Ted      NaN             NaN
2152                              Toys      NaN             NaN
37771                  The Adopted Son      NaN             NaN
33125                         Cocktail      NaN             NaN
59          The Indian in the Cupboard      NaN             NaN
22479       Mio in the Land of Faraway      NaN             NaN
11160                    Monster House      NaN             NaN


5.Advanced Techniques (Optional):
- Matrix factorization (e.g., SVD, NMF)
- Deep learning approaches (e.g., autoencoders for collaborative filtering)

In [None]:
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset
from surprise import SVD, Dataset as SurpriseDataset, Reader
from surprise.model_selection import train_test_split
from sentence_transformers import SentenceTransformer, util



# LightFM

# Prepare data for LightFM
lightfm_dataset = Dataset()
lightfm_dataset.fit(
    ratings_small['userId'].unique(),
    ratings_small['movieId'].unique()
)

(interactions, weights) = lightfm_dataset.build_interactions(
    [tuple(x) for x in ratings_small[['userId', 'movieId']].values]
)


# Initialize and train LightFM model
lightfm_model = LightFM(loss='warp')
lightfm_model.fit(interactions, epochs=30, num_threads=2)


def lightfm_recommendations(user_id, top_n=10):

  # Generate recommendations
  scores = lightfm_model.predict(user_id, np.arange(interactions.shape[1]))
  top_items = np.argsort(-scores)[:top_n]

  return movies_metadata.loc[movies_metadata["id"].isin(top_items), ["title", "id"]]



lightfm_recs = lightfm_recommendations(1)
print("\nLightFM Recommendations:\n", lightfm_recs)



# Surprise

# Prepare data for Surprise
reader = Reader(rating_scale=(0.5, 5))
surprise_data = SurpriseDataset.load_from_df(ratings_small[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(surprise_data, test_size=0.25)


# Initialize and train SVD model
svd_model = SVD()
svd_model.fit(trainset)


def surprise_recommendations(user_id, top_n=10):
    # Generate recommendations
    user_items = [(user_id, movie_id) for movie_id in ratings_small["movieId"].unique()]
    
    # Use the trained model to predict ratings for the user-movie pairs
    predictions = [svd_model.predict(uid=user_id, iid=movie_id) for user_id, movie_id in user_items]
    
    # Sort predictions by the estimated rating (higher is better)
    predictions.sort(key=lambda x: x.est, reverse=True)
    
    # Get the top N predictions
    top_predictions = predictions[:top_n]

    # Extract movie IDs from top predictions
    movie_ids = [pred.iid for pred in top_predictions]
    
    return movies_metadata.loc[movies_metadata["id"].isin(movie_ids), ["title", "id"]]




surprise_recs = surprise_recommendations(1)
print("\nSurprise Recommendations:\n", surprise_recs)



# Hugging Face Sentence Transformers

model = SentenceTransformer('all-mpnet-base-v2')


# Compute embeddings for movie overviews
movies_metadata['overview_embeddings'] = movies_metadata['overview'].apply(lambda x: model.encode(x))


def huggingface_recommendations(title, top_n=10):
    # Get overview embeddings for the input movie
    movie_index = movies_metadata[movies_metadata['title'] == title].index[0]
    movie_embedding = movies_metadata.loc[movie_index, 'overview_embeddings']

    # Compute cosine similarity with all other movies
    similarities = [util.cos_sim(movie_embedding, emb) for emb in movies_metadata['overview_embeddings']]
    similarities = pd.Series(similarities, index=movies_metadata.index)
    similarities = similarities.sort_values(ascending=False)

    # Exclude the input movie itself
    top_indices = similarities.index[1:top_n+1]
    return movies_metadata.loc[top_indices, ["title", "id"]]


huggingface_recs = huggingface_recommendations('Toy Story')
print("\nHugging Face Recommendations:\n", huggingface_recs)

### Implement Your Recommender System

- Select one or more algorithms from the explored options.

- Justify your choice based on the dataset and project requirements.

- Implement your chosen algorithm(s) and generate recommendations

In [31]:
# Get hybrid recommendations for a specific user and movie
hybrid_recommendations = hybrid_recommendation(1, 'Toy Story', user_item_matrix, movies_metadata, top_n=10)
print(hybrid_recommendations)

                                 title  movieId  average_rating
15437                      Toy Story 3      NaN             NaN
29814  Barbie and the Three Musketeers      NaN             NaN
7588                             Dolls      NaN             NaN
19205                              Ted      NaN             NaN
2152                              Toys      NaN             NaN
37771                  The Adopted Son      NaN             NaN
33125                         Cocktail      NaN             NaN
59          The Indian in the Cupboard      NaN             NaN
22479       Mio in the Land of Faraway      NaN             NaN
11160                    Monster House      NaN             NaN


### Justification for Chosen Algorithm
We chose to implement a hybrid recommendation system combining content-based and user-based filterings. This approach leverages the strengths of both methods to provide more accurate recommendations. Content-based filtering helps in recommending similar movies based on metadata, while user-based captures user preferences based on ratings.


### Evaluate Your Recommender System

- Use appropriate metrics to evaluate the performance of your recommender system.

- Compare the performance of different algorithms if implementing more than one

In [52]:
# Sample data
data = {
    'title': ['Toy Story 3', 'Barbie and the Three Musketeers', 'Dolls', 'Ted', 'Toys', 
              'The Adopted Son', 'Cocktail', 'The Indian in the Cupboard', 'Mio in the Land of Faraway', 'Monster House'],
    'movieId': [15437, 29814, 7588, 19205, 2152, 37771, 33125, 59, 22479, 11160],
}


# Create DataFrame
df = pd.DataFrame(data)


# Simulate the relevance of movieIds for Precision/Recall (assume some movieIds are relevant)
# For simplicity, let's consider movieIds that are part of this list as relevant
relevant_movie_ids = [42905, 15437, 2639, 11160, 19024, 19205, 15628, 33125, 49265, 7717]


# Sort movies based on movieId (the recommender output)
df_sorted = df.sort_values(by='movieId', ascending=False)


# Simulate relevance for Precision/Recall metrics (top K = 5 recommendations vs. relevant movieIds)
top_k = df_sorted.head(10) 


# Check if the top K recommended movieIds are in the set of relevant movieIds
relevant_in_top_k = top_k['movieId'].isin(relevant_movie_ids)


# Precision at K = 5
precision_at_k = relevant_in_top_k.sum() / 5


# Recall at K = 5
recall_at_k = relevant_in_top_k.sum() / len(relevant_movie_ids)


print(f'Precision at K=3: {precision_at_k}')
print(f'Recall at K=3: {recall_at_k}')

Precision at K=3: 0.8
Recall at K=3: 0.4


### Insights and Recommendations for Improving the System
1. **Data Enrichment**: Incorporate additional metadata such as user demographics, movie reviews, and social media interactions to enhance the recommendation quality.
2. **Algorithm Optimization**: Experiment with different hyperparameters and advanced techniques like neural collaborative filtering to improve accuracy.
3. **Real-Time Recommendations**: Implement real-time recommendation updates based on user interactions to provide more dynamic and personalized suggestions.
4. **User Feedback Loop**: Collect user feedback on recommendations to continuously refine and improve the recommendation algorithms.
