## Q2(a)

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
ratings_data = pd.read_csv(r'C:\Users\ABC\Desktop\BAI\BAI-S6\RS\Assignments\ml-latest-small\ratings.csv')
movies_data = pd.read_csv(r'C:\Users\ABC\Desktop\BAI\BAI-S6\RS\Assignments\ml-latest-small\movies.csv')

In [3]:
# Computing mean ratings for each user
user_ratings_mean = np.nanmean(ratings_data.pivot(index='userId', columns='movieId', values='rating'), axis=1)

In [4]:
# Centering the ratings matrix
ratings_data_centered = ratings_data.pivot(index='userId', columns='movieId', values='rating') - user_ratings_mean[:, np.newaxis]

In [5]:
# Computing cosine similarity
user_similarity_matrix = cosine_similarity(ratings_data_centered.fillna(0))

In [6]:
def get_similar_users(user_id, k=5):
    user_similarity = user_similarity_matrix[user_id]
    similar_user_indices = user_similarity.argsort()[::-1][1:k+1] # Sort: descending. returning indices of top 5
    return similar_user_indices

In [7]:
def predict_rating(user_id, movie_id, k=5):
    similar_user_indices = get_similar_users(user_id, k=k)
    similar_user_ratings = ratings_data_centered.loc[similar_user_indices, movie_id].dropna()
    if len(similar_user_ratings) == 0:
        return user_ratings_mean[user_id]
    else:
        similar_user_mean_rating = np.nanmean(similar_user_ratings)
        user_rating = ratings_data_centered.loc[user_id, movie_id]
        if np.isnan(user_rating):
            predicted_rating = similar_user_mean_rating + user_ratings_mean[user_id]
        else:
            predicted_rating = similar_user_mean_rating + user_rating
        # Clip the predicted rating to the range of 0 to 5 so that the rating does not go out of bounds
        predicted_rating = max(0, min(predicted_rating, 5))
        return predicted_rating

In [8]:
def get_top_recommendations(user_id, N=5, k=5):
    user_rated_movies = ratings_data.loc[ratings_data['userId'] == user_id, 'movieId']
    unrated_movies = set(ratings_data['movieId']) - set(user_rated_movies)
    predicted_ratings = [(movie_id, predict_rating(user_id, movie_id, k=k)) for movie_id in unrated_movies]
    predicted_ratings.sort(key=lambda x: x[1], reverse=True)
    top_N_recommendations = predicted_ratings[:N]
    top_N_recommendation_titles = [movies_data.loc[movies_data['movieId'] == movie_id, 'title'].iloc[0] for movie_id, rating in top_N_recommendations]
    return list(zip(top_N_recommendation_titles, [rating for movie_id, rating in top_N_recommendations]))

In [9]:
# Test the recommender system (done, now doing it properly by prompting user to input movie name)
# user_id = 1
# movie_name = "Toy Story (1995)"
user_id = int(input("Enter user ID: "))
movie_name = input("Enter the name of the movie: ")
movie_id = movies_data.loc[movies_data['title'] == movie_name, 'movieId'].iloc[0]
recommended_movies = get_top_recommendations(user_id, N=5, k=5)

print(f"Top 5 movie recommendations for user {user_id} based on {movie_name}:")
for i, (title, rating) in enumerate(recommended_movies):
    print(f"{i+1}. {title} (predicted rating: {rating:.2f})")

Enter user ID: 1
Enter the name of the movie: Toy Story (1995)
Top 5 movie recommendations for user 1 based on Toy Story (1995):
1. Happy Gilmore (1996) (predicted rating: 5.00)
2. Apollo 13 (1995) (predicted rating: 5.00)
3. Hackers (1995) (predicted rating: 5.00)
4. Mallrats (1995) (predicted rating: 5.00)
5. Hoop Dreams (1994) (predicted rating: 5.00)


## Q2(b)

In [10]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ABC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
links_df = pd.read_csv(r'C:\Users\ABC\Desktop\BAI\BAI-S6\RS\Assignments\ml-latest-small\links.csv')
tags_df = pd.read_csv(r'C:\Users\ABC\Desktop\BAI\BAI-S6\RS\Assignments\ml-latest-small\tags.csv')
movies_df = pd.read_csv(r'C:\Users\ABC\Desktop\BAI\BAI-S6\RS\Assignments\ml-latest-small\movies.csv')

In [13]:
movie_data = pd.merge(movies_df, tags_df, on='movieId')
movie_data.drop(['userId', 'timestamp'], axis=1, inplace=True)
movie_data.dropna(inplace=True)

In [14]:
stemmer = PorterStemmer()

def tokenize(text):
    tokens = word_tokenize(text)
    stems = [stemmer.stem(token) for token in tokens]
    return stems

In [15]:
genres_vectorizer = CountVectorizer(tokenizer=tokenize, token_pattern=None)
genres_bow = genres_vectorizer.fit_transform(movie_data['genres'])

In [16]:
cosine_sim = cosine_similarity(genres_bow) # bow = bag of words

In [17]:
def get_similar_movies(movie_title):
    movie_idx = movies_df[movies_df['title'] == movie_title].index[0]
    similar_movies = list(enumerate(cosine_sim[movie_idx]))
    similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)
    top_similar_movies = similar_movies[1:4]
    similar_movie_indices = [i[0] for i in top_similar_movies]
    return movies_df.iloc[similar_movie_indices]['title']

In [18]:
movie_name = input("Enter the name of the movie: ")
similar_movies = get_similar_movies(movie_name)

print("Recommended movies based on genres:")
for movie in similar_movies:
    print(movie)

Enter the name of the movie: Waiting to Exhale (1995)
Recommended movies based on genres:
Father of the Bride Part II (1995)
Heat (1995)
Sabrina (1995)


### Manahil Fatima Anwar
### 20K-0134
### BAI-6A