In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# -*- coding: utf-8 -*-
"""exploration_and_model_sprint2.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1aOgMcZaD80Q5irXoTDpZceZdUKv_PbwA
"""

!pip install https://github.com/maciejkula/spotlight/archive/master.zip

# Import necessary libraries
import os
import h5py
import pandas as pd
from spotlight.datasets import _transport
from spotlight.interactions import Interactions
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pickle

# Constants and helper functions
VARIANTS = ('100K', '1M', '10M', '20M')
URL_PREFIX = ('https://github.com/maciejkula/recommender_datasets/releases/download')
VERSION = 'v0.2.0'

def _get_movielens(dataset):
    extension = '.hdf5'
    path = _transport.get_data('/'.join((URL_PREFIX, VERSION, dataset + extension)),
                               os.path.join('movielens', VERSION),
                               'movielens_{}{}'.format(dataset, extension))
    with h5py.File(path, 'r') as data:
        return (data['/user_id'][:], data['/item_id'][:], data['/rating'][:], data['/timestamp'][:])

def get_movielens_dataset(variant='100K'):
    if variant not in VARIANTS:
        raise ValueError(f'Variant must be one of {VARIANTS}, got {variant}.')
    url = 'movielens_{}'.format(variant)
    return Interactions(*_get_movielens(url))

# Load Movielens dataset
dataset = get_movielens_dataset(variant='100K')
print(dataset.__dict__)

# Convert dataset to pandas DataFrame for easier manipulation
df = pd.DataFrame({
    'user_id': dataset.user_ids,
    'item_id': dataset.item_ids,
    'rating': dataset.ratings,
    'timestamp': dataset.timestamps
})

# Loading the Movies dataset with genres
moviesDF = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Agile/movies.csv', sep=",")
moviesDF.movieId = moviesDF.movieId.astype(int)
moviesDF.set_index("movieId", inplace=True)

# Step 1: Preprocess movie data
ratingCounts = df.groupby("item_id")["user_id"].count()
moviesDF["RatingCount"] = ratingCounts
moviesDF["year"] = moviesDF.title.str.extract(r'\(([0-9]{4})\)', expand=False)
moviesDF["year"] = moviesDF.year.astype(float)
moviesDF.fillna(0, inplace=True)

# Step 2: Genre selection
def filter_movies_by_genre(genre):
    """
    Filters movies based on the selected genre and generates recommendations based on rating counts.

    :param genre: Genre to filter movies by (e.g., 'Action', 'Comedy', 'Drama')
    :return: A dataframe of top recommended movies from the selected genre
    """
    genre_filtered = moviesDF[moviesDF['genres'].str.contains(genre, case=False, na=False)]
    genre_filtered = genre_filtered[genre_filtered["RatingCount"] >= 10]
    recommended_movies = genre_filtered.sort_values(["RatingCount", "year"], ascending=[False, True])
    return recommended_movies[['title', 'RatingCount', 'year']].head(100)

# Display available genres
print("Available genres:")
available_genres = set('|'.join(moviesDF['genres'].dropna()).split('|'))
for idx, genre in enumerate(available_genres, 1):
    print(f"{idx}. {genre}")

# Ask the user to select a genre
selected_genre_index = int(input("\nSelect a genre by number: ")) - 1
selected_genre = list(available_genres)[selected_genre_index]

print(f"\nYou selected: {selected_genre}\n")

# Get filtered movies by selected genre
recommended_movies = filter_movies_by_genre(selected_genre)
print("\nTop 100 Recommended Movies in the selected genre:")
print(recommended_movies)

# Step 3: Generate movie recommendations using KNN (considering user ratings)
user_movie_matrix = df.pivot_table(index='user_id', columns='item_id', values='rating', fill_value=0)

# Example of new user ratings
new_user_ratings = {
    242: 5,  # Movie ID 242 rated 5
    302: 4,  # Movie ID 302 rated 4
    377: 3,  # Movie ID 377 rated 3
    346: 2,  # Movie ID 346 rated 2
    1090: 1, # Movie ID 1090 rated 1
    51: 5,   # Movie ID 51 rated 5
    225: 4,  # Movie ID 225 rated 4
    203: 3,  # Movie ID 203 rated 3
    476: 2,  # Movie ID 476 rated 2
    204: 1   # Movie ID 204 rated 1
}

# Create the user vector for KNN (movie ratings for the new user)
new_user_vector = np.zeros(user_movie_matrix.shape[1])
for movie_id, rating in new_user_ratings.items():
    if movie_id in user_movie_matrix.columns:
        new_user_vector[user_movie_matrix.columns.get_loc(movie_id)] = rating

# Normalize the user-item matrix (row normalization)
user_movie_matrix_normalized = normalize(user_movie_matrix, axis=1)

# Fit KNN model to the normalized matrix
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_movie_matrix_normalized)

# Normalize the new user's ratings vector
new_user_vector_normalized = normalize([new_user_vector], axis=1)

# Find the closest users to the new user
distances, indices = knn_model.kneighbors(new_user_vector_normalized, n_neighbors=3)

closest_users_ids = user_movie_matrix.index[indices[0]]

# Step 4: Get top-rated movies from closest users (but only from the selected genre)
top_rated_movies = (
    user_movie_matrix.loc[closest_users_ids]
    .mean(axis=0)
    .sort_values(ascending=False)
)

# Filter top-rated movies by the selected genre
top_rated_movies_with_individuals = user_movie_matrix.loc[closest_users_ids].T.loc[top_rated_movies.index]
top_rated_movies_with_individuals["Mean Rating"] = top_rated_movies

# Step 5: Exclude movies that the new user has already rated
recommended_movies = []
i = 0
already_seen_movies = []
while len(recommended_movies) < 10 and i < len(top_rated_movies):
    movie_id = top_rated_movies.index[i]
    if movie_id not in new_user_ratings:
        if movie_id in recommended_movies:
            i += 1
            continue
        recommended_movies.append(movie_id)
    else:
        already_seen_movies.append(movie_id)
    i += 1

updated_recommendations = top_rated_movies_with_individuals.loc[recommended_movies]
print(f"Closest user IDs: {list(closest_users_ids)}")
print(f"\nNew user has already seen the following movies: {already_seen_movies}")
print("\nUpdated top recommended movies (excluding those already rated by the new user):")
print(updated_recommendations)

# Step 6: Save the user-movie matrix for future use
with open("user_movie_matrix.pkl", "wb") as f:
    pickle.dump(user_movie_matrix, f)


Collecting https://github.com/maciejkula/spotlight/archive/master.zip
  Using cached https://github.com/maciejkula/spotlight/archive/master.zip
  Preparing metadata (setup.py) ... [?25l[?25hdone
{'num_users': 944, 'num_items': 1683, 'user_ids': array([196, 186,  22, ..., 276,  13,  12], dtype=int32), 'item_ids': array([ 242,  302,  377, ..., 1090,  225,  203], dtype=int32), 'ratings': array([3., 3., 1., ..., 1., 2., 3.], dtype=float32), 'timestamps': array([881250949, 891717742, 878887116, ..., 874795795, 882399156,
       879959583], dtype=int32), 'weights': None}
Available genres:
1. Fantasy
2. IMAX
3. Western
4. Children
5. Romance
6. Drama
7. Mystery
8. Action
9. War
10. Crime
11. Sci-Fi
12. (no genres listed)
13. Musical
14. Film-Noir
15. Horror
16. Animation
17. Thriller
18. Documentary
19. Adventure
20. Comedy

Select a genre by number: 6

You selected: Drama


Top 100 Recommended Movies in the selected genre:
                                         title  RatingCount    year