In [207]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process
import seaborn as sns
import matplotlib.pyplot as plt
import plotly_express as px
import re

- Some of the code in this file have been explained in the Explorative analyis file

- I will will explain where i a new piece of code and how i came to any conclusions

In [208]:
movies, ratings = pd.read_csv('../data/movies.csv'), pd.read_csv('../data/ratings.csv')

movies_df = movies.copy()
ratings_df = ratings.copy()

In [209]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [210]:
movies['year']  = movies['title'].str.extract(r'\((\d{4})\)')

movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [211]:
genres_df = movies['genres'].str.get_dummies('|')

# calculate the most common genres for each movie
movies['most_common_genre'] = genres_df.apply(lambda x: x.idxmax(), axis=1)

# show the resulting DataFrame
movies.head()

Unnamed: 0,movieId,title,genres,year,most_common_genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,Adventure
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,Adventure
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,Comedy
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,Comedy
4,5,Father of the Bride Part II (1995),Comedy,1995,Comedy


In [212]:
movies = movies.dropna(subset=['year'], how='any')

In [213]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,27753440.0,27753440.0,27753440.0,27753440.0
mean,141942.0,18488.0,3.530445,1193122000.0
std,81707.4,35102.63,1.066353,216048200.0
min,1.0,1.0,0.5,789652000.0
25%,71176.0,1097.0,3.0,998605300.0
50%,142022.0,2716.0,3.5,1174256000.0
75%,212459.0,7150.0,4.0,1422744000.0
max,283228.0,193886.0,5.0,1537945000.0


In [214]:
movies.describe()

Unnamed: 0,movieId
count,57771.0
mean,111636.410673
std,59889.476538
min,1.0
25%,72067.5
50%,126028.0
75%,161239.0
max,193886.0


In [217]:
movies['year'] = movies['year'].astype(int)

In [224]:
movies.loc[:, 'title_no_year'] = movies['title'].apply(lambda x: x.split("(")[0].rstrip())

---

## 1.3) Recommender system

- For ease, I have divided my code into small functions

In [231]:
movies

Unnamed: 0,movieId,title,genres,year,most_common_genre,title_no_year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,Adventure,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,Adventure,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,Comedy,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,Comedy,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,1995,Comedy,Father of the Bride Part II
...,...,...,...,...,...,...
58093,193876,The Great Glinka (1946),(no genres listed),1946,(no genres listed),The Great Glinka
58094,193878,Les tribulations d'une caissière (2011),Comedy,2011,Comedy,Les tribulations d'une caissière
58095,193880,Her Name Was Mumu (2016),Drama,2016,Drama,Her Name Was Mumu
58096,193882,Flora (2017),Adventure|Drama|Horror|Sci-Fi,2017,Adventure,Flora


In [233]:
# process.extractOne is a fuzzywuzzy function that returns that matches my inputed string or movie title,
# to the movie titles in my dataset
# It also returns how accurate the matching is like in the print below, after title, the 100 means its a perfect match

def get_movie(word):
    movie = process.extractOne(word, movies['title'])
    return [movie[0], movie[1]]

movie = get_movie('Lord of the Rings: The Fellowship of the Ring, The (2001)')
movie

['Lord of the Rings: The Fellowship of the Ring, The (2001)', 100]

In [234]:
movies[movies['title'] == 'Lord of the Rings: The Fellowship of the Ring, The (2001)']

Unnamed: 0,movieId,title,genres,year,most_common_genre,title_no_year
4898,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy,2001,Adventure,"Lord of the Rings: The Fellowship of the Ring,..."


In [235]:
#Return the movie Id of the movie I am looking for recommendations for from the movies dataset

def get_movie_Id():
    cleaned = movies[movies['title'] == movie[0]]
    return cleaned['movieId'].values[0]
    
get_movie_Id()

4993

In [236]:
#Return the release year of the movie I am looking for recommendations for from the movies dataset

def get_movie_year():
    cleaned = movies[movies['title'] == movie[0]]
    return cleaned['year'].values[0]

year = get_movie_year()
year-5

1996

In [239]:
# Clean dataset a smaller and more manageable dataset
# Recommendations for movies dont need the entire movieset
# The assumption is, a person looking for recommendations for a comedy movy will most likely want a comedy related movies returned

def process_movies():
    movie_title = movie[0]
    movie_rows = movies[movies['title'] == movie_title]
    categories = []
    unique_genres = []
    
    # Split the strings in the genres column by splitting each of strings for eg. Adventure|Fantasy so as to return a list 
    # that looks like this ['Adventure', 'Fantasy']
    for genres in movie_rows['genres'].str.split('|'):
        for genre in genres:
            if genre not in unique_genres:
                unique_genres.append(genre)
                
    # My goal is to return two categories per movie. But in some cases a movie only has one genre 
    # and I have handle this possibility
    # By checking the length of unique_genres and stopping at the second index
    if len(unique_genres) > 2:
        for index, item in enumerate(unique_genres):
            if index < 2:
                categories.append(item)
            else:
                break
    else:
        categories = list(unique_genres)

    dfs = []
    for category in categories:
        df = movies[movies['most_common_genre'] == category]
        dfs.append(df)

    if len(dfs) == 0:
        df = pd.DataFrame(columns=movies.columns)
    else:
        df = pd.concat(dfs)

    df = df[df['year'] >= (year - 10)]
    return df
processed = process_movies()
processed

['Adventure', 'Fantasy']


Unnamed: 0,movieId,title,genres,year,most_common_genre,title_no_year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,Adventure,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,Adventure,Jumanji
7,8,Tom and Huck (1995),Adventure|Children,1995,Adventure,Tom and Huck
12,13,Balto (1995),Adventure|Animation|Children,1995,Adventure,Balto
28,29,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi,1995,Adventure,"City of Lost Children, The"
...,...,...,...,...,...,...
56961,190903,Dracula in Love (2018),Fantasy|Horror,2018,Fantasy,Dracula in Love
57362,192005,Blood Ransom (2014),Fantasy|Horror|Romance,2014,Fantasy,Blood Ransom
57545,192467,Ammoru (1995),Fantasy|Horror,1995,Fantasy,Ammoru
57634,192713,Compulsion (2018),Fantasy|Horror|Thriller,2018,Fantasy,Compulsion


In [240]:
# Reducing the ratings dataframe to only include the ratings of the movies that I have in my processed movie dataset above
# I use masking and isin() funtion to only pick rows that are both found in ratings and processed datasets

def clean_ratings():
    rate = ratings[ratings['movieId'].isin(processed['movieId'])]
    return rate
rate = clean_ratings()
rate

Unnamed: 0,userId,movieId,rating,timestamp
32,3,828,4.0,945141610
42,4,1,4.0,1113765937
43,4,2,4.0,1113767306
76,4,150,4.0,1113765768
79,4,158,0.5,1127946524
...,...,...,...,...
27753331,283228,1,4.5,1379882801
27753338,283228,368,4.0,1354159876
27753410,283228,3114,4.5,1379882803
27753422,283228,4886,5.0,1379882828


In [241]:
# Create a pivot table
# The pivot() method reshapes the data in the rate dataframe (cleaned ratings datafraame) such that the rows correspond to unique movieId values,
# the columns correspond to unique userId values, and the values in the dataframe are the rating values for each (movieId, userId) pair.
# the NaN values are filled with a value. These NaNs are rows where the user did not rate the movie

ratings_features = rate.pivot(
    columns="userId", index="movieId", values="rating"
).fillna(0)
# ratings_features = ratings_features.apply(lambda x: x.fillna(0), axis=1)

ratings_features


userId,3,4,5,6,10,11,13,14,15,16,...,283215,283217,283218,283219,283220,283222,283224,283226,283227,283228
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,4.0,0.0,0.0,5.0,0.0,0.0,4.5,4.0,0.0,...,4.0,5.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.5
2,0.0,4.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193631,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193731,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [242]:
# The csr_matrix() function is used to create a Compressed Sparse Row (CSR) matrix from a dense matrix or an array-like object. 
# In this case, the values attribute of the ratings_features dataframe is used as input to the csr_matrix() function, 
# which creates a sparse matrix where each row corresponds to a movie and each column corresponds to a user, 
# and the values in the matrix are the ratings given by each user to each movie.

# The advantage of using a sparse matrix is that it saves memory by only storing the non-zero values, 
# which is especially useful for large datasets with many missing values. In the case of movie ratings, 
# it is common for most users to have only rated a small fraction of all movies, 
# so a sparse matrix can significantly reduce memory usage compared to a dense matrix.

# The resulting matrix_movies_users object can be used as input to various machine learning algorithms that work with sparse matrices, 
# such as collaborative filtering or matrix factorization.

#Source: ChatGPT
matrix_movies_users = csr_matrix(ratings_features.values)

In [243]:
# Creates the KNN model using the pivot table
# Create an instance of the K Nearest Neighbor
# metric='cosine': This sets the distance metric to be used to the cosine distance
# algorithm='brute': This sets the algorithm to be used to calculate the nearest neighbors to "brute force" or "brute" algorithm.
# n_neighbors=20: This sets the number of neighbors to consider to 20. 

model_KNN = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)
model_KNN.fit(ratings_features)

In [275]:
# Creates the KNN model using the csr matrix
model_K = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)
model_K.fit(matrix_movies_users)

- The below answers are explained in file 1_3

In [245]:
movieId = get_movie_Id()
movieId

4993

In [246]:
# When I created the pivot dataframe, the userId became columns and the movieIds became rows, this means the original indices that the dataset had 
# were changed to the movieId
# Inorder to get the right row in the matrix i needed to the correct movieId index in the pivot dataframe using the .get_loc()
# and the specify which row I am searching for in csr matrix.
row_idx = ratings_features.index.get_loc(movieId)
row = matrix_movies_users[row_idx, :]
row

<1x212657 sparse matrix of type '<class 'numpy.float64'>'
	with 61883 stored elements in Compressed Sparse Row format>

In [247]:
row_idx

132

In [271]:
matrix_movies_users[132]

<1x212657 sparse matrix of type '<class 'numpy.float64'>'
	with 61883 stored elements in Compressed Sparse Row format>

In [277]:
def recommender_system(movie_name, dataframe, model, number_recommendations):
    ind = []
    movie_id = process.extractOne(movie_name, processed['title'])[1]
    movie_idx = process.extractOne(movie_name, processed['title'])[2]
  
    print('Movie Selected: ', processed['title'][movie_idx], 'Id: ',movie_id)
    print('Searching for recommendation....')

    row_idx = ratings_features.index.get_loc(movieId)
    row = matrix_movies_users[row_idx, :]
    
    distances, indices = model.kneighbors(dataframe[row_idx, :], n_neighbors=number_recommendations+1)
    selected = indices[0]
    selected = selected[selected != row_idx]

    selected_movies = processed.iloc[selected]

    
    return selected_movies


## 1.3a&b How my system works

- My goal in this exercise is to recommend movies to a user based on inputed movie for toy story.

- First i used fuzzywuzzy to return a close match to the movie inputed by the user. This reduces the potential for 
errors since you would need a perfect match for movie you are handling in the dataframe.

- When I get a close enough string as a movie, I reduce the size of my datasets. The original movies dataset is over 58000 rows and the ratings are over 
27 000 000 rows which is obviously too large for my computer to handle. So the best option is to clean these datasets and use what i need.

- I do this by reducing the datsets by at most 2 categories taken from the inputed movie's genres

- I subsequently reduced the ratings to only contain the ratings of the movies found in my cleaned/processed movies dataset

- I create a create pivot daframe using the cleaned ratings dataset

- create a csr_matrix with my pivot dataframe. This helps safe memory usage by only storing non zero values. the csr matrix is then used 
for my recommender algorithm 

#### How KNN works here
- From the csr matrix, each row is a vector. Each of these vectors is a movie since I had my movieIds as rows in the pivot dataframe. These vectors are found in this high 
dimensional matrix space. The KNN recommendation here works by checking the cosine angle between my inputed movie and the K nearest other vectors. The other vector vectors in this matrix with smaller angles compared to my movie will be returned 

<img src="../assets/cos.webp" alt="description of the image" width="300" height="200">

A good example is the image above. Joao Felix and Messi are similar, but Jaoa has fewer years of play and doesnt have as many ratings but is  very similar to messi as opposed
to Cristiano who is quite different but simailar in amout of ratings as Messi. A euclidean distance would have picked Messi and Ronaldo where a cosine would pick Joao as similar to Messi.

Cosine similarity measures the similarity between two vectors or data points in multidimensional space. It is measured by the cosine of the angle between two vectors or data points. It determines whether these two vectors are pointing in the same direction. It is often used to measure similarity in text analysis.

When KNN makes inference about a movie, KNN will calculate the “distance” between the target movie and every other movie in its database, then it ranks its distances and returns the top K nearest neighbor movies as the most similar movie recommendations.


- **[cosine similarity](https://www.kipi.bi/post/basics-to-knn-algorithm)**
- **[recommender system towardsdatascience](https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-1-knn-item-based-collaborative-filtering-637969614ea)**

*the below article is very detailed as it looks into the types of recommdation systems, and eventually goes through a similar recommendation system as this but for books*
- **[recommender system medium.com](https://aman-makwana101932.medium.com/understanding-recommendation-system-and-knn-with-project-book-recommendation-system-c648e47ff4f6)**


- **Also used chatGPT**

>

## 1.3b How the KNN recommendation works

In [278]:

recommendations = recommender_system('Lord of the Rings: The Fellowship of the Ring, The (2001)', matrix_movies_users,model_K, 20)
recommendations

Movie Selected:  Lord of the Rings: The Fellowship of the Ring, The (2001) Id:  100
Searching for recommendation....


Unnamed: 0,movieId,title,genres,year,most_common_genre,title_no_year
6376,6485,Rugrats Go Wild! (2003),Adventure|Animation|Children|Comedy,2003,Adventure,Rugrats Go Wild!
4352,4446,Final Fantasy: The Spirits Within (2001),Adventure|Animation|Fantasy|Sci-Fi,2001,Adventure,Final Fantasy: The Spirits Within
6631,6740,Bingo (1991),Adventure|Comedy,1991,Adventure,Bingo
4885,4980,Bill & Ted's Bogus Journey (1991),Adventure|Comedy|Fantasy|Sci-Fi,1991,Adventure,Bill & Ted's Bogus Journey
4895,4990,Jimmy Neutron: Boy Genius (2001),Adventure|Animation|Children|Comedy,2001,Adventure,Jimmy Neutron: Boy Genius
8040,8723,Warriors (Guerreros) (2002),Adventure|Drama|War,2002,Adventure,Warriors
6199,6297,Holes (2003),Adventure|Children|Comedy|Mystery,2003,Adventure,Holes
13345,65261,Ponyo (Gake no ue no Ponyo) (2008),Adventure|Animation|Children|Fantasy,2008,Adventure,Ponyo
7941,8624,Freedom Downtime (2001),Adventure|Crime|Documentary,2001,Adventure,Freedom Downtime
11026,45208,RV (2006),Adventure|Children|Comedy,2006,Adventure,RV


In [262]:
# This only takes my entire code and make it into a class without having to run every cell
# I am also using the class in my app.py file for my backend in flask

class RecommendationSystem:
    def __init__(self, title):
        if not title:
            print("Error: Movie title cannot be empty.")
            return
        self.title = title
        self.movies = movies
        self.ratings = ratings
        
    def get_movie(self):
        movie = process.extractOne(self.title, movies['title'])
        return [movie[0], movie[1]]

    def get_movie_Id(self):
        movie = self.get_movie()
        cleaned = movies[movies['title'] == movie[0]]
        return cleaned['movieId'].values[0]
    
    def get_movie_year(self):
        movie = self.get_movie()
        cleaned = movies[movies['title'] == movie[0]]
        return cleaned['year'].values[0]

    def process_movies(self):
        year = get_movie_year()
        movie = self.get_movie()
        movie_title = movie[0]
        movie_rows = movies[movies['title'] == movie_title]
        # common_genres = set(movie_rows['genres'].str.split('|').sum())
        categories = []
        
        unique_genres = []
        for genres in movie_rows['genres'].str.split('|'):
            for genre in genres:
                if genre not in unique_genres:
                    unique_genres.append(genre)

        print(unique_genres)

        if len(unique_genres) > 2:
            for index, item in enumerate(unique_genres):
                if index < 2:
                    categories.append(item)
                else:
                    break
        else:
            categories = list(unique_genres)

        dfs = []
        for category in categories:
            df = movies[movies['most_common_genre'] == category]
            dfs.append(df)

        if len(dfs) == 0:
            df = pd.DataFrame(columns=movies.columns)
        else:
            df = pd.concat(dfs)

        df = df[df['year'] >= (year - 10)]
        
        return df
    
    def clean_ratings(self):
        processed = self.process_movies()
        rate = ratings[ratings['movieId'].isin(processed['movieId'])]
        return rate
    
    def ratings_features(self):
        rate = self.clean_ratings()
        ratings_features = rate.pivot(columns='userId', index='movieId', values='rating').fillna(0)
        return ratings_features
        
    def matrix_dataframe(self):
        ratings_features = self.ratings_features()
        matrix_movies_users = csr_matrix(ratings_features.values)
        
        return matrix_movies_users
    
    def recommend(self):
        ratings_features = self.ratings_features()
        title = self.get_movie()
        print(title)
        mat = self.matrix_dataframe()
        movieId = self.get_movie_Id()
        model_KNN = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20)
        model_KNN.fit(mat)

        row_idx = ratings_features.index.get_loc(movieId)
        
        distances, indices = model_KNN.kneighbors(mat[row_idx, :], n_neighbors=20)
        selected = indices[0]
        selected = selected[selected != row_idx]

        selected_movies = processed.iloc[selected]

        
        return selected_movies
        
        
   

In [263]:
rec2 = RecommendationSystem('Lord of the Rings: The Fellowship of the Ring, The (2001)')

In [264]:
rec2.recommend()

['Adventure', 'Fantasy']
['Lord of the Rings: The Fellowship of the Ring, The (2001)', 100]
['Adventure', 'Fantasy']


Unnamed: 0,movieId,title,genres,year,most_common_genre,title_no_year
6376,6485,Rugrats Go Wild! (2003),Adventure|Animation|Children|Comedy,2003,Adventure,Rugrats Go Wild!
4352,4446,Final Fantasy: The Spirits Within (2001),Adventure|Animation|Fantasy|Sci-Fi,2001,Adventure,Final Fantasy: The Spirits Within
6631,6740,Bingo (1991),Adventure|Comedy,1991,Adventure,Bingo
4885,4980,Bill & Ted's Bogus Journey (1991),Adventure|Comedy|Fantasy|Sci-Fi,1991,Adventure,Bill & Ted's Bogus Journey
4895,4990,Jimmy Neutron: Boy Genius (2001),Adventure|Animation|Children|Comedy,2001,Adventure,Jimmy Neutron: Boy Genius
8040,8723,Warriors (Guerreros) (2002),Adventure|Drama|War,2002,Adventure,Warriors
6199,6297,Holes (2003),Adventure|Children|Comedy|Mystery,2003,Adventure,Holes
13345,65261,Ponyo (Gake no ue no Ponyo) (2008),Adventure|Animation|Children|Fantasy,2008,Adventure,Ponyo
7941,8624,Freedom Downtime (2001),Adventure|Crime|Documentary,2001,Adventure,Freedom Downtime
11026,45208,RV (2006),Adventure|Children|Comedy,2006,Adventure,RV
