In [97]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process
import seaborn as sns
import matplotlib.pyplot as plt
import plotly_express as px
import re

- Some of the code in this file have been explained in the Explorative analyis file

- I will will explain where i a new piece of code and how i came to any conclusions

In [98]:
movies, ratings = pd.read_csv('../data/movies.csv'), pd.read_csv('../data/ratings.csv')


In [99]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [100]:
movies['year']  = movies['title'].str.extract(r'\((\d{4})\)')

movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II (1995),Comedy,1995


In [101]:
movies.describe()

Unnamed: 0,movieId
count,58098.0
mean,111919.516197
std,59862.660956
min,1.0
25%,72437.75
50%,126549.0
75%,161449.5
max,193886.0


In [102]:
movies.loc[:, 'title_no_year'] = movies['title'].apply(lambda x: x.split("(")[0].rstrip())

---

## 1.3) Recommender system

- The below answers are explained in file 1_3

In [103]:
ratings['movieId'].nunique()

53889

In [104]:
movies['movieId'].nunique()

58098

In [105]:
new_ratings = ratings[ratings['movieId'].isin(movies['movieId'])]
new_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27753444 entries, 0 to 27753443
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB


In [106]:
# Convert movieId & userId to Categorical's to make them work with csr_matrix

movieIds = pd.Categorical(new_ratings['movieId'], categories=movies['movieId'])
userIds = pd.Categorical(new_ratings['userId'])

# Create the csr matrix
matrix = csr_matrix((new_ratings['rating'], (movieIds.codes, userIds.codes)))

matrix.shape

(58098, 283228)

In [107]:
model_nearest_neighbor = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10)
model_nearest_neighbor.fit(matrix)

In [108]:
movies[movies['title'] == 'Sharknado (2013)']

Unnamed: 0,movieId,title,genres,year,title_no_year
21519,103596,Sharknado (2013),Sci-Fi,2013,Sharknado


In [109]:
movies[movies['title'] == 'Sharknado (2013)']['movieId'].values[0]

103596

In [110]:
ratings[ratings['movieId'] == 103596]

Unnamed: 0,userId,movieId,rating,timestamp
13221,134,103596,2.0,1424660688
107849,1089,103596,2.5,1377528529
240325,2329,103596,2.0,1421016581
351780,3583,103596,3.0,1468361712
411651,4203,103596,1.5,1511044668
...,...,...,...,...
27267975,278225,103596,2.5,1442375893
27300077,278554,103596,0.5,1502763424
27362890,279173,103596,1.0,1501596274
27625875,281869,103596,0.5,1423035948


In [111]:
process.extractOne('sharknado', movies['title'])

('Sharknado (2013)', 90, 21519)

In [112]:
process.extractOne('sharknado', movies['title'])[2]

21519

In [113]:
def recommender_system(movie_name, number_recommendations):
    title = process.extractOne(movie_name, movies['title'])[0]
    movie_id = process.extractOne(movie_name, movies['title'])[1]
    movie_idx = process.extractOne(movie_name, movies['title'])[2]
    id = movies[movies['title'] == title]['movieId'].values[0]
    print('Movie Selected: ', movies['title'][movie_idx], 'Id: ',movie_id)
    print('Searching for recommendation....')

    
    distances, indices = model_nearest_neighbor.kneighbors(matrix[movie_idx], n_neighbors=number_recommendations)
    
    indice = indices[0]
    
    selected = indice[indice != movie_idx]

    selected_movies = movies.iloc[selected]
    print(indices)
    return selected_movies

In [114]:
recommendations = recommender_system('sharknado',10)
recommendations

Movie Selected:  Sharknado (2013) Id:  90
Searching for recommendation....
[[21519 24658 32886 43775 34324 23884 19722 26342 20988 15787]]


Unnamed: 0,movieId,title,genres,year,title_no_year
24658,114242,Sharknado 2: The Second One (2014),Horror|Sci-Fi|Thriller,2014,Sharknado 2: The Second One
32886,136305,Sharknado 3: Oh Hell No! (2015),Horror|Sci-Fi,2015,Sharknado 3: Oh Hell No!
43775,161918,Sharknado 4: The 4th Awakens (2016),Action|Adventure|Horror|Sci-Fi,2016,Sharknado 4: The 4th Awakens
34324,139759,Lavalantula (2015),Horror|Sci-Fi,2015,Lavalantula
23884,111663,Zombeavers (2014),Action|Comedy|Horror,2014,Zombeavers
19722,96923,2-Headed Shark Attack (2012),Comedy|Horror,2012,2-Headed Shark Attack
26342,119705,Piranhaconda (2012),Horror|Sci-Fi,2012,Piranhaconda
20988,101739,Evil Dead (2013),Horror,2013,Evil Dead
15787,79879,Piranha (Piranha 3D) (2010),Action|Horror|Thriller,2010,Piranha


## 1.3a&b How my system works AND how KNN works

- My goal in this exercise is to recommend movies to a user based on inputed movie for toy story.

- First i used fuzzywuzzy to return a close match to the movie inputed by the user. This reduces the potential for 
errors since you would need a perfect match for movie you are handling in the dataframe.

- I get a close enough string as a movie from fuzzywuzzy.

- I have cleaned the ratings the ratings dataset to only contain movieIds that are present in the movies dataframe.

- I create a csr matrix using pandas Categorical and for ratings specify to include even rows in the movies dataframe which might otherwie not be included in the matrix if they do not have any ratings since a csr matrix only 
    stores non-zero values.

- Eventually initialise my nearest neighbor model with the from scipy, then i pass in the metric "cosine" which  specifies the distance metric used to measure the similarity between two data points. In this case, the cosine similarity metric is used, which measures the cosine of the angle between two vectors. A vector in recommender system would each row in the matrix corresponding each movie.  The 'brute' force algorithm is used, which computes the distances between all pairs of points in the dataset and n_neighbors=10 specifies the number of nearest neighbors to be returned for each data point.

- Finally the indices returned are used to return a dataframe containing just the movies recommended to the user.

#### How KNN works here
Source: https://www.ibm.com/se-en/topics/knn
- The k-nearest neighbors algorithm, also known as KNN or k-NN, is a non-parametric, supervised learning classifier, which uses proximity to make classifications or predictions about the grouping of an individual data point. While it can be used for either regression or classification problems, it is typically used as a classification algorithm, working off the assumption that similar points can be found near one another. the goal of the k-nearest neighbor algorithm is to identify the nearest neighbors of a given query point, so that we can assign a class label to that point. In order to do this, KNN has a few requirements

<img src="../assets/cos.webp" alt="description of the image" width="300" height="200">

A good example is the image above. Joao Felix and Messi are similar, but Jaoa has fewer years of play and doesnt have as many ratings but is  very similar to messi as opposed
to Cristiano who is quite different but simailar in amout of ratings as Messi. A euclidean distance would have picked Messi and Ronaldo where a cosine would pick Joao as similar to Messi.

Cosine similarity measures the similarity between two vectors or data points in multidimensional space. It is measured by the cosine of the angle between two vectors or data points. It determines whether these two vectors are pointing in the same direction. It is often used to measure similarity in text analysis.

When KNN makes inference about a movie, KNN will calculate the “distance” between the target movie and every other movie in its database, then it ranks its distances and returns the top K nearest neighbor movies as the most similar movie recommendations.


- **[cosine similarity](https://www.kipi.bi/post/basics-to-knn-algorithm)**
- **[recommender system towardsdatascience](https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-1-knn-item-based-collaborative-filtering-637969614ea)**

*the below article is very detailed as it looks into the types of recommdation systems, and eventually goes through a similar recommendation system as this but for books*
- **[recommender system medium.com](https://aman-makwana101932.medium.com/understanding-recommendation-system-and-knn-with-project-book-recommendation-system-c648e47ff4f6)**


- **Also used chatGPT**

>