## Loading Libraries

In [1]:
import pandas as pd
import numpy as np

## Loading Preprocessed data

In [2]:
# Read the preprocessed movie table from CSV into a DataFrame.
movie_data = pd.read_csv(
    filepath_or_buffer=r"D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\processed_data_for_movie_recommendation.csv"
)

# Quick sanity checks. In a notebook cell only the last expression is rendered,
# so just `movie_data.columns` shows up in the output below.
movie_data.head()
movie_data.shape         # (4809, 3)
movie_data.isna().sum()  # 0
movie_data.columns

Index(['movie_id', 'title', 'tags'], dtype='object')

## Recommender System

### Make a Word-dictionary

In [3]:
## Using Count_vectorizer

# from sklearn.feature_extraction.text import CountVectorizer
# cv=CountVectorizer(max_features=10000)
# word_vectors=cv.fit_transform(movie_data["tags"]).toarray()
# cv.get_feature_names()

In [4]:
## Using TFIDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vocabulary on the "tags" column and turn every row into a dense vector.
tfidf = TfidfVectorizer()
word_vector = tfidf.fit_transform(movie_data["tags"]).toarray()      # Convert words to arr[vectors]

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer `get_feature_names_out()` and fall back only on old installations;
# `list(...)` keeps `word` a plain list as before.
if hasattr(tfidf, "get_feature_names_out"):
    word = list(tfidf.get_feature_names_out())
else:
    word = tfidf.get_feature_names()
#word                                                              # 33313 of word dictionary

### Calculating Similarity or distance-metrics
* Here we need to calculate the similarity of each movie with every other movie
* In high-dimensional space "Euclidean distance" is not a good measure, so we use "cosine similarity" instead

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of `word_vector`:
# entry [i, j] is the similarity score of movie i with movie j.
similarity_matrix = cosine_similarity(word_vector)

# `np.round_` was removed in NumPy 2.0; `np.round` is the supported, equivalent spelling.
similarity_matrix = np.round(similarity_matrix, decimals=2)   # round to 2 decimal places
similarity_matrix.shape                                     # (4809, 4809)
similarity_matrix

array([[1.  , 0.02, 0.03, ..., 0.02, 0.02, 0.  ],
       [0.02, 1.  , 0.02, ..., 0.02, 0.  , 0.01],
       [0.03, 0.02, 1.  , ..., 0.02, 0.04, 0.  ],
       ...,
       [0.02, 0.02, 0.02, ..., 1.  , 0.01, 0.03],
       [0.02, 0.  , 0.04, ..., 0.01, 1.  , 0.01],
       [0.  , 0.01, 0.  , ..., 0.03, 0.01, 1.  ]])

In [11]:
 # write similarity matrix to npy file
np.save(
    file=r'D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\movies_similarity_matrix.npy',
    arr=similarity_matrix,  # the rounded movie-by-movie similarity scores computed above
)

### Recommendation Engine

In [17]:
import pandas as pd
import numpy as np

# Load the (movie-ID, title) table used to look titles up by row position.
data = pd.read_csv(
    filepath_or_buffer=r"D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\movie_recommendation_data.csv"
)

# Load the similarity matrix that was written to disk with np.save.
similarity_matrix = np.load(
    file=r"D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\movies_similarity_matrix.npy"
)

In [19]:
def recommend(movie_name, top_n=3, movies=None, similarities=None):
    """Return the ``top_n`` movies most similar to ``movie_name``.

    Parameters
    ----------
    movie_name : str
        Exact title to look up in the ``title`` column.
    top_n : int, default 3
        Maximum number of recommendations to return.
    movies : pandas.DataFrame, optional
        Table with a ``title`` column whose row order matches ``similarities``.
        Defaults to the notebook-level ``data``.
    similarities : array-like, optional
        Square matrix where row ``i`` holds the similarity of movie ``i`` with
        every movie. Defaults to the notebook-level ``similarity_matrix``.

    Returns
    -------
    list of [int, str, float]
        ``[row_position, title, similarity]`` entries, most similar first.
        The queried movie itself is never included.

    Raises
    ------
    IndexError
        If no row has ``movie_name`` as its title.
    """
    if movies is None:
        movies = data
    if similarities is None:
        similarities = similarity_matrix

    # Positional lookup, so the result is valid for both the matrix and .iloc
    # regardless of what the DataFrame's index labels are.
    matches = np.flatnonzero((movies["title"] == movie_name).to_numpy())
    if matches.size == 0:
        raise IndexError("Movie {!r} not found in the title column".format(movie_name))
    movie_index = int(matches[0])

    scores = np.asarray(similarities[movie_index])

    # Stable descending order: ties keep ascending row order, as the previous
    # sorted(..., reverse=True) did.
    ranked = np.argsort(-scores, kind="stable")

    # Drop the queried movie explicitly instead of assuming it sorts first:
    # another movie can tie with it (e.g. identical tags after rounding), in
    # which case a fixed [1:4] slice would recommend the movie to itself.
    ranked = ranked[ranked != movie_index][:max(top_n, 0)]

    titles = movies["title"]
    return [[int(i), titles.iloc[int(i)], round(scores[i], 2)] for i in ranked]

# movies=recommend("Pirates of the Caribbean: On Stranger Tides")
# movies=recommend("Furious 7")
# for movie in movies:
#     print("ID:{} - {} ;Similarity'{}'".format(movie[0],movie[1],movie[2]))

In [21]:
## Predict Your Next Movie
# Run the recommender on a handful of titles. The return values are not
# stored or displayed here; only the printed loop below produces output.
for _title in ("Avatar", "The Avengers", "Titanic", "Toy Story 3", "Furious 7", "World War Z"):
    recommend(_title)

# Print each [id, title, similarity] recommendation for one title.
movies = recommend("Furious 7")
for movie in movies:
    print("ID:{} - {} ;Similarity':{}'".format(*movie))


ID:99 - The Fast and the Furious ;Similarity':0.33'
ID:204 - Fast Five ;Similarity':0.29'
ID:405 - The Fast and the Furious: Tokyo Drift ;Similarity':0.23'


In [16]:
## For Movie recommendation we only require data(movie-ID,title) & similarity-matrix.npy
# NOTE: the CSV written here is the one read into `data` by the loading cell under
# "Recommendation Engine"; on a fresh kernel this cell has to run before that one.
df = movie_data.loc[:, ['movie_id', 'title']]
df.to_csv(
    path_or_buf=r"D:\Projects\Content-Based_Movie_Recommendation_System-main\Backend\Data\movie_recommendation_data.csv",
    index=False,
)