### Importing the libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
movies = pd.read_csv(r"C:\Users\DELL\Downloads\The Movies Dataset\movies_metadata.csv",
                    usecols=["id","overview","title","vote_average","vote_count","release_date"],low_memory=False)

In [3]:
movies.head()

Unnamed: 0,id,overview,release_date,title,vote_average,vote_count
0,862,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,Toy Story,7.7,5415.0
1,8844,When siblings Judy and Peter discover an encha...,1995-12-15,Jumanji,6.9,2413.0
2,15602,A family wedding reignites the ancient feud be...,1995-12-22,Grumpier Old Men,6.5,92.0
3,31357,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Waiting to Exhale,6.1,34.0
4,11862,Just when George Banks has recovered from his ...,1995-02-10,Father of the Bride Part II,5.7,173.0


In [4]:
movies = movies.reset_index(drop=True)
movies = movies.dropna()
movies = movies.drop_duplicates()
movies = movies.rename(columns={"id":"movieId"})
movies["movieId"] = movies["movieId"].astype("int64")

In [5]:
#We will focus only on the overview from the dataset:
movies["overview"].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [6]:
#We are using the TF-IDF method and setting up the model:
tfidf = TfidfVectorizer(stop_words="english", min_df = 4)

#We removed commonly used words such as 'and', 'the', 'on', 'in', as they do not carry significant values.

In [7]:
#We replaced NaN values with blanks as NaNs can cause issues in calculations:
movies['overview'] = movies['overview'].fillna('')

In [8]:
#After fitting, we transform the data:
tfidf_matrix = tfidf.fit_transform(movies['overview'])

In [9]:
#There are 45,466 movie reviews and 75,827 words:
tfidf_matrix.shape

(44407, 23834)

### Cosine Similarity Matrix

In [10]:
#This is the part where we find which movies are similar to each other, mathematically speaking, using text vectors.
cosine_sim = cosine_similarity(tfidf_matrix,
                               tfidf_matrix)
#The cosine_sim.shape gives us the similarities between documents.
cosine_sim.shape

(44407, 44407)

### Making Recommendations based on similarities

In [11]:
#To evaluate the calculated scores, we retrieve the names.

indices = pd.Series(movies.index, index=movies['title'])

In [12]:
#There are multiple instances of movies.
indices.index.value_counts()

title
Cinderella              11
Hamlet                   9
Beauty and the Beast     8
Alice in Wonderland      8
Les Misérables           8
                        ..
No Greater Love          1
A Woman in Berlin        1
Talhotblond              1
Tortilla Flat            1
Queerama                 1
Name: count, Length: 41303, dtype: int64

In [13]:
#We keep one of the duplicate movies and delete the others. We take the last one for freshness.
indices = indices[~indices.index.duplicated(keep='last')]

In [14]:
#We note the index of the movie "Sherlock Holmes".
movie_index = indices["Sherlock Holmes"]

In [15]:
#Accessing cosine_sim with the index of Sherlock Holmes.
cosine_sim[movie_index]

array([0.00630183, 0.00923754, 0.        , ..., 0.        , 0.01089884,
       0.        ])

In [16]:
#We create a dataframe called similarity_scores and retrieve the similar ones, evaluating them as scores.
similarity_scores = pd.DataFrame(cosine_sim[movie_index],columns=["score"])

In [17]:
#Fetching the top 10 movies with the highest scores. The first observation includes the movie itself, so we use 1 to 11.
movie_indices = similarity_scores.sort_values("score", ascending=False)[1:11].index

In [18]:
#Retrieving the titles of the movies with index information.
movies['title'].iloc[movie_indices]

35745    The Dog of Flanders
16735    The Heart Elsewhere
31594         We Can Do That
30608      Drama of Jealousy
25451             Marvellous
44609             The Mitten
21348      Darling Companion
12104        The Dog Problem
33454       The Empty Canvas
42413         Death by Death
Name: title, dtype: object

### Building a Recommendation Function

In [19]:
def content_based_recommender(title, cosine_sim, dataframe):
    # making index
    indices = pd.Series(dataframe.index, index=dataframe['title'])
    indices = indices[~indices.index.duplicated(keep='last')]
    # catch title's index
    movie_index = indices[title]
    # calculating similarty score to target
    similarity_scores = pd.DataFrame(cosine_sim[movie_index], columns=["score"])
    # bring 10 movie
    movie_indices = similarity_scores.sort_values("score", ascending=False)[1:11].index
    return dataframe['title'].iloc[movie_indices]

In [21]:
content_based_recommender("Toy Story", cosine_sim, movies)

15348                                     Toy Story 3
2997                                      Toy Story 2
10301                          The 40 Year Old Virgin
24523                                       Small Fry
8327                                        The Champ
23843                     Andy Hardy's Blonde Trouble
43427                Andy Kaufman Plays Carnegie Hall
29202                                      Hot Splash
38476    Superstar: The Life and Times of Andy Warhol
42721    Andy Peters: Exclamation Mark Question Point
Name: title, dtype: object

In [22]:
content_based_recommender("The Matrix", cosine_sim, movies)

27610                So Sweet, So Dead
3534                             Lured
21                             Copycat
2069                            Frenzy
20626       The Wandering Soul Murders
7583             The Stendhal Syndrome
28141               Mark Strikes Again
23816    Tables Turned on the Gardener
26944            Whistling in Brooklyn
28203                Kommissarie Späck
Name: title, dtype: object