## Importing and reading csv


In [45]:
import pandas
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import random

In [46]:
movies = pandas.read_csv('data/movies.csv')
credits = pandas.read_csv('data/credits.csv')
ratings = pandas.read_csv('data/ratings.csv')
movies_small = pandas.read_csv('data/movies_small.csv', sep=';')

In [47]:
movies_small

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,190000000,"[{""id"": 28, ""name"": ""Action""}]",http://www.furious7.com/,168259,"[{""id"": 830, ""name"": ""car race""}, {""id"": 3428,...",en,Furious 7,Deckard Shaw seeks revenge against Dominic Tor...,102.322217,"[{""name"": ""Universal Pictures"", ""id"": 33}, {""n...","[{""iso_3166_1"": ""JP"", ""name"": ""Japan""}, {""iso_...",2015-04-01,1506249360,137,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Vengeance Hits Home,Furious 7,7.3,4176
1,200000000,"[{""id"": 16, ""name"": ""Animation""}, {""id"": 10751...",http://www.disney.go.com/cars/,49013,"[{""id"": 830, ""name"": ""car race""}, {""id"": 9663,...",en,Cars 2,Star race car Lightning McQueen and his pal Ma...,49.98659,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2011-06-11,559852396,106,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Ka-ciao!,Cars 2,5.8,2033
2,170000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 878, ""na...",http://marvel.com/guardians,118340,"[{""id"": 8828, ""name"": ""marvel comic""}, {""id"": ...",en,Guardians of the Galaxy,"Light years from Earth, 26 years after being a...",481.098624,"[{""name"": ""Marvel Studios"", ""id"": 420}, {""name...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2014-07-30,773328629,121,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,All heroes start somewhere.,Guardians of the Galaxy,7.9,9742
3,145000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.kungfupanda.com/,140300,"[{""id"": 478, ""name"": ""china""}, {""id"": 779, ""na...",en,Kung Fu Panda 3,"Continuing his ""legendary adventures of awesom...",56.747978,"[{""name"": ""Twentieth Century Fox Film Corporat...","[{""iso_3166_1"": ""CN"", ""name"": ""China""}, {""iso_...",2016-01-23,521170825,95,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Grab destiny by the rice dumplings.,Kung Fu Panda 3,6.7,1603
4,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-07-16,1084939099,165,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106
5,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2012-03-07,284139100,132,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124


## Intalize TF-IDF and remove na values


In [48]:
tfidf = TfidfVectorizer(stop_words='english')
movies['overview'] = movies['overview'].fillna('')

In [49]:
tfidf_matrix = tfidf.fit_transform(movies['overview'])

## Create TF-IDF matrix and put in dataframe and then print first 3 rows


In [50]:
pandas.DataFrame(tfidf_matrix.toarray(),

                 columns=tfidf.get_feature_names_out())[0:3]

Unnamed: 0,00,000,007,07am,10,100,1000,101,108,10th,...,zuckerberg,zula,zuzu,zyklon,æon,éloigne,émigré,été,única,über
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Similarity matrix


In [51]:
similarity_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)

In [52]:
similarity_matrix[1]

array([0.        , 1.        , 0.        , ..., 0.02160533, 0.        ,
       0.        ])

In [53]:
similarity_matrix.shape

(4803, 4803)

## Find the most similar movies


In [59]:
def get_similar_movies(movie_title, similarity_matrix):
    # Find the index of the movie in the DataFrame
    movie_index = movies[movies['title'].str.lower() ==
                         movie_title.lower()].index[0]

    # Create a list of tuples (index, similarity_score)
    similarity_scores = list(enumerate(similarity_matrix[movie_index]))

    # Sort the list of tuples by similarity score in descending order
    similarity_scores = sorted(
        similarity_scores, key=lambda x: x[1], reverse=True)

    # Exclude the first item (itself)
    similarity_scores = similarity_scores[1:]

    # If there are more than 10 similar movies, randomly select 10
    if len(similarity_scores) > 10:
        similarity_scores = random.sample(similarity_scores, 10)

    # Extract the indices of top similar movies
    similar_movie_indices = [index[0] for index in similarity_scores]

    # Return the titles of the similar movies using the indices
    similar_movie_titles = movies['title'].iloc[similar_movie_indices]

    return similar_movie_titles

In [62]:
movies_title = 'John Carter'
get_similar_movies(movies_title, similarity_matrix)

3669            Should've Been Romeo
4685    The Case of the Grinning Cat
1941            Where the Truth Lies
29                           Skyfall
2284                     The Shining
550            The Angry Birds Movie
3345               Definitely, Maybe
616                            Ted 2
1162     He's Just Not That Into You
1724               Pride & Prejudice
Name: title, dtype: object