<h4>Imports</h4>

In [1]:
import pandas as pd

<h5>Data Loading</h5>

In [2]:
# Read the two source tables in one pass: movie metadata first, then the
# user-applied tags. Both paths are relative to the working directory.
movies, tags = (pd.read_csv(csv_name) for csv_name in ('movies.csv', 'tags.csv'))

In [3]:
# Preview the first rows of the tags table to check its columns
tags.head()

Unnamed: 0,userId,movieid,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [4]:
# Preview the first rows of the movies table to check its columns
movies.head()

Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Merging the two tables on 'movieid'. pd.merge defaults to an inner join, so
# only tags that refer to a movie present in `movies` are kept.
data = pd.merge(movies, tags, on='movieid')

# Dropping rows whose tag is missing and coercing the rest to str:
# ' '.join raises TypeError as soon as it meets NaN or any non-string value.
data = data.dropna(subset=['tag']).assign(tag=lambda df: df['tag'].astype(str))

# Grouping tags by movieid and concatenating them into a single
# space-separated string, giving one row per movie
data = data.groupby('movieid')['tag'].apply(' '.join).reset_index()

In [6]:
# Inspect the aggregated tags: one row per movieid with its joined tag string
data

Unnamed: 0,movieid,tag
0,1,pixar pixar fun
1,2,fantasy magic board game Robin Williams game
2,3,moldy old
3,5,pregnancy remake
4,7,remake
...,...,...
1567,183611,Comedy funny Rachel McAdams
1568,184471,adventure Alicia Vikander video game adaptation
1569,187593,Josh Brolin Ryan Reynolds sarcasm
1570,187595,Emilia Clarke star wars


In [7]:
# Merging the aggregated tags into the movies dataframe on 'movieid'.
# pd.merge defaults to an inner join, so movies without any tag are dropped.
# NOTE: this rebinds `movies`; re-running this cell on its own would merge
# the tag column in a second time.
movies = pd.merge(movies, data, on='movieid')

In [8]:
# Splitting the pipe-delimited 'genres' string into a list of genre names.
# NOTE: meant to run once per kernel session; a second run would apply
# .str.split to lists instead of strings.
movies['genres'] = movies['genres'].str.split('|')

# Tag normalization (lowercasing, replacing hyphens with spaces) is currently
# disabled, so the 'tag' column is left as produced by the aggregation step.
# movies['tag'] = movies['tag'].str.lower().str.replace('-', ' ')

In [9]:
# Inspect the movies table after the tag merge and the genre split
movies

Unnamed: 0,movieid,title,genres,tag
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",pixar pixar fun
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",fantasy magic board game Robin Williams game
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",moldy old
3,5,Father of the Bride Part II (1995),[Comedy],pregnancy remake
4,7,Sabrina (1995),"[Comedy, Romance]",remake
...,...,...,...,...
1567,183611,Game Night (2018),"[Action, Comedy, Crime, Horror]",Comedy funny Rachel McAdams
1568,184471,Tomb Raider (2018),"[Action, Adventure, Fantasy]",adventure Alicia Vikander video game adaptation
1569,187593,Deadpool 2 (2018),"[Action, Comedy, Sci-Fi]",Josh Brolin Ryan Reynolds sarcasm
1570,187595,Solo: A Star Wars Story (2018),"[Action, Adventure, Children, Sci-Fi]",Emilia Clarke star wars


In [10]:
# Combining each movie's genre list into a single space-separated string.
# Only the genres feed 'content'; the 'tag' column is not included.
# The vectorized .str.join avoids a row-by-row DataFrame.apply(axis=1).
movies['content'] = movies['genres'].str.join(' ')

In [11]:
# Inspect the movies table with the newly added 'content' column
movies

Unnamed: 0,movieid,title,genres,tag,content
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",pixar pixar fun,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",fantasy magic board game Robin Williams game,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",moldy old,Comedy Romance
3,5,Father of the Bride Part II (1995),[Comedy],pregnancy remake,Comedy
4,7,Sabrina (1995),"[Comedy, Romance]",remake,Comedy Romance
...,...,...,...,...,...
1567,183611,Game Night (2018),"[Action, Comedy, Crime, Horror]",Comedy funny Rachel McAdams,Action Comedy Crime Horror
1568,184471,Tomb Raider (2018),"[Action, Adventure, Fantasy]",adventure Alicia Vikander video game adaptation,Action Adventure Fantasy
1569,187593,Deadpool 2 (2018),"[Action, Comedy, Sci-Fi]",Josh Brolin Ryan Reynolds sarcasm,Action Comedy Sci-Fi
1570,187595,Solo: A Star Wars Story (2018),"[Action, Adventure, Children, Sci-Fi]",Emilia Clarke star wars,Action Adventure Children Sci-Fi


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Creating a TfidfVectorizer object; stop_words='english' makes it discard
# scikit-learn's built-in English stop-word list when tokenizing
vectorizer = TfidfVectorizer(stop_words='english')

# Fitting the vocabulary on the 'content' column and transforming each
# movie's content string into a TF-IDF weighted row vector
content_matrix = vectorizer.fit_transform(movies['content'])

# Computing cosine similarity between every pair of rows (movies); called
# with a single argument, the matrix is compared against itself
similarities = cosine_similarity(content_matrix)

# Converting the similarities array to a pandas DataFrame labelled on both
# axes by movies.index, so a movie's scores can be looked up by its index label
similarity_df = pd.DataFrame(similarities, index=movies.index, columns=movies.index)


In [13]:
# Persisting the similarity matrix to CSV. index=False omits the row labels;
# the header row still carries the column labels (movies.index).
similarity_df.to_csv("similarity_dataset.csv", index=False)

In [14]:
def get_similar_movies(movie_name, n=10, movies_df=None, similarity=None):
    """Return the ``n`` movies most similar to ``movie_name``.

    Args:
        movie_name: Exact value of the 'title' column for the query movie.
            If several rows share the title, the first one is used.
        n: Maximum number of neighbours to return. Values below 0 are
            treated as 0.
        movies_df: DataFrame with 'title' and 'movieid' columns. Defaults to
            the notebook-level ``movies`` frame.
        similarity: Square DataFrame of pairwise scores whose row and column
            labels are the index labels of ``movies_df``. Defaults to the
            notebook-level ``similarity_df`` frame.

    Returns:
        A list of ``(title, movieid)`` tuples ordered from most to least
        similar. The query movie itself is never included.

    Raises:
        IndexError: If no row has ``movie_name`` as its title.
    """
    # Fall back to the notebook-level frames when none are passed explicitly.
    if movies_df is None:
        movies_df = movies
    if similarity is None:
        similarity = similarity_df

    # Getting the index label of the movie
    matches = movies_df[movies_df['title'] == movie_name].index
    if len(matches) == 0:
        raise IndexError(f"No movie titled {movie_name!r} in the movies table")
    movie_idx = matches[0]

    # Getting the similarity scores for the given movie. The movie's own entry
    # is dropped by label rather than by assuming it sorts first: other movies
    # can tie with it at the maximum score, so "skip position 0" could return
    # the query itself and discard a real neighbour. A stable sort keeps tied
    # movies in their original table order.
    movie_scores = (
        similarity[movie_idx]
        .drop(movie_idx)
        .sort_values(ascending=False, kind='mergesort')
    )

    # Getting the top n similar movies
    similar_movies = movies_df.loc[movie_scores.head(max(n, 0)).index]

    return list(zip(similar_movies['title'].tolist(),
                    similar_movies['movieid'].tolist()))

In [15]:
# Example query with the default n; the title must match the 'title' column
# exactly, including the year suffix
similar_movies = get_similar_movies("Bug's Life, A (1998)")

In [16]:
# Show the (title, movieid) pairs returned for the example query
similar_movies

[('Finding Nemo (2003)', 6377),
 ('Wallace & Gromit in The Curse of the Were-Rabbit (2005)', 38038),
 ('101 Dalmatians (One Hundred and One Dalmatians) (1961)', 2085),
 ('Secret of NIMH, The (1982)', 2139),
 ('Up (2009)', 68954),
 ('Incredibles, The (2004)', 8961),
 ('Wallace & Gromit: A Close Shave (1995)', 745),
 ('Toy Story (1995)', 1),
 ('Toy Story 2 (1999)', 3114),
 ('Aladdin (1992)', 588)]