In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.stem.porter import PorterStemmer

In [11]:
movies_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DataSets/movies_data.csv')

# Feature selection: the text columns that describe each movie.
#
# Missing values are replaced with '' BEFORE the string conversion. Converting
# first (str(NaN) -> 'nan') would inject the literal word 'nan' into the tags
# as a bogus token shared by unrelated movies, and it would also leave nothing
# for a later dropna() to remove. Filling (rather than dropping) keeps
# features_df row-for-row aligned with movies_df, which the positional lookups
# into the similarity matrix further down depend on.
# Column-wise .str.lower() replaces DataFrame.applymap, which is deprecated in
# recent pandas releases.
features_df = (
    movies_df[['originalTitle', 'genres', 'crew', 'overview']]
    .fillna('')
    .astype(str)
    .apply(lambda column: column.str.lower())
)

# Genres are comma-separated: turn the commas into spaces so each genre is a token.
features_df['genres'] = features_df['genres'].str.replace(',', ' ', regex=False)

# Crew is comma-separated names: squeeze the spaces out of every name so it
# becomes a single token ("john smith" -> "johnsmith"), then separate the
# names with spaces. Removing all spaces first and replacing commas afterwards
# is equivalent to splitting on ',' and stripping spaces per name.
features_df['crew'] = (
    features_df['crew']
    .str.replace(' ', '', regex=False)
    .str.replace(',', ' ', regex=False)
)

# One free-text "tag" per movie: title + overview + genres + crew.
features_df['tag'] = (
    features_df['originalTitle'] + ' ' +
    features_df['overview'] + ' ' +
    features_df['genres'] + ' ' +
    features_df['crew']
)

# Row i of features_df must describe row i of movies_df; fail loudly otherwise.
assert len(features_df) == len(movies_df), "features_df is no longer aligned with movies_df"

In [12]:
# Single shared stemmer instance, reused for every call below.
ps = PorterStemmer()


def textToWordRoots(text):
    """Stem every whitespace-separated word of `text` with the Porter stemmer.

    Args:
        text: String to process; it is split on runs of whitespace.

    Returns:
        The stemmed words joined by single spaces (an empty string when
        `text` contains no words).
    """
    return ' '.join(ps.stem(token) for token in text.split())

In [13]:
# Replace each tag with its stemmed version (every word passed through textToWordRoots).
# Note: this overwrites 'tag' in place, so re-running this cell stems text
# that has already been stemmed.
features_df['tag'] = features_df['tag'].apply(textToWordRoots)

In [26]:
# Bag-of-words term counts over the tags, limited to 7000 vocabulary entries
# with scikit-learn's built-in English stop-word list.
# NOTE(review): the stop-word filter sees tokens that were already stemmed, so
# stop words altered by the stemmer (e.g. 'this' -> 'thi') are presumably not
# removed -- confirm whether that matters for the results.
# NOTE(review): .toarray() builds a dense (n_movies x vocabulary) array;
# cosine_similarity is documented to accept sparse input, so the dense copy may
# be avoidable if nothing else needs it -- confirm before changing.
cv = CountVectorizer(stop_words='english', max_features=7000)
features_matrix = cv.fit_transform(raw_documents=features_df['tag']).toarray()

In [27]:
# Pairwise cosine similarity between the rows of features_matrix: entry [i, j]
# is the score between row i and row j, so the result has one row and one
# column per row of features_matrix.
similarity_matrix = cosine_similarity(features_matrix)

In [28]:
# Persist the similarity matrix in NumPy's .npy format. The path is relative,
# so the file is written to the current working directory.
np.save('similarity_matrix.npy', similarity_matrix)

In [29]:
def recommend(movie_index, similarity_matrix, top_n=10):
    """Return the positions of the movies most similar to `movie_index`.

    Args:
        movie_index: Row position of the query movie in `similarity_matrix`.
        similarity_matrix: Square matrix (NumPy array or nested sequence) where
            entry [i][j] is the similarity between movies i and j.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of at most `top_n` plain ints, ordered from most to least
        similar; ties keep ascending index order. The query movie itself is
        never included. An empty list is returned when `top_n <= 0`.

    Raises:
        IndexError: If `movie_index` is outside the matrix.
    """
    scores = np.asarray(similarity_matrix[movie_index], dtype=float)
    if top_n <= 0 or scores.size == 0:
        return []

    # Normalise negative positions (e.g. -1) so the self-exclusion below works.
    own_position = int(movie_index) % scores.size

    # A stable sort on the negated scores gives "descending score, ties by
    # ascending index" -- the same order as sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly instead of assuming it sorts first: with
    # tied scores (duplicate entries, or an all-zero row whose self-similarity
    # is 0) it can land anywhere in the ranking.
    return [int(i) for i in ranked if i != own_position][:top_n]

In [30]:
def get_movie_index(title, movies_df):
    """Find the row position of a movie by (case-insensitive) title.

    An exact title match is preferred; if there is none, the first title that
    contains `title` as a literal substring is used. The title is matched as
    plain text, not as a regular expression, so titles with characters such as
    '+', '(' or '?' are handled correctly.

    Args:
        title: Title, or part of a title, to look for.
        movies_df: DataFrame with an 'originalTitle' column.

    Returns:
        The 0-based row position (a plain int) of the matching movie, suitable
        for `.iloc` and for indexing the similarity matrix. For a default
        RangeIndex this equals the index label.

    Raises:
        IndexError: If no title matches.
    """
    needle = str(title)
    titles = movies_df['originalTitle']

    exact_match = titles.str.casefold().eq(needle.casefold())
    substring_match = titles.str.contains(needle, case=False, na=False, regex=False)

    for mask in (exact_match, substring_match):
        # Missing titles never match; positions are independent of index labels.
        positions = np.flatnonzero(mask.to_numpy(dtype=bool, na_value=False))
        if positions.size:
            return int(positions[0])

    raise IndexError("no movie title matches {!r}".format(needle))

In [33]:
# Demo: look up one movie by title and list the titles of its nearest neighbours.
movie_title = "shrek"

movie_index = get_movie_index(title=movie_title, movies_df=movies_df)
print(movie_index)

recommended_indices = recommend(movie_index, similarity_matrix, top_n=10)
# The recommendations are row positions, hence .iloc rather than .loc.
recommended_movies = movies_df['originalTitle'].iloc[recommended_indices].tolist()

print("Recommended movies for '{}':".format(movie_title))
print(recommended_movies)

18
Recommended movies for 'shrek':
['Shrek 2', 'Shrek the Third', 'Shrek Forever After', 'The Addams Family 2', 'Shanghai Noon', 'The Princess and the Frog', 'Wish Dragon', 'Puss in Boots', 'Gong fu yu jia', 'Trolls']
