In [3]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Tunable settings for the text vectoriser and the dimensionality reducer.
TFIDF_MAX_FEATURES = 157262  # upper bound on the TF-IDF vocabulary size
SVD_COMPONENTS = 200         # number of components kept by the truncated SVD
SVD_SEED = 42                # fixed seed so the SVD result is repeatable

tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES, stop_words='english')
svd = TruncatedSVD(random_state=SVD_SEED, n_components=SVD_COMPONENTS)

# Cast: recommendations built from the 'tags' column of Cast.csv

In [4]:
# Input table; the 'tags' and 'title' columns are used further down.
cast_csv_path = "/content/drive/MyDrive/Cast.csv"
movies = pd.read_csv(cast_csv_path)

In [5]:
# Peek at the tag text of the first row to see what the vectoriser will be fed.
movies['tags'][0]

'dileeprao leonardodicaprio johnathangear action sciencefict adventur rescu mission dream airplan pari franc virtualr kidnap philosophi spi allegori manipul carcrash heist memori architectur losangel california dreamworld subconsci'

In [6]:
# Fit the TF-IDF vectoriser on the tag strings and encode every row with it.
# The trailing expression displays the shape of the resulting matrix.
vector = tfidf.fit_transform(movies['tags'])
vector.shape

(157262, 157262)

In [7]:
# Reduce the TF-IDF matrix with the truncated SVD and display the new shape.
# NOTE(review): `vector` is both the input and the output of this cell, so
# running it a second time would feed the already-reduced array back into the
# SVD. Re-run the TF-IDF cell first if this cell ever needs to be repeated.
vector = svd.fit_transform(vector)
vector.shape

(157262, 200)

In [8]:
# Pairwise cosine similarity between every pair of rows in `vector`.
# NOTE(review): the result is a dense square array with one row and one column
# per input row, so its memory footprint grows quadratically with the number
# of movies.
similarity = cosine_similarity(vector)

In [9]:
# Display the similarity matrix (NumPy abbreviates the printout of large arrays).
similarity

array([[ 1.00000000e+00,  1.62979237e-01,  4.00198543e-02, ...,
        -2.27057473e-03,  9.18487418e-04,  9.63913788e-03],
       [ 1.62979237e-01,  1.00000000e+00, -3.29547259e-03, ...,
         9.73836038e-03,  2.48462032e-02,  2.56762274e-03],
       [ 4.00198543e-02, -3.29547259e-03,  1.00000000e+00, ...,
        -1.50546383e-03,  1.89119424e-02,  2.04673189e-03],
       ...,
       [-2.27057473e-03,  9.73836038e-03, -1.50546383e-03, ...,
         1.00000000e+00, -1.52243734e-03,  3.70644725e-01],
       [ 9.18487418e-04,  2.48462032e-02,  1.89119424e-02, ...,
        -1.52243734e-03,  1.00000000e+00,  4.44367448e-03],
       [ 9.63913788e-03,  2.56762274e-03,  2.04673189e-03, ...,
         3.70644725e-01,  4.44367448e-03,  1.00000000e+00]])

In [10]:
# Check the dimensions of the similarity matrix.
similarity.shape

(157262, 157262)

In [11]:
# How many nearest neighbours to keep for each row of the similarity matrix.
top_n = 25
# Integer table with one row per similarity row; filled in by the next cell.
updated_similarity2 = np.zeros(shape=(len(similarity), top_n), dtype=int)

In [12]:
# For every row, rank all columns by descending similarity and keep the
# indices of the first `top_n` (negating the scores makes argsort descend).
for row_idx, scores in enumerate(similarity):
    updated_similarity2[row_idx] = np.argsort(-scores)[:top_n]

In [13]:
# Spot-check the stored neighbour indices for row 101.
updated_similarity2[101]

array([   101,   1403,   2938,   5016,  76390,   7837,   1801,  14692,
       151487,  68140, 136145, 123866,   2589,  46032,  73197,   2743,
        59928, 148679, 118846,   2501,   1381,  19545,  30169,   5330,
        84951])

In [14]:
# Check the dimensions of the top-n index table.
updated_similarity2.shape

(157262, 25)

In [15]:
def movie_recommend(movie, n=10, exclude_self=False):
    """Print the titles of the stored nearest neighbours of ``movie``.

    Reads the module-level ``movies`` DataFrame (column ``'title'``) and the
    ``updated_similarity2`` index table, whose row ``k`` holds the row
    positions of the entries most similar to the ``k``-th row of ``movies``,
    best match first.

    Args:
        movie: Exact title to look up in ``movies['title']``. If several rows
            share the title, the first one is used.
        n: Maximum number of titles to print (default 10). At most as many
            titles as ``updated_similarity2`` stores per row can be printed.
        exclude_self: When True, the looked-up row itself is dropped from the
            printed list. Defaults to False, which keeps the original output.

    Returns:
        None. The titles are printed one per line.

    Raises:
        IndexError: If no row of ``movies`` has the given title.
    """
    # Work with the row *position*, not the index label: `updated_similarity2`
    # is a plain NumPy array, so a label is only valid when the DataFrame
    # index happens to be 0..n-1.
    matches = np.flatnonzero((movies['title'] == movie).to_numpy())
    if matches.size == 0:
        # Same exception type the original lookup raised, but with a message
        # that names the missing title.
        raise IndexError(f"No movie titled {movie!r} found in movies['title']")
    movie_pos = matches[0]

    neighbours = updated_similarity2[movie_pos]
    if exclude_self:
        neighbours = neighbours[neighbours != movie_pos]

    for title in movies['title'].iloc[neighbours[:n]]:
        print(title)

In [16]:
# Example lookup: print the stored neighbours for one title.
movie_recommend('Around the World in 80 Days')

Around the World in 80 Days
A View to a Kill
Bean
Men in Black: International
Ghost Dance
Teheran '43
RED 2
Sherlock Holmes: A Game of Shadows
The Scorpio Letters
Inspector Clouseau


In [17]:
# Persist the compact top-n index table (not the full similarity matrix).
cast_pickle_path = '/content/drive/MyDrive/cast_similarity.pkl'
with open(cast_pickle_path, mode='wb') as pkl_out:
    pickle.dump(updated_similarity2, pkl_out)