In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(stop_words='english', max_features=150000)

# Cast

In [3]:
movies = pd.read_csv("/content/drive/MyDrive/Director.csv")

In [4]:
movies.head()

Unnamed: 0,id,imdb_id,title,tags
0,27205,tt1375666,Inception,christophernolan action sciencefict adventur r...
1,157336,tt0816692,Interstellar,christophernolan adventur drama sciencefict re...
2,155,tt0468569,The Dark Knight,christophernolan drama action crime thriller j...
3,19995,tt0499549,Avatar,jamescameron action adventur fantasi sciencefi...
4,24428,tt0848228,The Avengers,josswhedon sciencefict action adventur newyork...


In [5]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114321 entries, 0 to 114320
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   id       114321 non-null  int64 
 1   imdb_id  114321 non-null  object
 2   title    114321 non-null  object
 3   tags     114321 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.5+ MB


In [6]:
movies['tags'][0]

'christophernolan action sciencefict adventur rescu mission dream airplan pari franc virtualr kidnap philosophi spi allegori manipul carcrash heist memori architectur losangel california dreamworld subconsci unitedkingdom unitedstatesofamerica legendarypictur syncopi warnerbros.pictur'

In [7]:
vector = tfidf.fit_transform(movies['tags'])
vector.shape

(114321, 150000)

In [8]:
similarity = cosine_similarity(vector)

In [9]:
similarity

array([[1.        , 0.20989835, 0.18787579, ..., 0.        , 0.01179515,
        0.00243231],
       [0.20989835, 1.        , 0.1306619 , ..., 0.        , 0.01082168,
        0.00172985],
       [0.18787579, 0.1306619 , 1.        , ..., 0.        , 0.01405497,
        0.00224669],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.01179515, 0.01082168, 0.01405497, ..., 0.        , 1.        ,
        0.        ],
       [0.00243231, 0.00172985, 0.00224669, ..., 0.        , 0.        ,
        1.        ]])

In [10]:
similarity.shape

(114321, 114321)

In [11]:
top_n = 25
updated_similarity = np.zeros((similarity.shape[0], top_n), dtype=int)
updated_similarity.shape

(114321, 25)

In [12]:
for i in range(similarity.shape[0]):
    sorted_indices = np.argsort(-similarity[i])[:top_n]
    updated_similarity[i] = sorted_indices

In [13]:
updated_similarity[1915]

array([  1915,  14196, 107168,  23245,  25948,  52894,  96833,  33896,
        86796,  31364,  26360,  55858,   4752,  38531,  82485,   4269,
        25909,  44993,  14177,  94650,  38908,  28779,   2144,  55807,
        58267])

In [14]:
updated_similarity.shape

(114321, 25)

In [15]:
def movie_recommend(movie):
    movie_index = movies[movies['title'] == movie].index[0]
    recommended_indices = updated_similarity[movie_index][:10]

    for i in recommended_indices:
        print(movies.iloc[i].title)

In [16]:
movie_recommend('Around the World in 80 Days')

Around the World in 80 Days
Five Weeks in a Balloon
Law of the Barbary Coast
Sherlock Holmes: A Game of Shadows
The Ridiculous 6
Murdered Innocence
A View to a Kill
Ghost Dance
Terminator 3: Rise of the Machines
A Perilous Journey


In [17]:
movie_recommend('Iron Man')

Iron Man
Iron Man 2
Black Panther
Iron Man 3
Guardians of the Galaxy Vol. 2
Guardians of the Galaxy
Doctor Strange in the Multiverse of Madness
The Avengers
Captain America: Civil War
Shang-Chi and the Legend of the Ten Rings


In [18]:
with open('/content/drive/MyDrive/director_similarity.pkl', 'wb') as file:
    pickle.dump(updated_similarity, file)