In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [43]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(stop_words='english', max_features=150000)

# Cast

In [44]:
movies = pd.read_csv("/content/drive/MyDrive/Cast.csv")

In [45]:
movies.head()

Unnamed: 0,id,imdb_id,title,tags
0,27205,tt1375666,Inception,dileeprao leonardodicaprio johnathangear magnu...
1,157336,tt0816692,Interstellar,floranolan billirwin annehathaway mackenziefoy...
2,155,tt0468569,The Dark Knight,williamfichtn maritzacabrera michaelcain willi...
3,19995,tt0499549,Avatar,chrismala jodielandau aliciavela-bailey julene...
4,24428,tt0848228,The Avengers,damionpoiti hankamo ashleyjohnson jerryleetuck...


In [46]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114321 entries, 0 to 114320
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   id       114321 non-null  int64 
 1   imdb_id  114321 non-null  object
 2   title    114321 non-null  object
 3   tags     114321 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.5+ MB


In [47]:
movies['tags'][0]

'dileeprao leonardodicaprio johnathangear magnusnolan clairegear adamcol andrewpleavin earlcameron nicolasclerc ryanhayward virgilebramli silvielaguna angelanathenson carlgilliard kraigthornb jasontendel tomhardi nicolepulliam marioncotillard jean-micheldagori jackgilroy cillianmurphi coraliededyker timkelleh shannonwel danielgirondeaud helenacullinan talulahriley petepostlethwait michaelcain tai-lile felixscott markfleischmann mirandanolan tombereng michaelgaston josephgordon-levitt lukashaa marcraducci jackmurray yujiokumoto elliotpag russfega natashabeaumont jillmaddrel shelleylang alexlombard peterbasham lisareynold taylorgear tohorumasamun kenwatanab'

In [48]:
vector = tfidf.fit_transform(movies['tags'])
vector.shape

(114321, 150000)

In [49]:
similarity = cosine_similarity(vector)

In [50]:
similarity

array([[1.        , 0.05090562, 0.02240296, ..., 0.        , 0.        ,
        0.        ],
       [0.05090562, 1.        , 0.02920194, ..., 0.        , 0.        ,
        0.        ],
       [0.02240296, 0.02920194, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [51]:
similarity.shape

(114321, 114321)

In [52]:
top_n = 25
updated_similarity = np.zeros((similarity.shape[0], top_n), dtype=int)
updated_similarity.shape

(114321, 25)

In [53]:
for i in range(similarity.shape[0]):
    sorted_indices = np.argsort(-similarity[i])[:top_n]
    updated_similarity[i] = sorted_indices

In [54]:
updated_similarity[1915]

array([  1915,  70208, 112436,  20870,  15381,  31362,  73941,  39697,
        36339,  26333,  32873,  84347,  87199,  70041,  54788, 100238,
        37892,  81770,  84648,  88036,  23056,  60104,  14360,    543,
        60786])

In [55]:
updated_similarity.shape

(114321, 25)

In [56]:
def movie_recommend(movie):
    movie_index = movies[movies['title'] == movie].index[0]
    recommended_indices = updated_similarity[movie_index][:10]

    for i in recommended_indices:
        print(movies.iloc[i].title)

In [57]:
movie_recommend('Around the World in 80 Days')

Around the World in 80 Days
Thunderbolt
Robin-B-Hood
wkw/tk/1996@7'55''hk.net
Dolphin Reef
Animals in Love
Dead or Alive: Final
The Legend of Drunken Master
Crime Story
Gorgeous


In [58]:
movie_recommend('Iron Man')

Iron Man
Supermoto
Respect Yourself: The Stax Records Story
Lillehammer ’94: 16 Days of Glory
Democracy on Trial
Terror in Europe
Inside the Uvalde Response
Atlanta’s Olympic Glory
Sydney 2000: Stories of Olympic Glory
The Choice 2016


In [59]:
with open('/content/drive/MyDrive/cast_similarity.pkl', 'wb') as file:
    pickle.dump(updated_similarity, file)