In [1]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files stored under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF vectorizer shared by the notebook: English stop words are dropped and
# the vocabulary is capped at 150,000 terms.
tfidf = TfidfVectorizer(
    max_features=150_000,
    stop_words='english',
)

# Storyline

In [3]:
# Load the storyline table from the mounted Drive into a DataFrame.
storyline_csv = "/content/drive/MyDrive/Storyline.csv"
movies = pd.read_csv(storyline_csv)

In [4]:
# Preview the first rows to check the columns (id, imdb_id, title, tags).
movies.head()

Unnamed: 0,id,imdb_id,title,tags
0,27205,tt1375666,Inception,"cobb, a skill thief who commit corpor espionag..."
1,157336,tt0816692,Interstellar,the adventur of a group of explor who make use...
2,155,tt0468569,The Dark Knight,batman rais the stake in hi war on crime. with...
3,19995,tt0499549,Avatar,"in the 22nd century, a parapleg marin is dispa..."
4,24428,tt0848228,The Avengers,when an unexpect enemi emerg and threaten glob...


In [5]:
# Summarize row count, dtypes and non-null counts per column.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114321 entries, 0 to 114320
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   id       114321 non-null  int64 
 1   imdb_id  114321 non-null  object
 2   title    114321 non-null  object
 3   tags     114321 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.5+ MB


In [6]:
# Show the full tag string of the row labelled 0 as a sample of the text
# that gets vectorized.
movies.loc[0, 'tags']

'cobb, a skill thief who commit corpor espionag by infiltr the subconsci of hi target is offer a chanc to regain hi old life as payment for a task consid to be impossible: "inception", the implant of anoth person\' idea into a target\' subconscious. action sciencefict adventur rescu mission dream airplan pari franc virtualr kidnap philosophi spi allegori manipul carcrash heist memori architectur losangel california dreamworld subconsci dileeprao leonardodicaprio johnathangear magnusnolan clairegear adamcol andrewpleavin earlcameron nicolasclerc ryanhayward'

In [7]:
# Fit the vectorizer on the tag strings and encode every row in the same pass;
# the result has one row per movie and one column per vocabulary term.
tag_text = movies['tags']
vector = tfidf.fit_transform(tag_text)
vector.shape

(114321, 150000)

In [8]:
# Pairwise cosine similarity between every pair of TF-IDF rows.
# NOTE(review): the result is a dense square array with one row and one column
# per movie (114321 x 114321 in the recorded run, roughly 100 GB if float64),
# so this cell needs a very large-memory runtime. Consider computing the
# similarities in row chunks and keeping only the top-N indices per row.
similarity = cosine_similarity(vector)

In [9]:
# Display the (truncated) similarity matrix; the diagonal is 1.0 because each
# row is compared with itself.
similarity

array([[1.00000000e+00, 3.30588207e-02, 5.08839985e-03, ...,
        0.00000000e+00, 0.00000000e+00, 1.23578136e-02],
       [3.30588207e-02, 1.00000000e+00, 8.58155918e-04, ...,
        0.00000000e+00, 1.06827830e-03, 0.00000000e+00],
       [5.08839985e-03, 8.58155918e-04, 1.00000000e+00, ...,
        0.00000000e+00, 5.50161105e-03, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.06827830e-03, 5.50161105e-03, ...,
        0.00000000e+00, 1.00000000e+00, 2.01262451e-02],
       [1.23578136e-02, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 2.01262451e-02, 1.00000000e+00]])

In [10]:
# Confirm the matrix is square: one row and one column per movie.
similarity.shape

(114321, 114321)

In [11]:
# Number of nearest neighbours kept for each movie.
top_n = 25

# Integer table with one row per movie and `top_n` slots, filled in later with
# the row indices of the most similar movies.
updated_similarity = np.zeros(shape=(len(similarity), top_n), dtype=int)
updated_similarity.shape

(114321, 25)

In [12]:
# For every movie, store the row indices of its `top_n` most similar movies,
# ordered from most to least similar.
#
# np.argpartition selects the `top_n` best scores without sorting the whole
# row, so only those `top_n` candidates are sorted afterwards. This avoids a
# full argsort of every row just to keep a handful of entries.
for i in range(similarity.shape[0]):
    # Negate so that the "smallest" values correspond to the highest similarity.
    neg_scores = -similarity[i]
    # Indices of the top_n best scores (unordered), then put into ascending
    # index order so that equal scores are later ranked by lowest index.
    candidates = np.sort(np.argpartition(neg_scores, top_n - 1)[:top_n])
    # Stable sort keeps that index order among exactly tied scores.
    order = np.argsort(neg_scores[candidates], kind='stable')
    updated_similarity[i] = candidates[order]

In [13]:
# Spot-check one row: the stored neighbour indices for the movie at row 1915.
updated_similarity[1915]

array([  1915,  60920,  42685,  38531,  67898,  69271, 114269, 110068,
        33896, 107168,  64964,  88892, 100082,  37368, 101446,  63911,
         2584,  40190,  53460,  26091,  99664,  51779,  34561,  90362,
        52894])

In [14]:
# Confirm the table still has one row per movie and `top_n` columns.
updated_similarity.shape

(114321, 25)

In [15]:
def movie_recommend(movie, n=10, include_self=True):
    """Print the titles of the movies most similar to ``movie``.

    Looks ``movie`` up by exact title in the global ``movies`` frame, reads the
    matching row of the global ``updated_similarity`` table (neighbour row
    indices, most similar first) and prints one title per line.

    Parameters
    ----------
    movie : str
        Exact title to look up. If several rows share the title, the first
        one is used.
    n : int, default 10
        Maximum number of titles to print (non-negative). It cannot exceed
        the number of neighbours stored per row in ``updated_similarity``.
    include_self : bool, default True
        When True (the original behaviour) the queried movie itself may be
        printed, typically as the first line because it is its own best
        match. Pass False to drop it from the list.

    Raises
    ------
    IndexError
        If no row of ``movies`` has the title ``movie``.
    """
    # Work with row positions, since `updated_similarity` is indexed by position.
    positions = np.flatnonzero((movies['title'] == movie).to_numpy())
    if positions.size == 0:
        raise IndexError(f"no movie titled {movie!r} found in `movies`")
    movie_index = positions[0]

    neighbours = updated_similarity[movie_index]
    if not include_self:
        neighbours = neighbours[neighbours != movie_index]

    for title in movies['title'].iloc[neighbours[:n]]:
        print(title)

In [16]:
# Example lookup by exact title; the queried movie itself is printed first.
movie_recommend('Around the World in 80 Days')

Around the World in 80 Days
wkw/tk/1996@7'55''hk.net
Around the World in Eighty Days
City Hunter
Man of Tai Chi
Kings of Hades
Adult Fun
Sherlock Holmes: A Game of Shadows
Is This Reasonable?
A View to a Kill


In [17]:
# Second example lookup by exact title.
movie_recommend('Iron Man')

Iron Man
Iron Man 2
Iron Man 3
Spider-Man: Homecoming
Guardians of the Galaxy Vol. 2
Spider-Man 4
Marvel One-Shot: Item 47
Team Thor
Avengers: Age of Ultron
Alice's Knaughty Knight


In [18]:
# Persist the top-N neighbour table to Drive as a pickle file.
output_path = '/content/drive/MyDrive/storyline_similarity.pkl'
with open(output_path, mode='wb') as out_file:
    pickle.dump(updated_similarity, out_file)