In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

tfidf = TfidfVectorizer(stop_words='english')
svd = TruncatedSVD(n_components=200, random_state=42)

#Storyline

In [2]:
movies = pd.read_csv("/content/drive/MyDrive/Storyline.csv")

In [3]:
movies['tags'][0]

'cobb, a skill thief who commit corpor espionag by infiltr the subconsci of hi target is offer a chanc to regain hi old life as payment for a task consid to be impossible: "inception", the implant of anoth person\' idea into a target\' subconscious. action sciencefict adventur rescu mission dream airplan pari franc virtualr kidnap philosophi spi allegori manipul carcrash heist memori architectur losangel california dreamworld subconsci dileeprao leonardodicaprio johnathangear christophernolan'

In [4]:
vector = tfidf.fit_transform(movies['tags'])
vector.shape

(157262, 472546)

In [5]:
vector = svd.fit_transform(vector)
vector.shape

(157262, 200)

In [6]:
similarity = cosine_similarity(vector)

In [7]:
similarity

array([[1.        , 0.29058203, 0.20775534, ..., 0.10365966, 0.04093003,
        0.0453864 ],
       [0.29058203, 1.        , 0.03818294, ..., 0.02786701, 0.01161622,
        0.02183104],
       [0.20775534, 0.03818294, 1.        , ..., 0.01267307, 0.19186452,
        0.04525081],
       ...,
       [0.10365966, 0.02786701, 0.01267307, ..., 1.        , 0.00695493,
        0.25799405],
       [0.04093003, 0.01161622, 0.19186452, ..., 0.00695493, 1.        ,
        0.06711009],
       [0.0453864 , 0.02183104, 0.04525081, ..., 0.25799405, 0.06711009,
        1.        ]])

In [8]:
similarity.shape

(157262, 157262)

In [9]:
top_n = 25
updated_similarity = np.zeros((similarity.shape[0], top_n), dtype=int)
updated_similarity.shape

(157262, 25)

In [10]:
for i in range(similarity.shape[0]):
    sorted_indices = np.argsort(-similarity[i])[:top_n]
    updated_similarity[i] = sorted_indices

In [11]:
updated_similarity[101]

array([   101,  29492,     97,   9853,    875,  52568, 129088,   4210,
          335,  41074,  25253,   2668,   1169,  18329,  29838,    243,
        90661,  21574,   7275,   7866,  44060,    597,  64941, 141038,
        18650])

In [12]:
updated_similarity.shape

(157262, 25)

In [13]:
def movie_recommend(movie):
    movie_index = movies[movies['title'] == movie].index[0]
    recommended_indices = updated_similarity[movie_index][:10]

    for i in recommended_indices:
        print(movies.iloc[i].title)

In [14]:
movie_recommend('Around the World in 80 Days')

Around the World in 80 Days
A View to a Kill
Sherlock Holmes: A Game of Shadows
Bean
A Tale of Two Cities
Argoman the Fantastic Superman
The Beloved Rogue
The Four Musketeers
Gold for the Caesars
Black Venus


In [15]:
with open('/content/drive/MyDrive/storyline_similarity.pkl', 'wb') as file:
    pickle.dump(updated_similarity, file)