In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import difflib
from sklearn.metrics.pairwise import cosine_similarity
df = pd.read_csv("dataset.csv")
# imdb top 5000 dataset

In [3]:
# add index column
df.head
df = df.reset_index()

In [4]:
df.columns

Index(['index', 'Movie_Title', 'Year', 'Director', 'Actors', 'Rating',
       'Runtime(Mins)', 'Censor', 'Total_Gross', 'main_genre', 'side_genre'],
      dtype='object')

In [5]:
# combine data into all_features
all_features = df.apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

In [6]:
# convert data to feature vectors
vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(all_features)
print(feature_vectors)

  (0, 9494)	0.046398446414390185
  (0, 5814)	0.07500943436955142
  (0, 5754)	0.0647928944030109
  (0, 20229)	0.08359889872152851
  (0, 11262)	0.08356506328199677
  (0, 20151)	0.07493386316635334
  (0, 549)	0.17971388300257873
  (0, 5747)	0.2487938013366009
  (0, 13476)	0.36722147617936335
  (0, 13262)	0.2606265152634073
  (0, 11146)	0.2606265152634073
  (0, 18020)	0.2606265152634073
  (0, 18487)	0.4166749704123732
  (0, 17492)	0.5212530305268146
  (0, 1158)	0.14953644275995337
  (0, 12939)	0.2606265152634073
  (1, 8695)	0.09916665978212175
  (1, 5510)	0.23245351310061804
  (1, 4882)	0.3353001458663775
  (1, 595)	0.2569376658081933
  (1, 7785)	0.24473924108301046
  (1, 15038)	0.13694172550438705
  (1, 9679)	0.26946248328811273
  (1, 5675)	0.22089985857335268
  (1, 13736)	0.2747142864765611
  :	:
  (5560, 10278)	0.11855515690773293
  (5560, 18167)	0.11855515690773293
  (5561, 17923)	0.24240561839833571
  (5561, 13144)	0.24240561839833571
  (5561, 9874)	0.24240561839833571
  (5561, 13355)

In [7]:
# get similarity scores for all movies
similarity = cosine_similarity(feature_vectors)
similarity

array([[1.        , 0.01612664, 0.01345327, ..., 0.        , 0.        ,
        0.01299509],
       [0.01612664, 1.        , 0.03801125, ..., 0.        , 0.        ,
        0.        ],
       [0.01345327, 0.03801125, 1.        , ..., 0.02208956, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.02208956, ..., 1.        , 0.01586931,
        0.00655015],
       [0.        , 0.        , 0.        , ..., 0.01586931, 1.        ,
        0.0071276 ],
       [0.01299509, 0.        , 0.        , ..., 0.00655015, 0.0071276 ,
        1.        ]])

In [8]:
name = "wolf of wall street"
list_of_all_titles = df["Movie_Title"].tolist()
print(list_of_all_titles)

['Kantara', 'The Dark Knight', 'The Lord of the Rings: The Return of the King', 'Inception', 'The Lord of the Rings: The Two Towers', 'The Lord of the Rings: The Fellowship of the Ring', 'The Matrix', 'The Empire Strikes Back', 'Terminator 2: Judgment Day', 'Star Wars', 'Seppuku', 'Shichinin no samurai', 'Kaithi', 'Asuran', 'Sita Ramam', 'Gladiator', 'Léon', 'Vikram', 'Spider-Man: Into the Spider-Verse', 'Avengers: Endgame', 'Avengers: Infinity War', 'Top Gun: Maverick', 'The Dark Knight Rises', 'K.G.F: Chapter 2', 'Shershaah', 'Oldeuboi', 'Mononoke-hime', 'Aliens', 'Raiders of the Lost Ark', 'Vikram Vedha', 'Dangal', 'Spider-Man: No Way Home', 'Heat', 'Star Wars: Episode VI - Return of the Jedi', 'North by Northwest', 'Major', '1917', 'Uri: The Surgical Strike', 'K.G.F: Chapter 1', 'Dag II', 'Baahubali 2: The Conclusion', 'Gangs of Wasseypur', 'Paan Singh Tomar', 'Warrior', 'Kimetsu no Yaiba: Mugen Ressha-Hen', 'V for Vendetta', 'Batman Begins', 'Kill Bill: Vol. 1', 'Lock, Stock and T

In [9]:
find_close_match = difflib.get_close_matches(name, list_of_all_titles)
print(find_close_match)

['The Wolf of Wall Street', 'Wall Street']


In [10]:
fav_movie = find_close_match[0]
fav_movie

'The Wolf of Wall Street'

In [11]:
index = df[df.Movie_Title == fav_movie]["index"].values[0]

In [12]:
similar_movies =list(enumerate(similarity[index]))
print(similar_movies)

[(0, 0.0), (1, 0.01681266579865947), (2, 0.057442197874222126), (3, 0.10471690794521428), (4, 0.036266361981128925), (5, 0.0570184739733853), (6, 0.006459046161049181), (7, 0.006819839590482759), (8, 0.0), (9, 0.0), (10, 0.0), (11, 0.0), (12, 0.008588649564141378), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.009178950245565462), (17, 0.0), (18, 0.005597015721314334), (19, 0.0), (20, 0.0), (21, 0.0), (22, 0.007382509238660891), (23, 0.008172403227857531), (24, 0.01552345373285044), (25, 0.0), (26, 0.0), (27, 0.0), (28, 0.024083037945550917), (29, 0.007189192399698941), (30, 0.014616456574590474), (31, 0.0), (32, 0.009438129689553702), (33, 0.021131383717235077), (34, 0.0), (35, 0.014848140828159355), (36, 0.0), (37, 0.006221632891020261), (38, 0.007909858901466735), (39, 0.0), (40, 0.006744873735254983), (41, 0.02702928043519654), (42, 0.021474030807228317), (43, 0.0), (44, 0.0), (45, 0.0), (46, 0.008777832140393118), (47, 0.008589730652159148), (48, 0.051352766416920353), (49, 0.0), (50, 0

In [13]:
sorted_similar_movies = sorted(similar_movies, key = lambda x:x[1], reverse = True) 
sorted_similar_movies=sorted_similar_movies[:10]
print(sorted_similar_movies)

[(1563, 1.0000000000000002), (1675, 0.22823534074126353), (2263, 0.2205495999756602), (2417, 0.2091237815953202), (3785, 0.2054233861401273), (2996, 0.19907417296119512), (1734, 0.17302907740579476), (372, 0.15941530539853432), (479, 0.15834818356559346), (1577, 0.15047292367943765)]


In [14]:
df["main_genre"].unique()

array(['Action', 'Animation', 'Biography', 'Adventure', 'Western',
       'Drama', 'Crime', 'Comedy', 'Horror', 'Mystery', 'Film-Noir',
       'Fantasy', 'Musical'], dtype=object)