Importing the dependencies

In [None]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data Collection and Pre-Processing

In [None]:
# Read the movie table from imdb.csv; the file is decoded as Latin-1.
csv_path = '/content/imdb.csv'
movies_data = pd.read_csv(csv_path, encoding='latin1')

In [None]:
# Preview the first 5 rows of the dataframe (head() shows 5 rows by default)
# to check that the columns were read as expected.
movies_data.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [None]:
# number of rows and columns in the data frame, given as a (rows, columns) tuple

movies_data.shape

(15509, 10)

In [None]:
# Columns whose text is used to describe each movie for the recommender.

selected_features = [
    'Name',
    'Year',
    'Genre',
    'Actor 1',
    'Director',
]
print(selected_features)

['Name', 'Year', 'Genre', 'Actor 1', 'Director']


In [None]:
# Replace missing values in every selected column with an empty string so the
# columns can be concatenated as text; report any selected column that is absent.
for feature in selected_features:
    if feature not in movies_data.columns:
        print(f"Feature '{feature}' not found in the DataFrame.")
        continue
    movies_data[feature] = movies_data[feature].fillna('')

In [None]:
# Join the 5 selected text columns into one space-separated string per movie.
# The columns are appended left to right in the order listed here.

text_columns = ['Genre', 'Name', 'Actor 1', 'Director', 'Year']
combined_features = movies_data[text_columns[0]]
for column in text_columns[1:]:
    combined_features = combined_features + ' ' + movies_data[column]

In [None]:
# Inspect the combined text that is later passed to the vectorizer.
print(combined_features)

0                          Drama   Manmauji J.S. Randhawa 
1        Drama #Gadhvi (He thought he was Gandhi) Rasik...
2        Drama, Musical #Homecoming Sayani Gupta Soumya...
3        Comedy, Romance #Yaaram Prateik Ovais Khan (2019)
4        Drama ...And Once Again Rajat Kapoor Amol Pale...
                               ...                        
15504    Action Zulm Ko Jala Doonga Naseeruddin Shah Ma...
15505    Action, Drama Zulmi Akshay Kumar Kuku Kohli (1...
15506    Action Zulmi Raj Sangeeta Tiwari Kiran Thej (2...
15507                        Action Zulmi Shikari   (1988)
15508    Action, Drama Zulm-O-Sitam Dharmendra K.C. Bok...
Length: 15509, dtype: object


In [None]:
# converting the text data to feature vectors
# The TfidfVectorizer is created with its default settings here and fitted in the next cell.

vectorizer = TfidfVectorizer()

In [None]:
# Fit the vectorizer on the combined text and transform it in one step,
# producing one TF-IDF row per movie.
feature_vectors = vectorizer.fit_transform(combined_features)

In [None]:
# Printing the matrix lists its stored entries as "(row, column)  weight".
print(feature_vectors)

  (0, 12458)	0.6564916798292249
  (0, 9385)	0.7418239667385912
  (0, 4678)	0.13680598191316182
  (1, 137)	0.15392591514190357
  (1, 1881)	0.2178892731988315
  (1, 5452)	0.23136721424291792
  (1, 4717)	0.31746948900387834
  (1, 12546)	0.31746948900387834
  (1, 5364)	0.24568673676110428
  (1, 16529)	0.2944287966048434
  (1, 15523)	0.33094743004796473
  (1, 6211)	0.5786094424659074
  (1, 5308)	0.28095085556075694
  (1, 4678)	0.05854721216623898
  (2, 139)	0.23874781839805767
  (2, 9219)	0.33681832314881144
  (2, 14669)	0.5089539789781461
  (2, 5879)	0.2906876882198753
  (2, 13694)	0.43808681996620996
  (2, 6350)	0.4882266637008332
  (2, 10224)	0.21823665502602063
  (2, 4678)	0.09003797547473448
  (3, 8005)	0.22845070312929877
  (3, 10995)	0.5457438363639985
  (3, 11788)	0.48552333816846577
  :	:
  (15505, 8300)	0.389246240448399
  (15505, 116)	0.32238085139667444
  (15505, 795)	0.34759260862046976
  (15505, 8465)	0.20728813029669937
  (15505, 511)	0.14697773542061357
  (15505, 4678)	0.104

Cosine Similarity

In [None]:
# getting the similarity scores using cosine similarity
# Every movie is compared with every other movie, so the result is a square
# matrix with one row and one column per movie.
# NOTE(review): presumably a dense float array, which at 15509 x 15509 would need
# roughly 2 GB of memory -- confirm before running on a small machine.

similarity = cosine_similarity(feature_vectors)

In [None]:
# Show the pairwise similarity matrix.
print(similarity)

In [None]:
# The matrix has one row and one column per movie, so both dimensions equal the row count of the data.
print(similarity.shape)

(15509, 15509)


Getting the movie name from the user

In [None]:
# getting the movie name from the user
# The text does not need to be an exact title; it is fuzzy-matched against the dataset titles in a later cell.

movie_name = input(' Enter your favourite movie name : ')

 Enter your favourite movie name : Homecoming


In [None]:
# creating a list with all the movie names given in the dataset

list_of_all_titles = movies_data['Name'].tolist()

# Show only a short preview: printing the whole list would flood the notebook
# output with every title in the dataset.
print(list_of_all_titles[:10])



In [None]:
# finding the close match for the movie name given by the user
# get_close_matches returns the best candidates first, and an empty list when no title is similar enough.

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)
print(find_close_match)

['Homecoming', '#Homecoming', "Mom's Coming"]


In [None]:
# get_close_matches returns an empty list when no title is similar enough, and
# indexing it would then fail with a bare "list index out of range"; stop with
# an actionable message instead.
if not find_close_match:
    raise ValueError(f"No title close to {movie_name!r} was found; try a different name.")

# Candidates are ordered best match first, so take the leading one.
close_match = find_close_match[0]
print(close_match)

Homecoming


In [None]:
# Locate the row whose title equals the matched name; if several rows share the
# title, the first one is used.
title_matches = movies_data['Name'] == close_match
index_of_the_movie = movies_data[title_matches].index.values[0]
print(index_of_the_movie)

5757


In [None]:
# Pair every movie's position with its similarity to the chosen movie.

movie_scores = similarity[index_of_the_movie]
similarity_score = [(position, score) for position, score in enumerate(movie_scores)]
print(similarity_score)

[(0, 0.013552468641245104), (1, 0.005799887152733784), (2, 0.3235799969627677), (3, 0.0), (4, 0.00892345317436489), (5, 0.06110570025457501), (6, 0.008459672801099369), (7, 0.0), (8, 0.0), (9, 0.0), (10, 0.007334496367380749), (11, 0.06357309991288299), (12, 0.0), (13, 0.00873167118885427), (14, 0.0), (15, 0.008491337291717846), (16, 0.0), (17, 0.008230480599750262), (18, 0.007805710598466366), (19, 0.0), (20, 0.007626739438828087), (21, 0.008614764766643359), (22, 0.0), (23, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.0067412087335743985), (28, 0.007016844023840419), (29, 0.0), (30, 0.0078068944714221105), (31, 0.0), (32, 0.00825597145099702), (33, 0.0975322833636081), (34, 0.0), (35, 0.009373295274642176), (36, 0.008534048720329817), (37, 0.009158449353894569), (38, 0.0), (39, 0.06678611106416164), (40, 0.008121364785421015), (41, 0.0), (42, 0.00927320619282342), (43, 0.010028718725156227), (44, 0.0), (45, 0.0), (46, 0.009071510694918817), (47, 0.0), (48, 0.013602174091641259), (49

In [None]:
# One (index, score) pair is expected for every movie in the dataset.
len(similarity_score)

15509

In [None]:
# Rank the (index, score) pairs from most to least similar.
# The original list is left untouched; a copy is sorted in place.

sorted_similar_movies = list(similarity_score)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_similar_movies)

[(5757, 1.0), (2, 0.3235799969627677), (1287, 0.2410183319646349), (9722, 0.22629376812792779), (11586, 0.22588988629803278), (1434, 0.2029347892224344), (2257, 0.19944081482650308), (10160, 0.1966241330942393), (6729, 0.19265708423472705), (7990, 0.1862403858073958), (1288, 0.18537972109527845), (849, 0.18522232578279607), (7290, 0.18188001529662304), (2654, 0.1812202513972889), (1688, 0.17881657821408614), (5097, 0.17648788503296536), (4143, 0.17632968682224825), (6737, 0.174472681641053), (5257, 0.17409154709272068), (11751, 0.17272469044796598), (257, 0.1718500962042765), (6491, 0.17163308710582384), (8123, 0.17163308710582384), (5301, 0.17052635427510332), (15027, 0.17037724073376703), (5615, 0.16917464968601664), (6341, 0.16798180529676593), (1543, 0.16716598402420052), (4523, 0.16662028508962642), (9363, 0.16605956431955532), (14642, 0.1660316999104648), (822, 0.16579167387969543), (9985, 0.16536322406085577), (7992, 0.1653621631514598), (7212, 0.1651458937995848), (1802, 0.1651

In [None]:
# Number of titles to list (ranks 1 through 29).
max_suggestions = 29

print('Movies suggested for you:\n')

# Only the leading entries are ever printed, so walk just that slice of the
# ranked list instead of every movie, and read each title with a direct label
# lookup rather than a boolean-mask scan of the whole frame per entry.
for i, movie in enumerate(sorted_similar_movies[:max_suggestions], start=1):
    index = movie[0]
    title_from_index = movies_data.loc[index, 'Name']  # 'Name' holds the movie title
    print(i, '.', title_from_index)


Movies suggested for you:

1 . Homecoming
2 . #Homecoming
3 . Aryan
4 . Music Meri Jaan
5 . Rajkumar
6 . Awesome Mausam
7 . Bhookailas
8 . Netua
9 . Jhumroo
10 . Lab Kush
11 . Aryan: Unbreakable
12 . ALIF
13 . Kanpuriye
14 . Campus
15 . Bahadur
16 . Ghutan
17 . Dosti
18 . Jigar
19 . Gulfam
20 . Rangila
21 . Aaghaaz
22 . Jalan
23 . Lalach
24 . Gunehgaar
25 . Web Girls
26 . Hawayein
27 . Jab Tum Kaho
28 . Babooji
29 . Ek Tha Hero


Movie Recommendation System

In [None]:
# End-to-end recommender: fuzzy-match the typed title, rank every movie by its
# cosine similarity to the match, and list the top-ranked titles.
movie_name = input('Enter your favorite movie name: ')

list_of_all_titles = movies_data['Name'].tolist()

# difflib returns the closest titles first, and an empty list when nothing is
# similar enough to the input.
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

if find_close_match:
    close_match = find_close_match[0]
    # If several rows share the matched title, the first one is used.
    index_of_the_movie = movies_data[movies_data['Name'] == close_match].index.values[0]

    similarity_score = list(enumerate(similarity[index_of_the_movie]))
    sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)

    print('Movies suggested for you:\n')

    # Only ranks 1 through 29 are printed, so iterate over just that slice
    # rather than looking up a title for every movie in the dataset.
    for i, movie in enumerate(sorted_similar_movies[:29], start=1):
        index = movie[0]
        title_from_index = movies_data.loc[index, 'Name']
        print(i, '.', title_from_index)
else:
    print('No similar movie found.')

Enter your favorite movie name: Babooji
Movies suggested for you:

1 . Babooji
2 . Bahadur
3 . Bhole Bhale
4 . Dosti
5 . Jigar
6 . Rangila
7 . Jalan
8 . Lalach
9 . Labela
10 . Bhala Aadmi
11 . Passing Show
12 . Kar Bhala
13 . Halla Gulla
14 . Bebus
15 . Bakhshish
16 . Bhedi Bungla
17 . Bachke Rahna
18 . Shola Jo Bhadke
19 . Sachche Ka Bol Bala
20 . Badla
21 . Matwale
22 . Matlabi
23 . Achchhaji
24 . Bahadur Pratap
25 . Matlahi
26 . Nagma-E-Sahra
27 . Chhabila
28 . Beti
29 . Honey
