In [35]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data collection and preprocessing

In [63]:
# Load the movie catalogue into a data frame.
# NOTE(review): the absolute '/content/...' path is environment-specific
# (presumably a Colab working directory) — confirm before running elsewhere.
movies_data = pd.read_csv(filepath_or_buffer='/content/Moviedataset.csv')

In [37]:
# Preview the first five rows to check the columns that were loaded
movies_data.head(n=5)

Unnamed: 0,Index,MovieName,Genre,Rating,Director,Actor,PeopleVote,Year,Hero_Rating,movie_rating,content_rating
0,0,Mouna Guru,Action,7.7,Santha Kumar,Arulnithi,746,2011,8,8,7.9
1,1,7 Aum Arivu,Action,6.2,A.R. Murugadoss,Suriya,9479,2011,9,9,8.066667
2,2,Vaagai Sooda Vaa,Comedy,8.0,A. Sarkunam,Vimal,14522,2011,8,7,7.666667
3,3,Mankatha,Action,7.6,Venkat Prabhu,Ajith Kumar,12276,2011,6,8,7.2
4,4,Kanchana: Muni 2,Comedy,6.5,Lawrence Raghavendra,Lawrence Raghavendra,1044,2011,8,9,7.833333


In [38]:
# Size of the data frame as a (number of rows, number of columns) tuple

movies_data.shape

(329, 11)

In [39]:
# Columns whose text is combined to build the profile of each movie
selected_features = [
  'Genre',
  'Director',
  'Actor',
]
print(selected_features)

['Genre', 'Director', 'Actor']


In [40]:
!pip install pandas



In [41]:
# Replace missing values in the selected feature columns with an empty string
# so that the string concatenation in the next step never meets a null.

movies_data[selected_features] = movies_data[selected_features].fillna('')

In [42]:
# Combine all the selected features (Genre, Director, Actor) into one
# space-separated string per movie.
# The columns are taken from `selected_features` instead of being spelled out
# again, so the combined text always matches the columns that were null-filled.

combined_features = movies_data[selected_features].agg(' '.join, axis=1)
print(combined_features)

0                         Action Santha Kumar Arulnithi
1                         Action A.R. Murugadoss Suriya
2                              Comedy A. Sarkunam Vimal
3                      Action Venkat Prabhu Ajith Kumar
4      Comedy Lawrence Raghavendra Lawrence Raghavendra
                             ...                       
324                            Comedy Rambala Santhanam
325                    Action Rajath Ravishankar Karthi
326               Comedy Sakthi Chidambaram Prabhu Deva
327                 Action Karthik Subbaraj Rajinikanth
328                             Action Siva Ajith Kumar
Length: 329, dtype: object


In [43]:
# Create a TF-IDF vectorizer (default settings); it is fitted on the combined
# text in the next cell to turn each movie's text into a numeric feature vector.

vectorizer = TfidfVectorizer()

In [44]:
# Fit the vectorizer on the combined text and transform it in one step.
# Printing the result lists its non-zero entries as (row, column) weight pairs.
feature_vectors = vectorizer.fit_transform(combined_features)
print(feature_vectors)

  (0, 61)	0.6048963111981892
  (0, 220)	0.3849430529534935
  (0, 391)	0.6644371767935963
  (0, 6)	0.21081398619688965
  (1, 456)	0.6539368885927855
  (1, 257)	0.7115544879644914
  (1, 6)	0.2570150898199896
  (2, 503)	0.6364819385908981
  (2, 400)	0.6991318256177811
  (2, 117)	0.32573828797966115
  (3, 21)	0.5290015842455659
  (3, 306)	0.4513102575204591
  (3, 491)	0.5816818901687874
  (3, 220)	0.37016989276360135
  (3, 6)	0.2027234679644886
  (4, 326)	0.6967005490826524
  (4, 227)	0.6967005490826524
  (4, 117)	0.1709289028100906
  (5, 501)	0.721658788862413
  (5, 499)	0.5138539510186731
  (5, 138)	0.4638563457366238
  (6, 434)	0.48564523033390394
  (6, 178)	0.547835543719954
  (6, 216)	0.48564523033390394
  (6, 322)	0.44926622909496783
  :	:
  (322, 306)	0.459043045341681
  (322, 117)	0.30279344963354965
  (323, 453)	0.6727785337570709
  (323, 434)	0.70170650656705
  (323, 6)	0.23447179608036497
  (324, 350)	0.6965966844387332
  (324, 392)	0.6503392045494953
  (324, 117)	0.303004914572

Cosine similarity

In [45]:
# Compute the cosine similarity between every pair of movie feature vectors.
# Row i of the result holds the similarity of movie i to every movie
# (the diagonal is each movie compared with itself).

similarity = cosine_similarity(feature_vectors)
print(similarity)

[[1.         0.05418238 0.         ... 0.         0.04460375 0.20380431]
 [0.05418238 1.         0.         ... 0.         0.05437892 0.05732733]
 [0.         0.         1.         ... 0.08090114 0.         0.        ]
 ...
 [0.         0.         0.08090114 ... 1.         0.         0.        ]
 [0.04460375 0.05437892 0.         ... 0.         1.         0.04719272]
 [0.20380431 0.05732733 0.         ... 0.         0.04719272 1.        ]]


In [46]:
# Sanity check: the matrix should be square, with one row and column per movie
print(similarity.shape)

(329, 329)


Getting input from user

In [25]:
# Ask the user for a movie name.
# The text does not have to be an exact title: it is fuzzy-matched against the
# dataset titles with difflib further down.

movie_name = input(' Enter your favourite movie name : ')

 Enter your favourite movie name : payanam


In [47]:
# Collect every movie title in the dataset, in row order, as a plain Python list

list_of_all_MovieName = movies_data.MovieName.to_list()
print(list_of_all_MovieName)

['Mouna Guru', '7 Aum Arivu', 'Vaagai Sooda Vaa', 'Mankatha', 'Kanchana: Muni 2', 'Deiva Thirumagal', 'Vaanam', 'Ko', 'Payanam', 'Yutham Sei', 'Aadukalam', 'Naduvula Konjam Pakkatha Kaanom', 'Thuppakki', 'Pizza', 'Naan', 'The Fly', 'Kalakalappu', 'Vazhakku Enn 18/9', 'Kumki', 'Neethaane En Ponvasantham', 'Thalaimuraigal', 'Biriyani', 'Ivan Veramaathiri', 'Kalyana Samayal Saadham', 'Irandam Ulagam', 'Pandiya Naadu', 'Arrambam', 'Onaayum Aattukkuttiyum', 'Raja Rani', '6 Candles', 'Moodar Koodam', 'Varuthapadatha Valibar Sangam', 'Thanga Meengal', 'Aadhalal Kadhal Seiveer', 'Maryan', 'Singam 2', 'Theeya Velai Seiyyanum Kumaru', 'Neram', 'Soodhu Kavvum', 'Ethir Neechal', 'Kedi Billa Killadi Ranga', 'Paradesi', 'Kaakkaa Muttai', 'Kayal', 'Pisasu', 'Lingaa', 'Kaaviya Thalaivan', 'Kaththi', 'Jeeva', 'Madras', 'Kathai Thiraikathai Vasanam Iyakkam', 'Jigarthanda', 'Velaiilla Pattadhari', 'Sathuranga Vettai', 'Saivam', 'Mundaasupatti', 'Kochadaiiyaan', 'Vaayai Moodi Pesavum', 'Cuckoo', 'Thegidi'

In [26]:
# Fuzzy-match the user's text against the known titles.
# n and cutoff are written out with difflib's default values: at most three
# candidates, best first, each with a similarity ratio of at least 0.6.

find_close_match = difflib.get_close_matches(
  word=movie_name,
  possibilities=list_of_all_MovieName,
  n=3,
  cutoff=0.6,
)
print(find_close_match)

['Payanam', 'Vaanam', 'Mupparimanam']


In [27]:
# Take the best candidate (difflib returns matches ordered best first).
# get_close_matches returns an empty list when no title is similar enough, so
# fail with a clear message instead of an opaque IndexError on [0].
if not find_close_match:
  raise ValueError(f'No movie title resembling {movie_name!r} was found in the dataset')

close_match = find_close_match[0]
print(close_match)

Payanam


In [50]:
# Find the row position of the matched title.
# The similarity matrix is ordered by row position, so the position is derived
# from the title list (built from the MovieName column in row order) rather than
# read from the CSV's 'Index' data column, which is not guaranteed to equal the
# row position. The first occurrence is used if a title appears more than once.

index_of_the_movie = list_of_all_MovieName.index(close_match)
print(index_of_the_movie)

8


In [51]:
# Pair every movie's row position with its similarity to the chosen movie,
# giving a list of (position, score) tuples in row order.

similarity_score = [
  (position, score)
  for position, score in enumerate(similarity[index_of_the_movie])
]
print(similarity_score)

[(0, 0.0357811938572575), (1, 0.043622849313710174), (2, 0.0), (3, 0.03440800033010293), (4, 0.0), (5, 0.0), (6, 0.2386528572350959), (7, 0.04237601385523519), (8, 1.0000000000000002), (9, 0.0), (10, 0.042826369602093455), (11, 0.0), (12, 0.040097847653922006), (13, 0.0), (14, 0.0), (15, 0.03591735392406042), (16, 0.0), (17, 0.0), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.03703910764573882), (22, 0.038834998021477385), (23, 0.0), (24, 0.03935657979358477), (25, 0.04405129013939925), (26, 0.03574971472114137), (27, 0.029137231990160925), (28, 0.0), (29, 0.0), (30, 0.0), (31, 0.0), (32, 0.0), (33, 0.0), (34, 0.0), (35, 0.04187318000704583), (36, 0.0), (37, 0.02568660316465744), (38, 0.0), (39, 0.0), (40, 0.0), (41, 0.0), (42, 0.0), (43, 0.0), (44, 0.0), (45, 0.03985762254745555), (46, 0.0), (47, 0.040097847653922006), (48, 0.0), (49, 0.03346594006504057), (50, 0.0), (51, 0.03554322900410081), (52, 0.039071551892230744), (53, 0.0), (54, 0.0), (55, 0.0), (56, 0.0), (57, 0.18338447472142924),

In [52]:
# There is one (index, score) pair per movie in the dataset
len(similarity_score)

329

In [53]:
# Rank the (position, score) pairs from most to least similar.
# The sort works on a copy so that similarity_score keeps its row order.

sorted_similar_movies = list(similarity_score)
sorted_similar_movies.sort(key = lambda pair: pair[1], reverse = True)
print(sorted_similar_movies)

[(8, 1.0000000000000002), (98, 0.5036968781522615), (222, 0.48305273768832047), (186, 0.4762967328031056), (215, 0.27165616069755094), (300, 0.2622709320421787), (121, 0.24716623686661962), (6, 0.2386528572350959), (180, 0.22626389929417284), (71, 0.213971051894083), (188, 0.1960954778637642), (57, 0.18338447472142924), (83, 0.04588843172028626), (174, 0.04583532387949245), (272, 0.044910144397444104), (25, 0.04405129013939925), (1, 0.043622849313710174), (221, 0.04325057082076474), (150, 0.0432315518001713), (233, 0.04319932509152142), (201, 0.04304991448161538), (10, 0.042826369602093455), (225, 0.042826369602093455), (271, 0.042826369602093455), (7, 0.04237601385523519), (245, 0.04237601385523519), (96, 0.04205484833613463), (35, 0.04187318000704583), (110, 0.04187318000704583), (303, 0.04186915989681643), (133, 0.04145193925640302), (63, 0.04097715967503352), (235, 0.04097715967503352), (297, 0.040680725202600757), (12, 0.040097847653922006), (47, 0.040097847653922006), (223, 0.040

In [56]:
# Print the names of the most similar movies, best match first.
# The first entry is the chosen movie itself (similarity 1.0 with itself).

print('Movies suggested for you : \n')

# Only the first 19 ranked entries are printed (the previous guard was i < 20),
# so slice the ranking up front instead of walking all of it and running a
# full boolean-mask scan of the data frame for every entry.
for i, movie in enumerate(sorted_similar_movies[:19], start=1):
  index = movie[0]
  # index is a row position in the similarity matrix, so look it up by position
  MovieName_from_index = movies_data['MovieName'].iloc[index]
  print(i, '.',MovieName_from_index)

Movies suggested for you : 

1 . Payanam
2 . Oopiri
3 . Kaatrin Mozhi
4 . Brindavanam
5 . Maari 2
6 . Ayogya
7 . Velaikkaran
8 . Vaanam
9 . Sathriyan
10 . Thani Oruvan
11 . Sangili Bungili Kadhava Thorae
12 . Vaayai Moodi Pesavum
13 . I
14 . Vanamagan
15 . Kaappaan
16 . Pandiya Naadu
17 . 7 Aum Arivu
18 . 2
19 . Thupparivaalan


Movie recommendation system

In [64]:
# End-to-end recommender: read a title, fuzzy-match it against the dataset,
# rank every movie by cosine similarity to the match, and print the top entries.
movie_name = input(' Enter your favourite movie name : ')

list_of_all_MovieName = movies_data['MovieName'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_MovieName)

# get_close_matches returns an empty list when no title is similar enough, so
# fail with a clear message instead of an opaque IndexError on [0].
if not find_close_match:
  raise ValueError(f'No movie title resembling {movie_name!r} was found in the dataset')

close_match = find_close_match[0]

# The similarity matrix is ordered by row position, so take the position of the
# title in the row-ordered title list rather than the CSV's 'Index' data column.
index_of_the_movie = list_of_all_MovieName.index(close_match)

similarity_score = list(enumerate(similarity[index_of_the_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True)

print('Movies suggested for you : \n')

# Only the first 29 ranked entries are printed (the previous guard was i < 30),
# so slice the ranking up front instead of walking all of it. The first entry
# is the matched movie itself.
for i, movie in enumerate(sorted_similar_movies[:29], start=1):
  index = movie[0]
  # index is a row position in the similarity matrix, so look it up by position
  MovieName_from_index = movies_data['MovieName'].iloc[index]
  print(i, '.',MovieName_from_index)

 Enter your favourite movie name : Mouna Guru
Movies suggested for you : 

1 . Mouna Guru
2 . Magamuni
3 . Brindavanam
4 . K-13
5 . Uriyadi
6 . Uriyadi 2
7 . Demonte Colony
8 . 24
9 . Sindhubaadh
10 . Veeram
11 . Vivegam
12 . Nerkonda Paarvai
13 . Viswasam
14 . Theri
15 . Mersal
16 . Bigil
17 . Gangs Of Madras
18 . Maayavan
19 . Arrambam
20 . Mankatha
21 . Raatchasan
22 . Goli Soda
23 . Watchman
24 . Pannaiyarum Padminiyum
25 . Yennai Arindhaal
26 . Raja Rani
27 . 96
28 . 100% Kadhal
29 . Mundaasupatti
