In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the movie dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('movies.csv')

In [3]:
# Preview the first 10 rows.
df.head(10)

Unnamed: 0,index,genres,title
0,0,Action Adventure Fantasy Science Fiction,Avatar
1,1,Adventure Fantasy Action,Pirates of the Caribbean: At World's End
2,2,Action Adventure Crime,Spectre
3,3,Action Crime Drama Thriller,The Dark Knight Rises
4,4,Action Adventure Science Fiction,John Carter
5,5,Fantasy Action Adventure,Spider-Man 3
6,6,Animation Family,Tangled
7,7,Action Adventure Science Fiction,Avengers: Age of Ultron
8,8,Adventure Fantasy Family,Harry Potter and the Half-Blood Prince
9,9,Action Adventure Fantasy,Batman v Superman: Dawn of Justice


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(4693, 3)

In [5]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4693 entries, 0 to 4692
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   index   4693 non-null   int64 
 1   genres  4666 non-null   object
 2   title   4693 non-null   object
dtypes: int64(1), object(2)
memory usage: 110.1+ KB


In [6]:
# Count missing values per column.
df.isnull().sum()

index      0
genres    27
title      0
dtype: int64

In [7]:
# Replace missing genre entries with an empty string so every row holds text.
df = df.fillna({'genres': ''})

In [8]:
# Re-count missing values to confirm the fill above left none.
df.isnull().sum()

index     0
genres    0
title     0
dtype: int64

In [9]:
## text to feature vector

from sklearn.feature_extraction.text import TfidfVectorizer

In [38]:
# TF-IDF vectorizer with default settings.
# NOTE(review): the fitted vocabulary shown further down lists 'science' and 'fiction'
# (and 'tv', 'movie') as separate features, i.e. multi-word genres such as
# "Science Fiction" are split into individual tokens -- confirm that is intended.
vectorizer = TfidfVectorizer()

In [39]:
# Fit the vectorizer on the genre strings and encode every row as a TF-IDF vector.
feature_vector = vectorizer.fit_transform(df['genres'])
# Printing the sparse result lists its stored (row, column) coordinates and values.
print(feature_vector)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 12343 stored elements and shape (4693, 22)>
  Coords	Values
  (0, 0)	0.3592031586687302
  (0, 1)	0.4130006715194316
  (0, 8)	0.5071491125814311
  (0, 17)	0.4707458516730511
  (0, 9)	0.4707458516730511
  (1, 0)	0.48138419365761775
  (1, 1)	0.5534806430329519
  (1, 8)	0.6796531732320795
  (2, 0)	0.5156631691245587
  (2, 1)	0.5928935478061559
  (2, 4)	0.6185214118995771
  (3, 0)	0.5089033268563234
  (3, 4)	0.6104131981773027
  (3, 6)	0.36295309828694733
  (3, 18)	0.48649581688790333
  (4, 0)	0.41677749925163493
  (4, 1)	0.4791978659181514
  (4, 17)	0.546198645831827
  (4, 9)	0.546198645831827
  (5, 0)	0.48138419365761775
  (5, 1)	0.5534806430329519
  (5, 8)	0.6796531732320795
  (6, 2)	0.7810998132540361
  (6, 7)	0.6244061832929827
  (7, 0)	0.41677749925163493
  :	:
  (4681, 12)	0.6716723210138477
  (4682, 6)	1.0
  (4683, 12)	1.0
  (4684, 6)	1.0
  (4685, 18)	0.5232506676246239
  (4685, 3)	0.4518041679821976
  (4685, 12)	0.722552

In [40]:
# Vocabulary learned by the fitted vectorizer, in column order of feature_vector.
# (Fixed the misspelled name `vectorizor`, which raises NameError on a fresh kernel.)
vectorizer.get_feature_names_out()

array(['action', 'adventure', 'animation', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'fiction', 'foreign',
       'history', 'horror', 'movie', 'music', 'mystery', 'romance',
       'science', 'thriller', 'tv', 'war', 'western'], dtype=object)

In [41]:
# (number of movies, number of vocabulary terms)
feature_vector.shape

(4693, 22)

## Cosine Similarity

In [42]:
## Compute pairwise similarity scores using cosine similarity

from sklearn.metrics.pairwise import cosine_similarity
# With a single argument, every row of feature_vector is compared against every row.
similarity = cosine_similarity(feature_vector)
print(similarity)

[[1.         0.7461881  0.43009327 ... 0.         0.         0.        ]
 [0.7461881  1.         0.5763872  ... 0.         0.         0.        ]
 [0.43009327 0.5763872  1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.30646855 0.        ]
 [0.         0.         0.         ... 0.30646855 1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [43]:
# Square matrix: one row and one column per movie.
similarity.shape

(4693, 4693)

## Creating a list of all movie titles in the dataset

In [44]:
# Plain Python list of every title, in row order.
list_titles = list(df['title'])

In [45]:
#print(list_titles)

In [46]:
# Number of titles in the list (replaces a manual counting loop).
su = len(list_titles)


In [47]:
# Show the count computed above.
su

4693

## Getting movie name from user

In [48]:
# Prompt for a free-text title; execution blocks here until the user enters a value.
movie_name = input('Enter your favourite movie name : ')

Enter your favourite movie name :  avatar


In [49]:
# Echo the value that was entered.
movie_name

'avatar'

In [50]:
# finding close match

import difflib


In [51]:
# Fuzzy-match the typed name against the known titles (difflib defaults spelled out).
find_close_match = difflib.get_close_matches(
    word=movie_name,
    possibilities=list_titles,
    n=3,
    cutoff=0.6,
)

print(find_close_match)

['Avatar']


In [52]:
# get_close_matches returns an empty list when no title clears its similarity cutoff;
# fail with a clear message instead of a bare IndexError on [0].
if not find_close_match:
    raise ValueError(f"No title similar to {movie_name!r} was found in the dataset")

# Candidates are ordered best-first, so take the top one.
close_match = find_close_match[0]
print(close_match)

Avatar


In [53]:
# Find the row position of the matched title.
#
# The similarity matrix was built from df['genres'] in frame order, so its rows are
# addressed by row *position*. The stored 'index' column only equals that position if
# the CSV ids form a gap-free 0..n-1 sequence, so derive the position directly.
title_mask = (df['title'] == close_match).to_numpy()
movie_index = np.flatnonzero(title_mask)[0]  # first row whose title matches
print(movie_index)

0


## getting list of similar movies

In [54]:
# Pair each movie's row position with its similarity to the selected movie.
selected_row = similarity[movie_index]
similarity_score = [(position, score) for position, score in enumerate(selected_row)]

In [55]:
# One (index, score) pair per movie.
len(similarity_score)

4693

In [56]:
# Order the (index, score) pairs from most to least similar.
# Work on a copy so similarity_score keeps its original row order.
sorted_similar_movies = list(similarity_score)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)

## Print the names of similar movies based on the index

In [57]:
# Print the nine highest-scoring titles.
#
# sorted_similar_movies is already ordered best-first, so slicing the head avoids
# walking every remaining entry and re-filtering the whole DataFrame per iteration.
# Note: the top entry is normally the selected movie itself (similarity 1.0).
print('Movies suggested for you : \n')
for i, (index, _score) in enumerate(sorted_similar_movies[:9], start=1):
    # `index` is a row position in df (and in the similarity matrix).
    title_from_index = df['title'].iloc[index]
    print(i, '.', title_from_index)

Movies suggested for you : 

1 . Avatar
2 . Superman Returns
3 . Man of Steel
4 . X-Men: Days of Future Past
5 . Jupiter Ascending
6 . The Wolverine
7 . Superman
8 . Superman II
9 . Beastmaster 2: Through the Portal of Time


## saving the models

In [58]:
import pickle

In [61]:
import pickle

# Persist each artefact to its own pickle file, in this order.
artefacts = (
    ('vectorizer.pkl', vectorizer),
    ('feature_vector.pkl', feature_vector),
    ('titles.pkl', list_titles),  # list of movie titles
)

for filename, payload in artefacts:
    with open(filename, 'wb') as f:
        pickle.dump(payload, f)
