In [1]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
movies_data = pd.read_csv('movie_metadata.csv')

In [3]:
movies_data.head() 

Unnamed: 0,index,director,num_critic_for_reviews,duration,director_facebook_likes,cast,gross,genres,actor_1_name,movie_title,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,1,James Cameron,723.0,178.0,0.0,Joel David Moore,760505847.0,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,2,Gore Verbinski,302.0,169.0,563.0,Orlando Bloom,309404152.0,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,3,Sam Mendes,602.0,148.0,0.0,Rory Kinnear,200074175.0,Action|Adventure|Thriller,Christoph Waltz,Spectre,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,4,Christopher Nolan,813.0,164.0,22000.0,Christian Bale,448130642.0,Action|Thriller,Tom Hardy,The Dark Knight Rises,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,5,Doug Walker,,,131.0,Rob Walker,,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens ...,...,,,,,,,12.0,7.1,,0


In [4]:
movies_data.shape

(5043, 26)

# relevant features for recommendation

In [5]:

selected_features = ['genres','keywords','cast','director'] 
print(selected_features)

['genres', 'keywords', 'cast', 'director']


# Replace null values to null strings

In [6]:
for feature in selected_features: movies_data[feature] = movies_data[feature].fillna('')

In [7]:
combined_features = movies_data['genres']+' '+movies_data['keywords']+' '+movies_data['cast']+' '+movies_data['director'] 
print(combined_features)

0       Action|Adventure|Fantasy|Sci-Fi avatar|future|...
1       Action|Adventure|Fantasy goddess|marriage cere...
2       Action|Adventure|Thriller bomb|espionage|seque...
3       Action|Thriller deception|imprisonment|lawless...
4                     Documentary  Rob Walker Doug Walker
                              ...                        
5038    Comedy|Drama fraud|postal worker|prison|theft|...
5039    Crime|Drama|Mystery|Thriller cult|fbi|hideout|...
5040    Drama|Horror|Thriller  Maxwell Moody Benjamin ...
5041      Comedy|Drama|Romance  Daniel Henney Daniel Hsia
5042    Documentary actress name in title|crush|date|f...
Length: 5043, dtype: object


# Converting text data to feature vector

In [8]:
vectorizer = TfidfVectorizer() 
feature_vectors = vectorizer.fit_transform(combined_features) 
print(feature_vectors)

  (0, 1658)	0.28662157259541055
  (0, 5324)	0.20652014530768595
  (0, 7099)	0.28177437748512607
  (0, 2708)	0.19247081734803154
  (0, 5447)	0.2721260160856435
  (0, 7839)	0.4063604011129682
  (0, 7329)	0.3487260349813786
  (0, 6556)	0.3202466392103297
  (0, 4058)	0.25646905415694676
  (0, 772)	0.3744716085862767
  (0, 3761)	0.14266793606464317
  (0, 9245)	0.14266793606464317
  (0, 3660)	0.14274256025430299
  (0, 215)	0.12408881243291142
  (0, 169)	0.11366386200718732
  (1, 10907)	0.29123933491304677
  (1, 4355)	0.27538125664856994
  (1, 1247)	0.29123933491304677
  (1, 7692)	0.29123933491304677
  (1, 9587)	0.3296003843530864
  (1, 8094)	0.25740856430435805
  (1, 8351)	0.29123933491304677
  (1, 1876)	0.3454584626175632
  (1, 6580)	0.39948697589011606
  (1, 4316)	0.30962156852112016
  :	:
  (5040, 1062)	0.4409050412395956
  (5040, 4951)	0.1886178186983105
  (5040, 3133)	0.09856105866735869
  (5040, 10448)	0.13455287991128248
  (5041, 4987)	0.510349792691137
  (5041, 4765)	0.48692246978396

# Getting the similarity scores 

In [9]:
similarity = cosine_similarity(feature_vectors) 
print(similarity) 

[[1.         0.04139525 0.02994083 ... 0.         0.         0.        ]
 [0.04139525 1.         0.02545354 ... 0.         0.         0.        ]
 [0.02994083 0.02545354 1.         ... 0.01487357 0.         0.        ]
 ...
 [0.         0.         0.01487357 ... 1.         0.00947955 0.        ]
 [0.         0.         0.         ... 0.00947955 1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [10]:
print(similarity.shape)

(5043, 5043)


# Input from user 

In [11]:
movie_name = input(' Enter your favourite movie name : ')

 Enter your favourite movie name : Avatar


# creating a list with all the movie names given in the dataset

In [12]:
list_of_all_titles = movies_data['movie_title'].tolist() 
print(list_of_all_titles)

['Avatar\xa0', "Pirates of the Caribbean: At World's End\xa0", 'Spectre\xa0', 'The Dark Knight Rises\xa0', 'Star Wars: Episode VII - The Force Awakens\xa0            ', 'John Carter\xa0', 'Spider-Man 3\xa0', 'Tangled\xa0', 'Avengers: Age of Ultron\xa0', 'Harry Potter and the Half-Blood Prince\xa0', 'Batman v Superman: Dawn of Justice\xa0', 'Superman Returns\xa0', 'Quantum of Solace\xa0', "Pirates of the Caribbean: Dead Man's Chest\xa0", 'The Lone Ranger\xa0', 'Man of Steel\xa0', 'The Chronicles of Narnia: Prince Caspian\xa0', 'The Avengers\xa0', 'Pirates of the Caribbean: On Stranger Tides\xa0', 'Men in Black 3\xa0', 'The Hobbit: The Battle of the Five Armies\xa0', 'The Amazing Spider-Man\xa0', 'Robin Hood\xa0', 'The Hobbit: The Desolation of Smaug\xa0', 'The Golden Compass\xa0', 'King Kong\xa0', 'Titanic\xa0', 'Captain America: Civil War\xa0', 'Battleship\xa0', 'Jurassic World\xa0', 'Skyfall\xa0', 'Spider-Man 2\xa0', 'Iron Man 3\xa0', 'Alice in Wonderland\xa0', 'X-Men: The Last Stand\


# finding the close match for the movie name given by the user

In [13]:
find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles) 
print(find_close_match) 


['Avatar\xa0']


In [14]:
close_match = find_close_match[0] 
print(close_match)

Avatar 


# finding the index of the movie with the movie_title

In [15]:
index_of_the_movie = movies_data[movies_data.movie_title == close_match]['index'].values[0] 
print(index_of_the_movie)

1


# Getting a list of similar movies

In [16]:
similarity_score = list(enumerate(similarity[index_of_the_movie])) 
print(similarity_score) 


[(0, 0.041395249177780175), (1, 1.0000000000000002), (2, 0.02545354428131781), (3, 0.0104327762357404), (4, 0.0), (5, 0.02285673806189243), (6, 0.02351722715697118), (7, 0.024640432898137256), (8, 0.01879800925071464), (9, 0.03378820822405205), (10, 0.020646744112807584), (11, 0.01934432845190085), (12, 0.028582325457095548), (13, 0.3648171595229585), (14, 0.19833593269686312), (15, 0.0322653915374065), (16, 0.029851310348233785), (17, 0.022844312997665997), (18, 0.12858344487575263), (19, 0.039100391324152946), (20, 0.03076365056280542), (21, 0.03705144524768657), (22, 0.020876780704964265), (23, 0.029295236413457902), (24, 0.033621734470171395), (25, 0.021949774018503194), (26, 0.0), (27, 0.01843629650934723), (28, 0.019897244008894945), (29, 0.01959160494578306), (30, 0.020003860098119924), (31, 0.04383429412344472), (32, 0.02307339005385038), (33, 0.02231269531855817), (34, 0.03563420458163168), (35, 0.02860983928290875), (36, 0.023931404196586632), (37, 0.018739197515487208), (38,

In [17]:
sorted_similar_movies = sorted(similarity_score, key = lambda x:x[1], reverse = True) 
print(sorted_similar_movies) 


[(1, 1.0000000000000002), (205, 0.4833363243854699), (13, 0.3648171595229585), (270, 0.23030373588946532), (1219, 0.21375031130671882), (340, 0.21182585420203237), (3926, 0.21177942128673038), (147, 0.20345527103456557), (14, 0.19833593269686312), (275, 0.19439366754657197), (2574, 0.1921035138381978), (181, 0.181402096695725), (4982, 0.176890123398813), (2278, 0.1664615982293379), (4490, 0.1641803165389802), (2445, 0.15689390357738206), (2708, 0.15166774647068065), (1618, 0.15096461283738927), (1740, 0.14842630427640133), (4898, 0.14620715001970505), (350, 0.1436818816027221), (1427, 0.13519697173629505), (18, 0.12858344487575263), (1037, 0.12791022868622723), (792, 0.1273825117064222), (1517, 0.12647523115163048), (2782, 0.1241548670601849), (479, 0.12171618105562157), (552, 0.11783440022960075), (610, 0.11755341014311298), (635, 0.11054243476072895), (586, 0.10825038615406074), (1642, 0.1080876270282998), (1932, 0.10765714723473471), (3003, 0.10711733332261533), (2651, 0.10639970576

In [18]:
print('Movies suggested for you : \n')

Movies suggested for you : 



In [96]:
i = 1

# Assuming 'sorted_similar_movies' is defined somewhere before this loop
for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = movies_data[movies_data.index == index]['movie_title'].values[0]

    if i < 30:
        print(f"{i}. {title_from_index}")
        i += 1

movie_name = input('Enter your favourite movie name: ')

list_of_all_titles = movies_data['movie_title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_titles)

if not find_close_match:
    print(f"No close match found for '{movie_name}'. Please try again.")
else:
    close_match = find_close_match[0]

    index_of_the_movie = movies_data[movies_data.movie_title == close_match]['index'].values[0]

    similarity_score = list(enumerate(similarity[index_of_the_movie]))

    sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)


1. The Longest Ride 
2. Soul Food 
3. Whiplash 
4. Men of Honor 
5. The Monuments Men 
6. Faster 
7. Safe Haven 
8. George Washington 
9. Trouble with the Curve 
10. The Trials of Darryl Hunt 
11. The Campaign 
12. I'm Not There. 
13. Melancholia 
14. Mona Lisa Smile 
15. Toy Story 2 
16. Dear John 
17. High Plains Drifter 
18. The Doombolt Chase             
19. Love and Other Catastrophes 
20. The Prince of Tides 
21. Arthur             
22. Diary of the Dead 
23. Thir13en Ghosts 
24. Zookeeper 
25. All the Pretty Horses 
26. Christmas Mail 
27. Star Trek Beyond 
28. Vicky Cristina Barcelona 
29. Elf 
Enter your favourite movie name: 
No close match found for ''. Please try again.


In [19]:
i = 1

for movie in sorted_similar_movies:
    index = movie[0]
    title_from_index = movies_data.loc[movies_data.index == index, 'movie_title'].values[0]

    if i <= 30:
        print(f"{i}. {title_from_index}")
        i += 1

1. Pirates of the Caribbean: At World's End 
2. Pirates of the Caribbean: The Curse of the Black Pearl 
3. Pirates of the Caribbean: Dead Man's Chest 
4. The Lord of the Rings: The Fellowship of the Ring 
5. The Mexican 
6. The Lord of the Rings: The Two Towers 
7. Love Stinks 
8. Troy 
9. The Lone Ranger 
10. Kingdom of Heaven 
11. The Cookout 
12. Rango 
13. Supporting Characters 
14. The Weather Man 
15. Monsoon Wedding 
16. Being Julia 
17. Boat Trip 
18. Hope Springs 
19. Confessions of a Dangerous Mind 
20. She Done Him Wrong 
21. Cutthroat Island 
22. Neighbors 2: Sorority Rising 
23. Pirates of the Caribbean: On Stranger Tides 
24. Pirate Radio 
25. Sinbad: Legend of the Seven Seas 
26. Ponyo 
27. Jonah: A VeggieTales Movie 
28. Bewitched             
29. First Knight 
30. Stardust 
