In [166]:
import pandas as pd
import numpy as np

In [167]:
df = pd.read_csv(r'n_movies.csv')
df.head(2)

Unnamed: 0,title,year,certificate,duration,genre,rating,description,stars,votes
0,Cobra Kai,(2018– ),TV-14,30 min,"Action, Comedy, Drama",8.5,Decades after their 1984 All Valley Karate Tou...,"['Ralph Macchio, ', 'William Zabka, ', 'Courtn...",177031
1,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"['Claire Foy, ', 'Olivia Colman, ', 'Imelda St...",199885


In [168]:
df = df[['genre','title']]
df.head(3)

Unnamed: 0,genre,title
0,"Action, Comedy, Drama",Cobra Kai
1,"Biography, Drama, History",The Crown
2,"Crime, Drama",Better Call Saul


In [169]:
df['genre'].sample(10)

8608     Action, Drama, Romance
3040    Crime, Drama, Film-Noir
3186    Biography, Drama, Music
9875                     Comedy
5708         Documentary, Crime
6697                    Romance
5217                      Drama
1261       Action, Crime, Drama
8604                 Reality-TV
7561                  Animation
Name: genre, dtype: object

In [170]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9957 entries, 0 to 9956
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   genre   9884 non-null   object
 1   title   9957 non-null   object
dtypes: object(2)
memory usage: 155.7+ KB


In [171]:
df.columns

Index(['genre', 'title'], dtype='object')

In [172]:
df.dtypes

genre    object
title    object
dtype: object

In [173]:
df.isnull().sum()

genre    73
title     0
dtype: int64

In [174]:
df.dropna(inplace = True,ignore_index = True)

In [175]:
df.isnull().sum()

genre    0
title    0
dtype: int64

In [176]:
df['genre'] = df['genre'].apply(lambda x:x.replace('Sci-Fi','scifi'))

In [177]:
df['genre'].sample(5)

6404         Reality-TV, Romance
5362           Music, Reality-TV
2660    Adventure, Comedy, Drama
1272    Biography, Comedy, Drama
7113                      Comedy
Name: genre, dtype: object

In [178]:
df.iloc[195]

genre    Action, Adventure, scifi
title          Star Trek: Voyager
Name: 195, dtype: object

In [179]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [180]:
corpus = []
for i in range(len(df)):
    l = re.sub('[^a-zA-Z]',' ',df['genre'][i])
    l = l.split()
    l = [wnl.lemmatize(i) for i in l if i not in stopwords.words('english')]
    l = ' '.join(l)
    corpus.append(l)
print(corpus)    

['Action Comedy Drama', 'Biography Drama History', 'Crime Drama', 'Drama Horror Mystery', 'Animation Action Adventure', 'Drama Fantasy Horror', 'Animation Adventure Comedy', 'Crime Drama Thriller', 'Action Adventure Drama', 'Biography Drama Mystery', 'Drama Fantasy Horror', 'Action Crime Drama', 'Drama Horror Thriller', 'Crime Drama Mystery', 'Comedy', 'Thriller', 'Drama Romance Sport', 'Drama History Horror', 'Crime Drama', 'Action Adventure Drama', 'Action Adventure Drama', 'Drama Romance', 'Drama Fantasy Horror', 'Drama Horror scifi', 'Action Biography Crime', 'Action Crime Drama', 'Comedy Romance', 'Comedy Drama', 'Comedy Drama Romance', 'Crime Drama Thriller', 'Drama Mystery Thriller', 'Action Thriller', 'Comedy', 'Comedy Drama', 'Action Adventure Drama', 'Action Adventure Drama', 'Crime Drama Mystery', 'Comedy', 'Crime Drama Fantasy', 'Action Adventure Comedy', 'Action Drama History', 'Drama', 'Comedy Romance', 'Drama Mystery scifi', 'Comedy Crime', 'Action Drama Mystery', 'Comed

In [181]:
df['genre'] = corpus

In [182]:
df.head()

Unnamed: 0,genre,title
0,Action Comedy Drama,Cobra Kai
1,Biography Drama History,The Crown
2,Crime Drama,Better Call Saul
3,Drama Horror Mystery,Devil in Ohio
4,Animation Action Adventure,Cyberpunk: Edgerunners


In [183]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
tfidf.fit_transform(df['genre']).toarray()

array([[0.67597931, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [185]:
featured_vectors = pd.DataFrame(tfidf.fit_transform(df['genre']).toarray(),columns = tfidf.get_feature_names_out())
featured_vectors.columns

Index(['action', 'adventure', 'animation', 'biography', 'comedy', 'crime',
       'documentary', 'drama', 'family', 'fantasy', 'film', 'game', 'history',
       'horror', 'music', 'musical', 'mystery', 'news', 'noir', 'reality',
       'romance', 'scifi', 'short', 'show', 'sport', 'talk', 'thriller', 'tv',
       'war', 'western'],
      dtype='object')

In [188]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(featured_vectors)
similarity

array([[1.        , 0.14339609, 0.25967094, ..., 0.14339609, 0.15327473,
        0.15903507],
       [0.14339609, 1.        , 0.16131893, ..., 1.        , 0.09522096,
        0.09879953],
       [0.25967094, 0.16131893, 1.        , ..., 0.16131893, 0.17243228,
        0.1789126 ],
       ...,
       [0.14339609, 1.        , 0.16131893, ..., 1.        , 0.09522096,
        0.09879953],
       [0.15327473, 0.09522096, 0.17243228, ..., 0.09522096, 1.        ,
        0.57708926],
       [0.15903507, 0.09879953, 0.1789126 , ..., 0.09879953, 0.57708926,
        1.        ]])

In [193]:
list_of_all_titles = df['title'].tolist()


# Enter the name of the movie/ Tv show

In [305]:
movie_name = input('enter the movie you want:')

enter the movie you want:the witcher


In [306]:
import difflib

find_close_match = difflib.get_close_matches(movie_name,list_of_all_titles)

find_close_match

['The Witcher', 'The Witcher', 'The Witcher']

In [307]:
close_match = find_close_match[0]

In [308]:
close_match

'The Witcher'

In [309]:
index_of_movie = df[df.title == close_match].index.values[0]

In [310]:
index_of_movie

35

In [311]:
similarity_score = list(enumerate(similarity[index_of_movie]))

In [312]:
print(similarity_score)

[(0, 0.620392743473992), (1, 0.1293485031588596), (2, 0.23423265732517562), (3, 0.1434554308326832), (4, 0.7179816882324579), (5, 0.13825939036713603), (6, 0.4085538808095658), (7, 0.16631499487643348), (8, 1.0), (9, 0.13808564434288345), (10, 0.13825939036713603), (11, 0.5555436460025749), (12, 0.1458313322871607), (13, 0.16281626379973435), (14, 0.0), (15, 0.0), (16, 0.12920822247100006), (17, 0.13373042825138207), (18, 0.23423265732517562), (19, 1.0), (20, 1.0), (21, 0.21337383949065322), (22, 0.13825939036713603), (23, 0.12849107512197425), (24, 0.2770267874037069), (25, 0.5555436460025749), (26, 0.0), (27, 0.2825392084910625), (28, 0.18516387359549943), (29, 0.16631499487643348), (30, 0.15294996495962035), (31, 0.3710432240036657), (32, 0.0), (33, 0.2825392084910625), (34, 1.0), (35, 1.0), (36, 0.16281626379973435), (37, 0.0), (38, 0.15533524810337188), (39, 0.7865546018638476), (40, 0.45606427341722855), (41, 0.4333729047409489), (42, 0.0), (43, 0.13328076617242576), (44, 0.0), (

In [313]:
sorted_score = sorted(similarity_score,key = lambda x : x[1],reverse = True)[1:11]
sorted_score

[(19, 1.0),
 (20, 1.0),
 (34, 1.0),
 (35, 1.0),
 (57, 1.0),
 (78, 1.0),
 (87, 1.0),
 (107, 1.0),
 (151, 1.0),
 (153, 1.0)]

In [314]:
k = 1
for i in sorted_score:
    m = df.iloc[i[0]]['title'].capitalize()
    print(f'{k}.{m}')
    k += 1

1.Fate: the winx saga
2.The lord of the rings: the fellowship of the ring
3.Vikings
4.The witcher
5.The lord of the rings: the return of the king
6.The lord of the rings: the two towers
7.The flash
8.Smallville
9.Shadow and bone
10.Gladiator


In [315]:
df['title'].sample(5)

9078                    BattleBots
5731    Love You as the World Ends
8288             Never Have I Ever
2298        Video Game High School
7041              Burning Daylight
Name: title, dtype: object