# Importing libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import sigmoid_kernel


# loading & viewing the dataframe

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Data sets/tmdb_movies_data.csv')
df.head()

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,...,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,5562,6.5,2015,137999939.3,1392446000.0
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,...,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,6185,7.1,2015,137999939.3,348161300.0
2,262500,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,One Choice Can Destroy You,...,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/2015,2480,6.3,2015,101199955.5,271619000.0
3,140607,tt2488496,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,http://www.starwars.com/films/star-wars-episod...,J.J. Abrams,Every generation has a story.,...,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/2015,5292,7.5,2015,183999919.0,1902723000.0
4,168259,tt2820852,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,http://www.furious7.com/,James Wan,Vengeance Hits Home,...,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/2015,2947,7.3,2015,174799923.1,1385749000.0


In [None]:
df.shape

(10866, 21)

# Simple dropping for nulls & duplicates

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10866 entries, 0 to 10865
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10866 non-null  int64  
 1   imdb_id               10856 non-null  object 
 2   popularity            10866 non-null  float64
 3   budget                10866 non-null  int64  
 4   revenue               10866 non-null  int64  
 5   original_title        10866 non-null  object 
 6   cast                  10790 non-null  object 
 7   homepage              2936 non-null   object 
 8   director              10822 non-null  object 
 9   tagline               8042 non-null   object 
 10  keywords              9373 non-null   object 
 11  overview              10862 non-null  object 
 12  runtime               10866 non-null  int64  
 13  genres                10843 non-null  object 
 14  production_companies  9836 non-null   object 
 15  release_date       

In [None]:
df.duplicated().sum()

1

In [None]:
df = df.dropna(subset=['overview', 'genres'])
df = df.drop_duplicates()
df.reset_index(drop=True, inplace=True)
df.shape

(10839, 21)

# Fitting the TF-IDF vectorizer based on overview of the movie

In [None]:
tfv = TfidfVectorizer(
    min_df=3, max_features=None, strip_accents='unicode',
    analyzer='word', ngram_range=(1, 3), stop_words='english'
)

In [None]:
# Fitting the TF-IDF on the 'overview' text
tfv_matrix = tfv.fit_transform(df['overview'])
tfv_matrix.shape

(10839, 20216)

# Using sigmoid to find similarity

In [None]:
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
sig[0]

array([0.76161493, 0.7615943 , 0.76159416, ..., 0.76159416, 0.76159416,
       0.76159416])

# Using cosine similarity

In [None]:
cosine_sim = cosine_similarity(tfv_matrix, tfv_matrix)
cosine_sim[0]

array([1.        , 0.00673532, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [None]:
df.index[df['original_title']=='The Dark Knight Rises']

Index([4350], dtype='int64')

In [None]:
sorted(list(enumerate(cosine_sim[0])), reverse=True, key=lambda x: x[1])[1:6]

[(5374, 0.2550420630772667),
 (10197, 0.23065235284577623),
 (10166, 0.21668485304583163),
 (6926, 0.21574096828405706),
 (5724, 0.19833503850605155)]

# Giving recommendations based on sigmoid

In [None]:
def give_rec_sig(movie_title, sig=sig):
    # Get the index corresponding to original_title
    idx = df[df['original_title']== movie_title].index[0]

    # Sort the movies & get the scores of the 10 most similar movies
    sig_scores = sorted(list(enumerate(sig[idx])), reverse=True, key=lambda x: x[1])[1:11]

    # Movie indices
    movie_indices = [i[0] for i in sig_scores]

    # Top 10 most similar movies
    return df['original_title'].iloc[movie_indices]

In [None]:
print(give_rec_sig('Star Wars: The Force Awakens'))

1325                             Star Wars
7288               The Empire Strikes Back
10741        The Star Wars Holiday Special
6988                             Alexander
6057                          Atlantic Rim
4468                          Hope Springs
2165     Family Guy Presents: It's a Trap!
2877                   The Incredible Hulk
7966                    Return of the Jedi
7867           Once Upon a Time in America
Name: original_title, dtype: object


# Giving recommendations based on cosine similarity

In [None]:
def give_rec_cos(movie_title, cos=cosine_sim):
    # Get the index corresponding to original_title
    idx = df[df['original_title']== movie_title].index[0]

    # Sort the movies & get the scores of the 10 most similar movies
    cos_scores = sorted(list(enumerate(cos[idx])), reverse=True, key=lambda x: x[1])[1:11]

    # Movie indices
    movie_indices = [i[0] for i in cos_scores]

    # Top 10 most similar movies
    return df['original_title'].iloc[movie_indices]

In [None]:
print(give_rec_cos('Star Wars: The Force Awakens'))

1325                             Star Wars
7288               The Empire Strikes Back
10741        The Star Wars Holiday Special
6988                             Alexander
6057                          Atlantic Rim
4468                          Hope Springs
2165     Family Guy Presents: It's a Trap!
2877                   The Incredible Hulk
7966                    Return of the Jedi
7867           Once Upon a Time in America
Name: original_title, dtype: object


# Some simple preprocessing to make the director_based dataset

In [None]:
df['genres'] = df['genres'].str.replace('|', ' ')
df['genres'] = df['genres'].str.split(' ')
df.head()

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,...,Twenty-two years after the events of Jurassic ...,124,"[Action, Adventure, Science, Fiction, Thriller]",Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,5562,6.5,2015,137999939.3,1392446000.0
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,...,An apocalyptic story set in the furthest reach...,120,"[Action, Adventure, Science, Fiction, Thriller]",Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,6185,7.1,2015,137999939.3,348161300.0
2,262500,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,One Choice Can Destroy You,...,Beatrice Prior must confront her inner demons ...,119,"[Adventure, Science, Fiction, Thriller]",Summit Entertainment|Mandeville Films|Red Wago...,3/18/2015,2480,6.3,2015,101199955.5,271619000.0
3,140607,tt2488496,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,http://www.starwars.com/films/star-wars-episod...,J.J. Abrams,Every generation has a story.,...,Thirty years after defeating the Galactic Empi...,136,"[Action, Adventure, Science, Fiction, Fantasy]",Lucasfilm|Truenorth Productions|Bad Robot,12/15/2015,5292,7.5,2015,183999919.0,1902723000.0
4,168259,tt2820852,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,http://www.furious7.com/,James Wan,Vengeance Hits Home,...,Deckard Shaw seeks revenge against Dominic Tor...,137,"[Action, Crime, Thriller]",Universal Pictures|Original Film|Media Rights ...,4/1/2015,2947,7.3,2015,174799923.1,1385749000.0


# Making the directors dataframe

In [None]:
directors_df = pd.DataFrame(columns=['director', 'genres', 'movie_titles'])
directors_df['genres'] = directors_df['genres'].astype('object')
directors_df['movie_titles'] = directors_df['movie_titles'].astype('object')
for index, movie in df.iterrows():
    flag = 0
    for i in directors_df['director']:
      if movie['director'] != i :
        continue
      else:
        flag = directors_df['director'].tolist().index(i)
        break
    if flag == 0:
        directors_df = pd.concat([directors_df, pd.DataFrame({'director': movie['director'],
                                                              'genres': [movie['genres']],
                                                              'movie_titles': [[movie['original_title']]]})],
                                                              ignore_index=True)
    else:
      directors_df.at[flag, 'genres'].extend(movie['genres'])
      directors_df.at[flag, 'movie_titles'].append(movie['original_title'])
      flag = 0
directors_df.head()


Unnamed: 0,director,genres,movie_titles
0,Colin Trevorrow,"[Action, Adventure, Science, Fiction, Thriller]",[Jurassic World]
1,George Miller,"[Action, Adventure, Science, Fiction, Thriller...","[Mad Max: Fury Road, Happy Feet Two, Happy Fee..."
2,Robert Schwentke,"[Adventure, Science, Fiction, Thriller, Drama,...","[Insurgent, The Time Traveler's Wife, RED, R.I..."
3,J.J. Abrams,"[Action, Adventure, Science, Fiction, Fantasy,...","[Star Wars: The Force Awakens, Star Trek, Supe..."
4,James Wan,"[Action, Crime, Thriller, Horror, Thriller, Cr...","[Furious 7, Insidious, Saw, The Conjuring, Ins..."


In [None]:
directors_df['genres'] = directors_df['genres'].apply(lambda x: ' '.join(x))
directors_df.head()

Unnamed: 0,director,genres,movie_titles
0,Colin Trevorrow,Action Adventure Science Fiction Thriller,[Jurassic World]
1,George Miller,Action Adventure Science Fiction Thriller Musi...,"[Mad Max: Fury Road, Happy Feet Two, Happy Fee..."
2,Robert Schwentke,Adventure Science Fiction Thriller Drama Roman...,"[Insurgent, The Time Traveler's Wife, RED, R.I..."
3,J.J. Abrams,Action Adventure Science Fiction Fantasy Scien...,"[Star Wars: The Force Awakens, Star Trek, Supe..."
4,James Wan,Action Crime Thriller Horror Thriller Crime Th...,"[Furious 7, Insidious, Saw, The Conjuring, Ins..."


# Fitting TF-IDF vectorizer based on directors & genres using sigmoid & cosine similarity

In [None]:
dir_tfv = TfidfVectorizer(
    min_df=3, max_features=None, strip_accents='unicode',
    analyzer='word', ngram_range=(1, 3), stop_words='english'
)

# Fitting the TF-IDF on the 'genres' text
dir_tfv_matrix = dir_tfv.fit_transform(directors_df['genres'])

# Compute the cosine similarity matrix
cos_sim = cosine_similarity(dir_tfv_matrix, dir_tfv_matrix)

sigmoid = sigmoid_kernel(dir_tfv_matrix, dir_tfv_matrix)

dir_tfv_matrix.shape

(5096, 1469)

# A searching for the director with movie name function

In [None]:
def find_movie_list_index(movie_title, directors_df = directors_df):
  for index, row in directors_df.iterrows():
    if movie_title in row['movie_titles']:
      return index
  return None

# Giving recommendations based on sigmoid similarity

In [None]:
def give_rec_sigmoid(movie_title, sig=sigmoid):
    # Get the index corresponding to original_title
    idx = find_movie_list_index(movie_title)

    # Sort the movies & get the scores of the 10 most similar movies
    sig_scores = sorted(list(enumerate(sigmoid[idx])), reverse=True, key=lambda x: x[1])[0:10]

    # director indices
    dir_indices = [i[0] for i in sig_scores]

    # Top 10 most similar movies
    movie_list = []
    for i in dir_indices:
      movie_list.extend(directors_df['movie_titles'][i])
    for i in movie_list:
      if i == movie_title:
        movie_list.remove(i)
    return movie_list[0:10]

In [None]:
give_rec_sigmoid('Star Wars: The Force Awakens')

['Star Trek',
 'Super 8',
 'Star Trek Into Darkness',
 'Mission: Impossible III',
 'Hardcore Henry',
 'Captain America: The Winter Soldier',
 '2012: Ice Age',
 'Phineas and Ferb: Mission Marvel',
 'Babylon 5: The Lost Tales - Voices in the Dark',
 'The Blood of Heroes']

# Giving recommendations based on cosine similarity

In [None]:
def give_rec_cosine(movie_title, cos=cos_sim):
    # Get the index corresponding to original_title
    idx = find_movie_list_index(movie_title)

    # Sort the movies & get the scores of the 10 most similar movies
    cos_scores = sorted(list(enumerate(cos_sim[idx])), reverse=True, key=lambda x: x[1])[0:10]

    # director indices
    dir_indices = [i[0] for i in cos_scores]

    # Top 10 most similar movies
    movie_list = []
    for i in dir_indices:
      movie_list.extend(directors_df['movie_titles'][i])
    for i in movie_list:
      if i == movie_title:
        movie_list.remove(i)
    return movie_list[0:10]

In [None]:
give_rec_cosine('Star Wars: The Force Awakens')

['Star Trek',
 'Super 8',
 'Star Trek Into Darkness',
 'Mission: Impossible III',
 'Hardcore Henry',
 'Captain America: The Winter Soldier',
 '2012: Ice Age',
 'Phineas and Ferb: Mission Marvel',
 'Babylon 5: The Lost Tales - Voices in the Dark',
 'The Blood of Heroes']