In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

Now we have to import our dataset

In [None]:
movies = pd.read_csv('movies.csv')

#let's see what is our dataset like
print(movies)
print("titles:\n", movies["title"])

       movieId                               title  \
0            1                    Toy Story (1995)   
1            2                      Jumanji (1995)   
2            3             Grumpier Old Men (1995)   
3            4            Waiting to Exhale (1995)   
4            5  Father of the Bride Part II (1995)   
...        ...                                 ...   
62418   209157                           We (2018)   
62419   209159           Window of the Soul (2001)   
62420   209163                    Bad Poems (2018)   
62421   209169                 A Girl Thing (2001)   
62422   209171      Women of Devil's Island (1962)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...                              

Some good ol' string sanitation

In [None]:
movies['combined_genres'] = movies['genres'].str.replace('|', ' ')
print(movies['combined_genres'])

0        Adventure Animation Children Comedy Fantasy
1                         Adventure Children Fantasy
2                                     Comedy Romance
3                               Comedy Drama Romance
4                                             Comedy
                            ...                     
62418                                          Drama
62419                                    Documentary
62420                                   Comedy Drama
62421                             (no genres listed)
62422                         Action Adventure Drama
Name: combined_genres, Length: 62423, dtype: object


  movies['combined_genres'] = movies['genres'].str.replace('|', ' ')


we have to construct the TF-IDF matrix.


*   TF-IDF matrix. TF: Term-Frequency #term / #terms_in_file
*   IDF: Inverse Document-Frequency        #total_documents / #documents_containing_term


We are ranking how important a tag is



In [None]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['combined_genres'])
print(tfidf_matrix)

  (0, 8)	0.49674837028452556
  (0, 4)	0.27771718920269134
  (0, 3)	0.48808437174545455
  (0, 2)	0.48833048769293214
  (0, 1)	0.44656600888161224
  (1, 8)	0.600453511519303
  (1, 3)	0.5899807477262311
  (1, 1)	0.5397946811673262
  (2, 18)	0.8011493881971549
  (2, 4)	0.5984644164788115
  (3, 7)	0.44022013245613556
  (3, 18)	0.7193439273049612
  (3, 4)	0.5373551425545094
  (4, 4)	1.0
  (5, 20)	0.5370772735955626
  (5, 5)	0.6249107985241872
  (5, 0)	0.5665990611314317
  (6, 18)	0.8011493881971549
  (6, 4)	0.5984644164788115
  (7, 3)	0.7377898038839786
  (7, 1)	0.6750305217431583
  (8, 0)	1.0
  (9, 20)	0.5220827968697404
  (9, 0)	0.5507803757155867
  (9, 1)	0.6512069801063765
  :	:
  (62408, 8)	0.7710287903218748
  (62409, 22)	1.0
  (62410, 7)	1.0
  (62411, 4)	1.0
  (62412, 6)	0.6432580604344854
  (62412, 2)	0.7656494417721884
  (62413, 7)	1.0
  (62414, 12)	0.6946781180206175
  (62414, 5)	0.7193207298162155
  (62415, 14)	0.7071067811865475
  (62415, 11)	0.7071067811865475
  (62416, 7)	0.633

It's time to get the recommendations

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_recommendations_on_demand(title, tfidf_matrix, movies_df, top_n=25):
    # Find the index of the movie that matches the title
    idx = movies_df.index[movies_df['title'] == title].tolist()[0]

    # Compute cosine similarity between this movie and all others
    cosine_similarities = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()

    # Get top N similar movie indices
    similar_indices = cosine_similarities.argsort()[:-top_n-1:-1]

    # Exclude the first element (self-comparison)
    return movies_df['title'].iloc[similar_indices[1:]]

In [None]:
# Example usage
movie_title = "Godfather, The (1972)"
recommendations = get_recommendations_on_demand(movie_title, tfidf_matrix, movies)
print(recommendations)


37377                        Wild Rebels (1967)
46693                     Deadly Weapons (1974)
37380                        Son of Mine (2015)
1726                 Fireworks (Hana-bi) (1997)
21180                            Satanas (2007)
1722                    Newton Boys, The (1998)
37387                    New York Nights (1929)
27138                  No One Would Tell (1996)
21164                               11.6 (2013)
12022                           Longford (2006)
13717              Detective (Détective) (1985)
6068                               Q & A (1990)
27152          Processo per direttissima (1976)
10313         Good Night, and Good Luck. (2005)
1691            Letter From Death Row, A (1998)
52153               Crime of the Century (1996)
46736            Code Name: Diamond Head (1977)
39766          L'Homme aux Yeux d'Argent (1985)
46672                    Willie Dynamite (1974)
15866                              Manon (1949)
11988    Swimming Pool, The (La piscine)