In [1]:
# Importing pandas
import pandas as pd

# Reading the Data
data = pd.read_csv("/kaggle/input/imdb-movies-dataset/imdb_movies.csv")
data.head()

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,12/15/2022,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,04/05/2023,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,724459000.0,AU
3,Mummies,01/05/2023,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,03/17/2023,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,340942000.0,US


In [2]:
data.tail()

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
10173,20th Century Women,12/28/2016,73.0,Drama,"In 1979 Santa Barbara, California, Dorothea Fi...","Annette Bening, Dorothea Fields, Lucas Jade Zu...",20th Century Women,Released,English,7000000.0,9353729.0,US
10174,Delta Force 2: The Colombian Connection,08/24/1990,54.0,Action,When DEA agents are taken captive by a ruthles...,"Chuck Norris, Col. Scott McCoy, Billy Drago, R...",Delta Force 2: The Colombian Connection,Released,English,9145817.8,6698361.0,US
10175,The Russia House,12/21/1990,61.0,"Drama, Thriller, Romance","Barley Scott Blair, a Lisbon-based editor of R...","Sean Connery, Bartholomew 'Barley' Scott Blair...",The Russia House,Released,English,21800000.0,22997992.0,US
10176,Darkman II: The Return of Durant,07/11/1995,55.0,"Action, Adventure, Science Fiction, Thriller, ...",Darkman and Durant return and they hate each o...,"Larry Drake, Robert G. Durant, Arnold Vosloo, ...",Darkman II: The Return of Durant,Released,English,116000000.0,475661306.0,US
10177,The Swan Princess: A Royal Wedding,07/20/2020,70.0,"Animation, Family, Fantasy",Princess Odette and Prince Derek are going to ...,"Nina Herzog, Princess Odette (voice), Yuri Low...",The Swan Princess: A Royal Wedding,Released,English,92400000.0,539401838.6,GB


In [3]:
%pip install transformers torch sentence-transformers

Note: you may need to restart the kernel to use updated packages.


## About Models

1. *Other MiniLM Models*
`all-MiniLM-L12-v2`: Slightly larger and more accurate than all-MiniLM-L6-v2, but still efficient.
`paraphrase-MiniLM-L6-v2`: Optimized for paraphrase detection and general semantic similarity.

2. *BERT-based Models*
`all-mpnet-base-v2`: Highly accurate for semantic similarity tasks, though slower compared to MiniLM models.
`paraphrase-mpnet-base-v2`: Tuned for paraphrase and similarity tasks.
bert-base-nli-mean-tokens: Based on BERT and suitable for general sentence embeddings.


3. *DistilBERT Variants*
`distiluse-base-multilingual-cased-v1`: Multilingual and lightweight, good for non-English texts.
`distilbert-base-nli-stsb-mean-tokens`: Focused on natural language inference and semantic similarity.


4. *RoBERTa-based Models*
`all-roberta-large-v1`: High accuracy but slower and computationally intensive.
`paraphrase-roberta-base-v1`: Optimized for paraphrase and semantic tasks.


5. *Multilingual Models*
`paraphrase-multilingual-mpnet-base-v2`: Multilingual support for 50+ languages, suitable for cross-lingual tasks.
`xlm-r-distilroberta-base-paraphrase-v1`: Lightweight multilingual model.


6. *Sentence-T5 Models*
`sentence-t5-xl` / `sentence-t5-large` / `sentence-t5-base`: Based on Google's T5 architecture, providing state-of-the-art embeddings with a tradeoff in size and speed.


7. *Other Architectures*
`e5-large-v2` or `e5-base-v2`: Focused on embedding dense information from various contexts.
`universal-sentence-encoder`: Simple and efficient, good for general-purpose embeddings.


# How to Choose:
- Efficiency: Use MiniLM or DistilBERT models for faster computation.
- Accuracy: Opt for mpnet, roberta, or sentence-t5 models.
- Multilingual Needs: Use paraphrase-multilingual-mpnet-base-v2 or xlm-r variants.

In [4]:
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load pre-trained sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')  # You can use another model if needed

# Step 2: Generate embeddings for overviews
def generate_embeddings(text_list):
    return model.encode(text_list, convert_to_tensor=True)

# Get embeddings for the movie overviews
movie_overviews = data['overview'].tolist()
movie_embeddings = generate_embeddings(movie_overviews)

# You can now use these embeddings to compute similarities


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/319 [00:00<?, ?it/s]

In [5]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Step 3: Get Similar Movies based on Embedding Similarity
def get_similar_movies_genai(movie_name, top_n=5):
    # Find the index of the target movie
    movie_idx = data[data['names'] == movie_name].index[0]
    
    # Get the target movie's embedding
    target_embedding = movie_embeddings[movie_idx]
    
    # Move the target embedding and the movie embeddings to CPU (if they are on GPU)
    target_embedding_cpu = target_embedding.cpu().detach().numpy() if target_embedding.is_cuda else target_embedding.numpy()
    movie_embeddings_cpu = movie_embeddings.cpu().detach().numpy() if movie_embeddings.is_cuda else movie_embeddings.numpy()
    
    # Calculate cosine similarities
    similarities = cosine_similarity([target_embedding_cpu], movie_embeddings_cpu)[0]
    
    # Sort the movies by similarity score
    similar_movies_idx = similarities.argsort()[-top_n-1:-1][::-1]  # Get the top N similar movies
    
    similar_movies = data.iloc[similar_movies_idx]
    similar_movies['similarity'] = similarities[similar_movies_idx]
    
    return similar_movies[['names', 'genre', 'score', 'similarity']]

# Example: Get Similar Movies for "Creed III"
similar_movies_genai = get_similar_movies_genai("Creed III")
print(similar_movies_genai)


             names  genre  score  similarity
114       Creed II  Drama   70.0    0.715509
1555      Rocky II  Drama   72.0    0.620655
115          Creed  Drama   74.0    0.577664
1681  Rocky Balboa  Drama   68.0    0.552088
1380     Rocky III  Drama   69.0    0.536563


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  similar_movies['similarity'] = similarities[similar_movies_idx]


In [6]:
from sklearn.preprocessing import MinMaxScaler

# Example: Genre-Based Recommendations Using Embeddings with Confidence Score
def get_genre_recommendations_genai(genre, top_n=5):
    # Filter movies by genre, ensuring no NaN values in the genre column
    genre_movies = data[data['genre'].notna() & data['genre'].str.contains(genre, case=False)]
    
    # Get the movie embeddings for the selected genre
    genre_movie_embeddings = movie_embeddings[genre_movies.index]
    
    # Ensure the embeddings are on the CPU
    genre_movie_embeddings_cpu = genre_movie_embeddings.cpu().detach().numpy() if genre_movie_embeddings.is_cuda else genre_movie_embeddings.numpy()
    
    # Calculate the cosine similarity matrix
    similarity_matrix = cosine_similarity(genre_movie_embeddings_cpu, genre_movie_embeddings_cpu)
    
    # Calculate the mean similarity for each movie
    genre_movies['similarity'] = similarity_matrix.mean(axis=1)
    
    # Normalize similarity to get confidence scores
    scaler = MinMaxScaler()
    genre_movies['confidence'] = scaler.fit_transform(genre_movies[['similarity']])
    
    # Sort by confidence (similarity)
    genre_movies_sorted = genre_movies.sort_values(by='confidence', ascending=False)
    
    return genre_movies_sorted[['names', 'genre', 'score', 'similarity', 'confidence']].head(top_n)

# Example: Get Similar Movies from Action Genre
action_movies_genai = get_genre_recommendations_genai("Action")
print(action_movies_genai)


           names                                        genre  score  \
1611   Ong Bak 2                  Adventure, Action, Thriller   64.0   
785     Fortress                      Action, Thriller, Crime   60.0   
6263  Safe House                             Action, Thriller   64.0   
2447   Dangerous                             Action, Thriller   62.0   
2477  The Losers  Action, Adventure, Crime, Mystery, Thriller   63.0   

      similarity  confidence  
1611    0.288152    1.000000  
785     0.278532    0.965266  
6263    0.273244    0.946170  
2447    0.272643    0.944001  
2477    0.267726    0.926243  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_movies['similarity'] = similarity_matrix.mean(axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  genre_movies['confidence'] = scaler.fit_transform(genre_movies[['similarity']])
