# Recommendation System Using LLMs


# Initializing Custom IMDB Dataset


In [1]:
import pandas as pd

# Load the custom IMDB title table. low_memory=False makes pandas read the
# file in one pass instead of in chunks when inferring column dtypes.
DATASET_PATH = '/kaggle/input/custom-imdb-dataset/Custom_IMDB_Dataset.csv'
df_movies = pd.read_csv(DATASET_PATH, low_memory=False)
df_movies


Unnamed: 0,tconst,titleType,primaryTitle,startYear,endYear,genres,language,region,averageRating,numVotes
0,tt0000009,movie,Miss Jerry,1894,\N,['Romance'],\N,DE,5.3,208
1,tt0000147,movie,The Corbett-Fitzsimmons Fight,1897,\N,"['Documentary', 'News', 'Sport']",\N,\N,5.2,505
2,tt0000502,movie,Bohemios,1905,\N,['\\N'],\N,\N,4.1,15
3,tt0000574,movie,The Story of the Kelly Gang,1906,\N,"['Action', 'Adventure', 'Biography']",\N,\N,6.0,874
4,tt0000591,movie,The Prodigal Son,1907,\N,['Drama'],\N,US,5.5,23
...,...,...,...,...,...,...,...,...,...,...
445059,tt9916428,movie,The Secret of China,2019,\N,"['Adventure', 'History', 'War']",\N,\N,3.4,18
445060,tt9916460,tvMovie,Pink Taxi,2019,\N,['Comedy'],\N,\N,8.9,19
445061,tt9916538,movie,Kuambil Lagi Hatiku,2019,\N,['Drama'],\N,ID,8.6,7
445062,tt9916706,movie,Dankyavar Danka,2013,\N,['Comedy'],\N,IN,7.6,5


---


# Preprocessing and Tokenization


In [103]:
import spacy
from spacy.matcher import PhraseMatcher
import json
import re

# Load the lookup tables used to interpret a user query.
def _load_json(path):
    """Parse and return the JSON document stored at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


# language_codes maps code -> language name; invert it to look codes up by
# lower-cased name.
language_codes = _load_json('/kaggle/input/imdb-encoders/language_codes.json')
language_to_code = dict((name.lower(), code) for code, name in language_codes.items())

# region_data maps code -> collection of region names; every name points back
# to its code.
region_data = _load_json('/kaggle/input/imdb-encoders/region_data.json')
region_name_to_code = dict(
    (name.lower(), code)
    for code, names in region_data.items()
    for name in names
)

# Genre names that may be mentioned directly in a query.
genre_list = _load_json('/kaggle/input/imdb-encoders/genres_data.json')

# spaCy pipeline used for tokenisation and phrase matching.
nlp = spacy.load("en_core_web_sm")

def clean_and_combine_genres(genres):
    """Return the genres that occur at least twice, as one sorted string.

    Each entry of ``genres`` is stripped of ``[``, ``]`` and ``'`` characters,
    trimmed, lower-cased and split on ``", "``. Every resulting name is
    counted; names seen two or more times are sorted alphabetically and
    joined with single spaces.
    """
    bracket_and_quote = str.maketrans("", "", "[]'")

    tally = {}
    for raw in genres:
        cleaned = raw.translate(bracket_and_quote).strip().lower()
        for name in cleaned.split(", "):
            tally[name] = tally.get(name, 0) + 1

    # Keep only names that were seen more than once.
    recurring = sorted(name for name, seen in tally.items() if seen >= 2)
    return ' '.join(recurring)


def find_movie_genres(title, movies_df):
    """Look up the genre fragments of the first title matching ``title``.

    The comparison against ``movies_df['primaryTitle']`` ignores case. The
    ``genres`` cell of the first matching row is split on commas and returned
    as-is (brackets, quotes and spaces are not removed here). An empty list is
    returned when no row matches.
    """
    wanted = title.lower()
    is_match = movies_df['primaryTitle'].str.lower() == wanted
    hits = movies_df[is_match]
    if hits.empty:
        return []
    first_hit = hits.iloc[0]
    return first_hit['genres'].split(',')

def preprocess_query(query, movies_df):
    """Reduce a free-text request to a sorted, space-separated string of terms.

    Terms are collected from three sources:
      * genres that occur at least twice across the quoted movie titles found
        in ``query`` (looked up in ``movies_df`` via ``find_movie_genres`` and
        filtered by ``clean_and_combine_genres``),
      * genre names from ``genre_list`` that appear in the query text,
      * codes for query tokens present in ``language_to_code`` or
        ``region_name_to_code`` (lookups are per token).

    Args:
        query: The raw user request.
        movies_df: DataFrame passed through to ``find_movie_genres``.

    Returns:
        The unique, lower-cased terms sorted alphabetically and joined with
        single spaces; an empty string when nothing was recognised.
    """
    doc = nlp(query.lower())
    genre_matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    genre_patterns = [nlp(text.lower()) for text in genre_list]
    genre_matcher.add("GENRE", genre_patterns)

    # Anything between a pair of quote characters is treated as a movie title.
    # The pattern runs on the original query, so title casing is preserved for
    # the lookup (which is itself case-insensitive).
    movie_titles = re.findall(r"['\"](.*?)['\"]", query)
    genres_from_titles = []
    for title in movie_titles:
        genres_from_titles.extend(find_movie_genres(title, movies_df))

    # split() without an argument yields [] for an empty string. split(' ')
    # would yield [''], and that empty term ended up as a leading space in the
    # returned query whenever no title genres were found.
    combined_genres = clean_and_combine_genres(genres_from_titles).split()

    # Identify additional genres directly mentioned in the query
    matches = genre_matcher(doc)
    for match_id, start, end in matches:
        found_genre = doc[start:end].text
        if found_genre not in combined_genres:
            combined_genres.append(found_genre.lower())

    # Incorporate languages and regions (stored as lower-cased codes)
    for token in doc:
        if token.text in language_to_code and language_to_code[token.text] not in combined_genres:
            combined_genres.append(language_to_code[token.text].lower())
        elif token.text in region_name_to_code and region_name_to_code[token.text] not in combined_genres:
            combined_genres.append(region_name_to_code[token.text].lower())

    # Finalize the processed query by removing duplicates and sorting
    unique_genres = set(combined_genres)
    return ' '.join(sorted(unique_genres))

# Example: turn a natural-language request into the compact term string that
# the recommendation cells consume.
user_query = "I want to watch action adventure japanese movies/series. Do recommend me the best."
processed_query = preprocess_query(query=user_query, movies_df=df_movies)
print(processed_query)


 action adventure ja


---


# Vectorization and Embedding Dataset


In [3]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl.metadata (11 kB)
Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0mm
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-2.5.1


In [4]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
import concurrent.futures

# Full Hugging Face identifiers of the embedding models. The part after the
# '/' later becomes the prefix of each '<model>_embeddings' column.
model_names = [
    'sentence-transformers/all-MiniLM-L6-v2',
    'sentence-transformers/paraphrase-albert-small-v2',
    'sentence-transformers/all-distilroberta-v1',
    'sentence-transformers/all-mpnet-base-v2'
]

# Assuming df_movies is already defined and loaded
# Text that gets embedded for every title: "<genres> <language> <region>".
# Built column-wise rather than with DataFrame.apply(axis=1), which creates a
# Series object per row; the resulting strings are the same.
df_movies['comprehensive_description'] = [
    f"{genres} {language} {region}"
    for genres, language, region in zip(
        df_movies['genres'], df_movies['language'], df_movies['region']
    )
]

# Loaded models keyed by model name, so each model is read from disk once
# instead of once per chunk. Entries stay in memory until the dict is cleared.
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model
    return model


def process_model_embeddings(model_name, descriptions_chunk):
    """Encode a chunk of descriptions with one model.

    Args:
        model_name: Identifier passed to ``SentenceTransformer``.
        descriptions_chunk: List of description strings to embed.

    Returns:
        Whatever ``model.encode`` returns for the chunk (one embedding per
        description).
    """
    model = _get_embedding_model(model_name)
    embeddings = model.encode(descriptions_chunk, batch_size=32, show_progress_bar=False)  # Adjust batch_size if necessary
    return embeddings

# Embed one chunk with every model, one worker thread per model
def process_chunk_for_all_models(descriptions_chunk):
    """Run ``process_model_embeddings`` for each entry of ``model_names`` in parallel.

    Returns a dict mapping model name to the embeddings produced for
    ``descriptions_chunk``. A model whose task raises is reported on stdout
    and left out of the dict.
    """
    results = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=len(model_names)) as pool:
        # Remember which model each submitted task belongs to.
        owner_of = {}
        for name in model_names:
            task = pool.submit(process_model_embeddings, name, descriptions_chunk)
            owner_of[task] = name

        # Collect results in completion order.
        for done in concurrent.futures.as_completed(owner_of):
            model_name = owner_of[done]
            try:
                results[model_name] = done.result()
            except Exception as exc:
                print(f'{model_name} generated an exception: {exc}')
    return results

# Cut the descriptions into fixed-size slices; each slice is embedded by all
# models before the next one is started.
descriptions = df_movies['comprehensive_description'].tolist()
chunk_size = 3000  # Adjust based on your dataset and memory constraints
chunks = [
    descriptions[offset:offset + chunk_size]
    for offset in range(0, len(descriptions), chunk_size)
]

# Per-model list of embeddings, accumulated chunk by chunk in row order
all_embeddings = dict((name, []) for name in model_names)

for chunk in chunks:
    for model_name, embeddings in process_chunk_for_all_models(chunk).items():
        all_embeddings[model_name] += list(embeddings)

# One '<model>_embeddings' column per model (name taken after the last '/')
for model_name in all_embeddings:
    short_name = model_name.rsplit('/', 1)[-1]
    df_movies[short_name + '_embeddings'] = all_embeddings[model_name]

# Min-max scale 'averageRating' and 'numVotes' into 'norm_rating' / 'norm_votes'
for source_col, target_col in (('averageRating', 'norm_rating'), ('numVotes', 'norm_votes')):
    lowest = df_movies[source_col].min()
    highest = df_movies[source_col].max()
    df_movies[target_col] = (df_movies[source_col] - lowest) / (highest - lowest)

# The text column was only needed as input for the encoders
del df_movies['comprehensive_description']


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.06k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/827 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/245 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

---


## Recommendation System


In [104]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

# Short model names (no 'sentence-transformers/' prefix). recommend_movies
# appends '_embeddings' to each name to find its DataFrame column and prepends
# 'sentence-transformers/' to load the model. This rebinds the `model_names`
# list that the embedding cell defined with the full, prefixed identifiers.
model_names = [
    'all-MiniLM-L6-v2',
    'paraphrase-albert-small-v2',
    'all-distilroberta-v1',
    'all-mpnet-base-v2'
]

# Assume 'df_movies' is your DataFrame and 'processed_query' is your user query

def recommend_movies(user_query, df_movies, top_k=20, weight_similarity=0.6, weight_rating=0.2, weight_votes=0.2):
    """Rank titles for a query with every model listed in ``model_names``.

    For each model the query is embedded, compared (cosine similarity) with
    the stored '<model>_embeddings' column, and the min-max normalised
    similarity is blended with ``norm_rating`` and ``norm_votes`` using the
    given weights. Only titles passing the rating/vote thresholds below are
    eligible; the ``top_k`` highest combined scores are returned.

    ``df_movies`` is not modified: scores are computed as arrays and attached
    only to the returned tables, so per-query scratch columns do not end up in
    the dataset that is saved later.

    Args:
        user_query: Query text to embed.
        df_movies: DataFrame with the title columns, 'norm_rating',
            'norm_votes' and one '<model>_embeddings' column per model.
        top_k: Maximum number of rows per model.
        weight_similarity, weight_rating, weight_votes: Weights of the three
            components of the combined score.

    Returns:
        Dict mapping model name to a DataFrame indexed 1..n with the title
        columns plus 'similarity' and 'combined_score'.
    """
    output_columns = ['primaryTitle', 'startYear', 'genres', 'language', 'averageRating', 'numVotes']
    score_columns = ['similarity', 'combined_score']

    # Nothing to score: return empty tables instead of failing inside np.stack.
    if df_movies.empty:
        return {model_name: pd.DataFrame(columns=output_columns + score_columns)
                for model_name in model_names}

    # Eligibility and the rating/vote part of the score do not depend on the
    # model, so they are computed once.
    rating = df_movies['averageRating']
    votes = df_movies['numVotes']
    eligible = (((rating > 6.5) & (rating <= 9.0) & (votes > 5000)) |
                ((rating > 9.0) & (votes > 10000))).to_numpy()
    shortlist = df_movies.loc[eligible, output_columns]
    popularity_term = (weight_rating * df_movies['norm_rating'].to_numpy() +
                       weight_votes * df_movies['norm_votes'].to_numpy())

    recommendations = {}

    for model_name in model_names:
        model = SentenceTransformer(f'sentence-transformers/{model_name}')
        # Encode the user query into an embedding
        user_query_embedding = model.encode(user_query).reshape(1, -1)

        # np.stack accepts both lists and arrays, so the stored column does not
        # need to be converted (and overwritten) first.
        title_embeddings = np.stack(df_movies[model_name + '_embeddings'])
        similarities = cosine_similarity(user_query_embedding, title_embeddings).flatten()

        # Normalize similarity to [0, 1]; when every similarity is equal the
        # range is zero, so the similarity component is dropped (all zeros)
        # rather than dividing by zero.
        lowest, highest = similarities.min(), similarities.max()
        if highest > lowest:
            norm_similarity = (similarities - lowest) / (highest - lowest)
        else:
            norm_similarity = np.zeros_like(similarities)

        combined_score = weight_similarity * norm_similarity + popularity_term

        top_recommendations = shortlist.assign(
            similarity=similarities[eligible],
            combined_score=combined_score[eligible],
        ).nlargest(top_k, 'combined_score')

        # Store the recommendations from the current model, ranked from 1
        recommendations[model_name] = top_recommendations.set_index(
            pd.Index(range(1, len(top_recommendations) + 1)))

    return recommendations

# Produce one ranked table per model for the processed query
recommendations = recommend_movies(user_query=processed_query, df_movies=df_movies)

# Print every model's table in the order the models are listed
for model_name in model_names:
    print(f"Recommendations using model {model_name}:\n{recommendations[model_name]}\n\n")

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Recommendations using model all-MiniLM-L6-v2:
                                         primaryTitle startYear  \
1                                           Inception      2010   
2                      Demon Slayer: Kimetsu no Yaiba      2019   
3                                     Game of Thrones      2011   
4                             Raiders of the Lost Ark      1981   
5                                   Dragon Ball Z Kai      2009   
6      Star Wars: Episode V - The Empire Strikes Back      1980   
7   The Lord of the Rings: The Fellowship of the Ring      2001   
8                            Raya and the Last Dragon      2021   
9                                           Gladiator      2000   
10                                             Aliens      1986   
11                                       Tower of God      2020   
12  Pirates of the Caribbean: The Curse of the Bla...      2003   
13      The Lord of the Rings: The Return of the King      2003   
14              

In [105]:
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd

# Short model names (no 'sentence-transformers/' prefix). recommend_movies
# appends '_embeddings' to each name to find its DataFrame column and prepends
# 'sentence-transformers/' to load the model.
model_names = [
    'all-MiniLM-L6-v2',
    'paraphrase-albert-small-v2',
    'all-distilroberta-v1',
    'all-mpnet-base-v2'
]

def recommend_movies(user_query, df_movies, top_k=20, weight_similarity=0.6, weight_rating=0.2, weight_votes=0.2):
    """Rank titles for a processed query, restricted to the query's languages.

    Words of ``user_query`` that are keys of ``language_codes`` are treated as
    language codes; when at least one is present only titles whose 'language'
    is among them are scored. For each model in ``model_names`` the query is
    embedded, compared (cosine similarity) with the '<model>_embeddings'
    column, and the min-max normalised similarity is blended with
    ``norm_rating`` and ``norm_votes``. Titles must have a rating in
    (6.5, 9.5] and more than 5000 votes to be returned.

    ``df_movies`` is never modified, whether or not a language was detected.
    This definition shares its name with an earlier one in the notebook and
    replaces it once the cell has run.

    Args:
        user_query: Space-separated query terms.
        df_movies: DataFrame with the title columns, 'norm_rating',
            'norm_votes' and one '<model>_embeddings' column per model.
        top_k: Maximum number of rows per model.
        weight_similarity, weight_rating, weight_votes: Weights of the three
            components of the combined score.

    Returns:
        Dict mapping model name to a DataFrame indexed 1..n with the title
        columns plus 'similarity'. The tables are empty when no title matches
        the requested languages.
    """
    output_columns = ['primaryTitle', 'startYear', 'genres', 'language', 'averageRating', 'numVotes', 'similarity']

    # Split the processed_query into words
    query_parts = user_query.split()
    query_text = " ".join(query_parts)

    # Identify language codes in the query
    query_languages = [part for part in query_parts if part in language_codes]

    # Restrict to the requested languages; boolean indexing returns a new
    # frame, and nothing below writes to either frame.
    candidates = df_movies
    if query_languages:
        candidates = df_movies[df_movies['language'].isin(query_languages)]

    # No candidate titles: return empty tables instead of failing on the
    # first-row access / np.stack of an empty column.
    if candidates.empty:
        return {model_name: pd.DataFrame(columns=output_columns) for model_name in model_names}

    # Eligibility and the rating/vote part of the score are model-independent.
    rating = candidates['averageRating']
    eligible = ((rating > 6.5) & (rating <= 9.5) & (candidates['numVotes'] > 5000)).to_numpy()
    shortlist = candidates.loc[eligible, output_columns[:-1]]
    popularity_term = (weight_rating * candidates['norm_rating'].to_numpy() +
                       weight_votes * candidates['norm_votes'].to_numpy())

    recommendations = {}

    for model_name in model_names:
        model = SentenceTransformer(f'sentence-transformers/{model_name}')
        # Encode the user query into an embedding
        user_query_embedding = model.encode(query_text).reshape(1, -1)

        # np.stack accepts both lists and arrays, so no conversion of the
        # stored column is needed.
        title_embeddings = np.stack(candidates[model_name + '_embeddings'])
        similarities = cosine_similarity(user_query_embedding, title_embeddings).flatten()

        # Normalize similarity to [0, 1] over the candidate set; with a zero
        # range (e.g. a single candidate) the similarity component is dropped
        # rather than dividing by zero.
        lowest, highest = similarities.min(), similarities.max()
        if highest > lowest:
            norm_similarity = (similarities - lowest) / (highest - lowest)
        else:
            norm_similarity = np.zeros_like(similarities)

        combined_score = weight_similarity * norm_similarity + popularity_term

        top_recommendations = shortlist.assign(
            similarity=similarities[eligible],
            combined_score=combined_score[eligible],
        ).nlargest(top_k, 'combined_score')

        # Store the recommendations from the current model, ranked from 1;
        # the combined score is used for ranking only and not returned.
        recommendations[model_name] = top_recommendations[output_columns].set_index(
            pd.Index(range(1, len(top_recommendations) + 1)))

    return recommendations

# Produce one ranked table per model for the processed query
recommendations = recommend_movies(user_query=processed_query, df_movies=df_movies)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

---

# Access recommendations for a specific model

In [106]:
# Print a heading, then show the all-mpnet-base-v2 table; the bare expression
# on the last line is what the cell displays.
print("\n\nRecommendations Model --> all-mpnet-base-v2\n")
recommendations['all-mpnet-base-v2']




Recommendations Model --> all-mpnet-base-v2



Unnamed: 0,primaryTitle,startYear,genres,language,averageRating,numVotes,similarity
1,Demon Slayer: Kimetsu no Yaiba,2019,"['Action', 'Adventure', 'Animation']",ja,8.6,144058,0.70253
2,Dragon Ball Z Kai,2009,"['Action', 'Adventure', 'Animation']",ja,8.3,24468,0.70253
3,The Batman,2022,"['Action', 'Crime', 'Drama']",ja,7.8,775647,0.680983
4,Sanjuro,1962,"['Action', 'Drama', 'Thriller']",ja,8.0,40858,0.702046
5,Raya and the Last Dragon,2021,"['Action', 'Adventure', 'Animation']",ja,7.3,170150,0.70253
6,Tower of God,2020,"['Action', 'Adventure', 'Animation']",ja,7.6,7110,0.70253
7,Black Clover: Sword of the Wizard King,2023,"['Action', 'Adventure', 'Animation']",ja,7.4,7553,0.70253
8,Alice in Borderland,2020,"['Action', 'Drama', 'Mystery']",ja,7.7,95236,0.69533
9,Frieren: Beyond Journey's End,2023,"['Adventure', 'Animation', 'Drama']",ja,9.0,8099,0.68254
10,Chocolate,2008,"['Action', 'Drama']",ja,6.9,18825,0.704811


In [107]:
# Print a heading, then show the all-MiniLM-L6-v2 table; the bare expression
# on the last line is what the cell displays.
print("\n\nRecommendations Model --> all-MiniLM-L6-v2\n")
recommendations['all-MiniLM-L6-v2']




Recommendations Model --> all-MiniLM-L6-v2



Unnamed: 0,primaryTitle,startYear,genres,language,averageRating,numVotes,similarity
1,Demon Slayer: Kimetsu no Yaiba,2019,"['Action', 'Adventure', 'Animation']",ja,8.6,144058,0.748267
2,Dragon Ball Z Kai,2009,"['Action', 'Adventure', 'Animation']",ja,8.3,24468,0.748267
3,Raya and the Last Dragon,2021,"['Action', 'Adventure', 'Animation']",ja,7.3,170150,0.748267
4,Tower of God,2020,"['Action', 'Adventure', 'Animation']",ja,7.6,7110,0.748267
5,Black Clover: Sword of the Wizard King,2023,"['Action', 'Adventure', 'Animation']",ja,7.4,7553,0.748267
6,Frieren: Beyond Journey's End,2023,"['Adventure', 'Animation', 'Drama']",ja,9.0,8099,0.689547
7,Sing 2,2021,"['Adventure', 'Animation', 'Comedy']",ja,7.4,85640,0.689086
8,Little Nemo: Adventures in Slumberland,1989,"['Adventure', 'Animation', 'Comedy']",ja,7.1,8213,0.689086
9,Alice in Borderland,2020,"['Action', 'Drama', 'Mystery']",ja,7.7,95236,0.645471
10,Fire Force,2019,"['Action', 'Animation', 'Drama']",ja,7.6,10908,0.633595


In [108]:
# Print a heading, then show the paraphrase-albert-small-v2 table; the bare
# expression on the last line is what the cell displays.
print("\n\nRecommendations Model --> paraphrase-albert-small-v2\n")
recommendations['paraphrase-albert-small-v2']




Recommendations Model --> paraphrase-albert-small-v2



Unnamed: 0,primaryTitle,startYear,genres,language,averageRating,numVotes,similarity
1,Demon Slayer: Kimetsu no Yaiba,2019,"['Action', 'Adventure', 'Animation']",ja,8.6,144058,0.615266
2,Dragon Ball Z Kai,2009,"['Action', 'Adventure', 'Animation']",ja,8.3,24468,0.615266
3,Raya and the Last Dragon,2021,"['Action', 'Adventure', 'Animation']",ja,7.3,170150,0.615266
4,Tower of God,2020,"['Action', 'Adventure', 'Animation']",ja,7.6,7110,0.615266
5,Black Clover: Sword of the Wizard King,2023,"['Action', 'Adventure', 'Animation']",ja,7.4,7553,0.615266
6,Frieren: Beyond Journey's End,2023,"['Adventure', 'Animation', 'Drama']",ja,9.0,8099,0.57096
7,Sanjuro,1962,"['Action', 'Drama', 'Thriller']",ja,8.0,40858,0.562228
8,Chocolate,2008,"['Action', 'Drama']",ja,6.9,18825,0.565467
9,Mosul,2019,"['Action', 'Drama', 'Thriller']",ja,7.1,30225,0.562228
10,Sing 2,2021,"['Adventure', 'Animation', 'Comedy']",ja,7.4,85640,0.555391


In [109]:
# Print a heading, then show the all-distilroberta-v1 table; the bare
# expression on the last line is what the cell displays.
print("\n\nRecommendations Model --> all-distilroberta-v1\n")
recommendations['all-distilroberta-v1']




Recommendations Model --> all-distilroberta-v1



Unnamed: 0,primaryTitle,startYear,genres,language,averageRating,numVotes,similarity
1,Demon Slayer: Kimetsu no Yaiba,2019,"['Action', 'Adventure', 'Animation']",ja,8.6,144058,0.644191
2,Dragon Ball Z Kai,2009,"['Action', 'Adventure', 'Animation']",ja,8.3,24468,0.644191
3,Raya and the Last Dragon,2021,"['Action', 'Adventure', 'Animation']",ja,7.3,170150,0.644191
4,Tower of God,2020,"['Action', 'Adventure', 'Animation']",ja,7.6,7110,0.644191
5,Black Clover: Sword of the Wizard King,2023,"['Action', 'Adventure', 'Animation']",ja,7.4,7553,0.644191
6,Sing 2,2021,"['Adventure', 'Animation', 'Comedy']",ja,7.4,85640,0.599131
7,Little Nemo: Adventures in Slumberland,1989,"['Adventure', 'Animation', 'Comedy']",ja,7.1,8213,0.599131
8,Frieren: Beyond Journey's End,2023,"['Adventure', 'Animation', 'Drama']",ja,9.0,8099,0.55971
9,Miss Kobayashi's Dragon Maid,2017,"['Animation', 'Comedy', 'Fantasy']",ja,7.7,5135,0.538001
10,Healer,2014,"['Action', 'Comedy', 'Crime']",ja,8.5,10784,0.509205


---


# Model Ranking Analysis

In [110]:
# Rank models by the mean similarity of the titles they recommended
def rank_models_by_specificity(recommendations):
    """Print the models ordered by the average 'similarity' of their tables.

    ``recommendations`` maps model name to a DataFrame with a 'similarity'
    column. The mean of that column is the model's score; models are listed
    from highest to lowest score, numbered from 1, with the score shown to two
    decimals. Nothing is returned.
    """
    mean_similarity = {
        name: table['similarity'].mean()
        for name, table in recommendations.items()
    }

    def by_score(entry):
        return entry[1]

    ranked_models = sorted(mean_similarity.items(), key=by_score, reverse=True)

    print("Models ranked by specificity and relevance to user queries:")
    for rank, (model, score) in enumerate(ranked_models, start=1):
        print(f"{rank}. {model} --> Specificity Score: {score:.2f}")

# Example usage: prints the ranking for the per-model tables held in
# `recommendations` (the function returns nothing).
rank_models_by_specificity(recommendations)

Models ranked by specificity and relevance to user queries:
1. all-mpnet-base-v2 --> Specificity Score: 0.68
2. all-MiniLM-L6-v2 --> Specificity Score: 0.65
3. paraphrase-albert-small-v2 --> Specificity Score: 0.56
4. all-distilroberta-v1 --> Specificity Score: 0.55


---

# Saving Vectorized Dataset (in 4 parts)

In [None]:
# Saving the first part: rows [0, part_size), where part_size is a quarter of
# the row count (integer division)
part_size = len(df_movies) // 4
start_index, end_index = 0, part_size
temp_df = df_movies.iloc[start_index:end_index]

# Write the slice without the index column
temp_df.to_csv(path_or_buf='/kaggle/working/Embedded_Dataset_1.csv', index=False)
print('Part 1 saved as /kaggle/working/Embedded_Dataset_1.csv')


In [None]:
# Saving the second part: rows [part_size, 2 * part_size); part_size comes
# from the cell that saved the first part
start_index, end_index = part_size, 2 * part_size
temp_df = df_movies.iloc[start_index:end_index]

# Write the slice without the index column
temp_df.to_csv(path_or_buf='/kaggle/working/Embedded_Dataset_2.csv', index=False)
print('Part 2 saved as /kaggle/working/Embedded_Dataset_2.csv')


In [None]:
# Saving the third part: rows [2 * part_size, 3 * part_size); part_size comes
# from the cell that saved the first part
start_index, end_index = 2 * part_size, 3 * part_size
temp_df = df_movies.iloc[start_index:end_index]

# Write the slice without the index column
temp_df.to_csv(path_or_buf='/kaggle/working/Embedded_Dataset_3.csv', index=False)
print('Part 3 saved as /kaggle/working/Embedded_Dataset_3.csv')


In [None]:
# For the fourth part: everything from 3 * part_size to the last row, so any
# remainder of the integer division lands here
start_index = part_size * 3
temp_df = df_movies.iloc[start_index:len(df_movies)]

# Write the slice without the index column
temp_df.to_csv(path_or_buf='/kaggle/working/Embedded_Dataset_4.csv', index=False)
print('Part 4 saved as /kaggle/working/Embedded_Dataset_4.csv')


---

In [None]:
import pandas as pd

# Write the whole frame as a single bz2-compressed CSV, without the index
csv_file_path = '/kaggle/working/Embedded_Dataset.csv.bz2'
df_movies.to_csv(path_or_buf=csv_file_path, compression='bz2', index=False)

---