# install required libraries

In [None]:
!pip install tf-keras transformers sentence-transformers

In [10]:
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re

In [11]:
from transformers import AutoTokenizer, AutoModel

# Load the tokenizer and encoder used to embed the movie features.
# NOTE(review): this cell used to load "KevinJuanC/MovieRecommendationSystem"
# first and then immediately overwrite both objects with the checkpoint below,
# so that first download was never used. Only the effective load is kept.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)




# read the data

In [12]:
# Step 1: read the movie metadata CSV into a DataFrame
csv_path = r"D:\abdo\AI\projects\recommender\needed features\movies_with_tags_rating.csv"  # ensure the right path
df = pd.read_csv(csv_path)

## clean movie titles

In [13]:
def clean_movie_title(title):
    """Normalise a movie title for case-insensitive lookups.

    The title is converted to ``str``, lower-cased, stripped of any
    parenthesised segments (e.g. a trailing ``(1995)``), and has runs of
    whitespace collapsed to a single space.

    Args:
        title: Any value; it is passed through ``str()`` first.

    Returns:
        str: The normalised title with no leading/trailing whitespace.
    """
    lowered = str(title).lower()
    # Replace each parenthetical with a single space instead of nothing, so the
    # words on either side are not fused ("wars (iv) a" -> "wars a", not "warsa").
    without_parentheticals = re.sub(r'\s*\(.*?\)\s*', ' ', lowered)
    # Collapse the whitespace left behind and trim the ends.
    return re.sub(r'\s+', ' ', without_parentheticals).strip()

# Store the normalised form of every title in its own lookup column
df['cleaned_title'] = df['title'].map(clean_movie_title)

## extract embeddings using DistilBERT

In [5]:
# Function to extract embeddings using DistilBERT for different features
def get_feature_embedding(feature):
    """Embed one text feature as the mean of the encoder's token vectors.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        feature: The text to embed. Anything that is not a ``str`` (including
            NaN/None and list-like values) is treated as missing.

    Returns:
        numpy.ndarray: A 1-D vector. For missing input this is ``np.zeros(768)``
        (768 is assumed to equal the encoder's hidden size -- confirm if the
        checkpoint changes).

    Note:
        Only real tokens are averaged. Embeddings computed by an earlier
        version that padded every input to 512 tokens and averaged over the
        padding as well are not comparable; regenerate any cached ``.npy``
        matrices after changing this function.
    """
    # Check the type first: pd.isna() on a list/array returns an array whose
    # truth value is ambiguous and would raise inside an `or`.
    if not isinstance(feature, str) or pd.isna(feature):
        return np.zeros(768)  # Return a zero vector if input is NaN or not a string

    # Tokenize the input. A single sequence needs no padding; padding to
    # max_length only adds positions that would pollute the mean below.
    inputs = tokenizer(
        feature,
        return_tensors="pt",    # Return PyTorch tensors
        truncation=True,         # Automatically truncate to the max length
        max_length=512,          # Maximum length of tokens
    )

    with torch.no_grad():  # Disable gradient calculations for inference
        outputs = model(**inputs)
        token_vectors = outputs.last_hidden_state

        # Mask-aware mean pooling: zero out any padded positions and divide by
        # the number of real tokens rather than the sequence length.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_vectors.dtype)
        token_count = mask.sum(dim=1).clamp(min=1)
        pooled = (token_vectors * mask).sum(dim=1) / token_count

    return pooled.squeeze(0).cpu().numpy()

### Extract embeddings for all movie features

In [6]:
# Embed each text feature row by row. Every call runs the encoder once, so this
# cell performs four model passes per movie.
df['overview_embedding'] = df['overview'].apply(get_feature_embedding)
df['genre_embedding'] = df['genre'].apply(get_feature_embedding)
df['cast_embedding'] = df['cast'].apply(get_feature_embedding)
df['director_embedding'] = df['director'].apply(get_feature_embedding)


# Step 4: Create an embedding matrix for each feature
# (np.stack turns each column of per-row vectors into one 2-D array, one row per movie)
overview_embedding_matrix = np.stack(df['overview_embedding'].values)
genre_embedding_matrix = np.stack(df['genre_embedding'].values)
cast_embedding_matrix = np.stack(df['cast_embedding'].values)
director_embedding_matrix = np.stack(df['director_embedding'].values)

KeyboardInterrupt: 

## save the embedding matrix

In [None]:
# Persist each per-feature matrix as '<feature>_embedding_matrix.npy'
for feature_name, feature_matrix in (
    ('overview', overview_embedding_matrix),
    ('genre', genre_embedding_matrix),
    ('cast', cast_embedding_matrix),
    ('director', director_embedding_matrix),
):
    np.save(f'{feature_name}_embedding_matrix.npy', feature_matrix)

## combining features

In [None]:
def combine_embeddings(*embeddings):
    """Return the element-wise average of the given embedding vectors."""
    stacked = np.asarray(embeddings)
    return stacked.mean(axis=0)

# Step 5: average the four per-feature vectors of every row into one vector
source_columns = [
    'overview_embedding',
    'genre_embedding',
    'cast_embedding',
    'director_embedding',
]
df['combined_embedding'] = df.apply(
    lambda row: combine_embeddings(*(row[column] for column in source_columns)),
    axis=1,
)

# Stack the per-row vectors into a single 2-D array and write it to disk
combined_embedding_matrix = np.stack(df['combined_embedding'].values)
np.save('combined_embedding_matrix.npy', combined_embedding_matrix)

## Load the saved embedding matrices (if they exist)

In [14]:
# Load the matrices written by the "save" cells instead of recomputing them.
# These module-level names are read directly by recommend_similar_movies.
overview_embedding_matrix = np.load('overview_embedding_matrix.npy')
genre_embedding_matrix = np.load('genre_embedding_matrix.npy')
cast_embedding_matrix = np.load('cast_embedding_matrix.npy')
director_embedding_matrix = np.load('director_embedding_matrix.npy')
combined_embedding_matrix = np.load('combined_embedding_matrix.npy')

# Attach one vector per row to the DataFrame.
# NOTE(review): assumes row i of every matrix belongs to row i of df, i.e. the
# CSV has not been reordered or filtered since the matrices were saved -- confirm.
df['overview_embedding'] = list(overview_embedding_matrix)
df['genre_embedding'] = list(genre_embedding_matrix)
df['cast_embedding'] = list(cast_embedding_matrix)
df['director_embedding'] = list(director_embedding_matrix)
df['combined_embedding'] = list(combined_embedding_matrix)

# recommendation system

In [15]:
def recommend_similar_movies(movie_title, df, feature='overview', top_n=3):
    """Return the ``top_n`` movies most similar to ``movie_title``.

    Similarity is the cosine similarity between the query movie's embedding and
    every row's embedding in the ``'<feature>_embedding'`` column of ``df``.

    Args:
        movie_title: Title to look up; it is normalised with
            ``clean_movie_title`` and matched against ``df['cleaned_title']``.
            When several rows match, the first one is used.
        df: DataFrame holding ``cleaned_title``, ``title``, ``genre``,
            ``rating`` and the ``'<feature>_embedding'`` columns.
        feature: One of ``'overview'``, ``'genre'``, ``'cast'``,
            ``'director'`` or ``'combined'``.
        top_n: Maximum number of recommendations to return.

    Returns:
        pandas.DataFrame with columns ``title``, ``genre`` and ``rating``
        ordered from most to least similar, or ``None`` (after printing a
        message) when the title is not found or ``feature`` is invalid.
    """
    cleaned_movie_title = clean_movie_title(movie_title)

    # Work with row *positions*: the similarity vector is positional, while
    # df.index labels need not be 0..n-1.
    match_positions = np.flatnonzero((df['cleaned_title'] == cleaned_movie_title).to_numpy())
    if match_positions.size == 0:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return None

    if feature not in ('overview', 'genre', 'cast', 'director', 'combined'):
        print("Invalid feature specified.")
        return None

    query_position = int(match_positions[0])

    # Build the matrix from the DataFrame that was passed in, so the rows being
    # ranked always line up with the rows being returned.
    embedding_matrix = np.stack(df[f'{feature}_embedding'].to_numpy())
    input_embedding = embedding_matrix[query_position]

    # Compute cosine similarity and sort similar movies
    similarities = cosine_similarity([input_embedding], embedding_matrix)[0]
    ranked_positions = np.argsort(similarities)[::-1]

    # Drop the query explicitly. Skipping rank 0 is not enough: with ties or an
    # all-zero embedding the query is not guaranteed to be ranked first.
    ranked_positions = ranked_positions[ranked_positions != query_position][:top_n]

    similar_movies = df.iloc[ranked_positions]
    return similar_movies[['title', 'genre', 'rating']]

# model operation

In [21]:
# Ask the user which movie to look up.
test = input('enter your favourite movie: ')

# Run one lookup per feature. Each call returns a DataFrame, or None after
# printing its own "not found" / "invalid feature" message.
recommended_movies_overview = recommend_similar_movies(test, df, feature='overview', top_n=3)
recommended_movies_genre = recommend_similar_movies(test , df, feature='genre', top_n=3)
recommended_movies_cast = recommend_similar_movies(test, df, feature='cast', top_n=3)
recommended_movies_director = recommend_similar_movies(test, df, feature='director', top_n=3)
recommended_movies_combined = recommend_similar_movies(test, df, feature='combined', top_n=3)


# Print a heading for every feature; the table is printed only when the lookup
# produced a result, so a missing title yields headings with nothing under them.
print(f"Movies similar to {test} based on Overview:")
if recommended_movies_overview is not None:
    print(recommended_movies_overview)

print(f"\nMovies similar to {test}  based on Genre:")
if recommended_movies_genre is not None:
    print(recommended_movies_genre)

print(f"\nMovies similar to {test}  based on Cast:")
if recommended_movies_cast is not None:
    print(recommended_movies_cast)

print(f"\nMovies similar to {test}  based on Director:")
if recommended_movies_director is not None:
    print(recommended_movies_director)

print(f"\nMovies similar to {test}  based on Combined Features:")
if recommended_movies_combined is not None:
    print(recommended_movies_combined)

Movie 'me before you' not found in the dataset.
Movie 'me before you' not found in the dataset.
Movie 'me before you' not found in the dataset.
Movie 'me before you' not found in the dataset.
Movie 'me before you' not found in the dataset.
Movies similar to me before you based on Overview:

Movies similar to me before you  based on Genre:

Movies similar to me before you  based on Cast:

Movies similar to me before you  based on Director:

Movies similar to me before you  based on Combined Features:


# final code 

In [1]:
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re

# Step 1: read the movie metadata CSV
csv_path = r"D:\abdo\AI\projects\recommender\needed features\movies_with_tags_rating.csv"
df = pd.read_csv(csv_path)

# Step 2: load the DistilBERT encoder and its matching tokenizer
model_name = "distilbert-base-uncased"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to clean movie titles
def clean_movie_title(title):
    """Normalise a movie title for case-insensitive lookups.

    The title is converted to ``str``, lower-cased, stripped of any
    parenthesised segments (e.g. a trailing ``(1995)``), and has runs of
    whitespace collapsed to a single space.

    Args:
        title: Any value; it is passed through ``str()`` first.

    Returns:
        str: The normalised title with no leading/trailing whitespace.
    """
    lowered = str(title).lower()
    # Replace each parenthetical with a single space instead of nothing, so the
    # words on either side are not fused ("wars (iv) a" -> "wars a", not "warsa").
    without_parentheticals = re.sub(r'\s*\(.*?\)\s*', ' ', lowered)
    # Collapse the whitespace left behind and trim the ends.
    return re.sub(r'\s+', ' ', without_parentheticals).strip()

# Store the normalised form of every title in its own lookup column
df['cleaned_title'] = df['title'].map(clean_movie_title)

# Function to extract embeddings using DistilBERT for different features
def get_feature_embedding(feature):
    """Embed one text feature as the mean of the encoder's token vectors.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        feature: The text to embed. Anything that is not a ``str`` (including
            NaN/None and list-like values) is treated as missing.

    Returns:
        numpy.ndarray: A 1-D vector. For missing input this is ``np.zeros(768)``
        (768 is assumed to equal the encoder's hidden size -- confirm if the
        checkpoint changes).

    Note:
        Only real tokens are averaged. Embeddings computed by an earlier
        version that padded every input to 512 tokens and averaged over the
        padding as well are not comparable; regenerate any cached ``.npy``
        matrices after changing this function.
    """
    # Check the type first: pd.isna() on a list/array returns an array whose
    # truth value is ambiguous and would raise inside an `or`.
    if not isinstance(feature, str) or pd.isna(feature):
        return np.zeros(768)  # Return a zero vector if input is NaN or not a string

    # Tokenize the input. A single sequence needs no padding; padding to
    # max_length only adds positions that would pollute the mean below.
    inputs = tokenizer(
        feature,
        return_tensors="pt",    # Return PyTorch tensors
        truncation=True,         # Automatically truncate to the max length
        max_length=512,          # Maximum length of tokens
    )

    with torch.no_grad():  # Disable gradient calculations for inference
        outputs = model(**inputs)
        token_vectors = outputs.last_hidden_state

        # Mask-aware mean pooling: zero out any padded positions and divide by
        # the number of real tokens rather than the sequence length.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_vectors.dtype)
        token_count = mask.sum(dim=1).clamp(min=1)
        pooled = (token_vectors * mask).sum(dim=1) / token_count

    return pooled.squeeze(0).cpu().numpy()

# The triple-quoted string below is the embedding-generation code, disabled by
# wrapping it in a string literal. It is evaluated and discarded, so none of it
# runs (and combine_embeddings is NOT defined by this cell). The matrices are
# loaded from the .npy files further down instead.
"""
# Step 3: Extract embeddings for all movie features
df['overview_embedding'] = df['overview'].apply(get_feature_embedding)
df['genre_embedding'] = df['genre'].apply(get_feature_embedding)
df['cast_embedding'] = df['cast'].apply(get_feature_embedding)
df['director_embedding'] = df['director'].apply(get_feature_embedding)

# Step 4: Create an embedding matrix for each feature
overview_embedding_matrix = np.stack(df['overview_embedding'].values)
genre_embedding_matrix = np.stack(df['genre_embedding'].values)
cast_embedding_matrix = np.stack(df['cast_embedding'].values)
director_embedding_matrix = np.stack(df['director_embedding'].values)


# Function to combine multiple embeddings (e.g., averaging)
def combine_embeddings(*embeddings):
    return np.mean(embeddings, axis=0)

# Step 5: Generate a combined embedding
df['combined_embedding'] = df.apply(
    lambda row: combine_embeddings(
        row['overview_embedding'],
        row['genre_embedding'],
        row['cast_embedding'],
        row['director_embedding']
    ),
    axis=1
)

"""

# Load the precomputed matrices from the current working directory.
overview_embedding_matrix=np.load('overview_embedding_matrix.npy')

genre_embedding_matrix=np.load('genre_embedding_matrix.npy')

cast_embedding_matrix=np.load('cast_embedding_matrix.npy')

director_embedding_matrix=np.load('director_embedding_matrix.npy')

combined_embedding_matrix=np.load('combined_embedding_matrix.npy')

# Attach one vector per row to the DataFrame.
# NOTE(review): assumes row i of every matrix belongs to row i of df -- confirm
# the CSV has not changed since the matrices were saved.
df['overview_embedding'] = list(overview_embedding_matrix)
df['genre_embedding'] = list(genre_embedding_matrix)
df['cast_embedding'] = list(cast_embedding_matrix)
df['director_embedding'] = list(director_embedding_matrix)
df['combined_embedding'] = list(combined_embedding_matrix)


# Function to recommend similar movies based on a specified feature or combined features
def recommend_similar_movies(movie_title, df, feature='overview', top_n=3):
    """Return the ``top_n`` movies most similar to ``movie_title``.

    Similarity is the cosine similarity between the query movie's embedding and
    every row's embedding in the ``'<feature>_embedding'`` column of ``df``.

    Args:
        movie_title: Title to look up; it is normalised with
            ``clean_movie_title`` and matched against ``df['cleaned_title']``.
            When several rows match, the first one is used.
        df: DataFrame holding ``cleaned_title``, ``title``, ``genre``,
            ``rating`` and the ``'<feature>_embedding'`` columns.
        feature: One of ``'overview'``, ``'genre'``, ``'cast'``,
            ``'director'`` or ``'combined'``.
        top_n: Maximum number of recommendations to return.

    Returns:
        pandas.DataFrame with columns ``title``, ``genre`` and ``rating``
        ordered from most to least similar, or ``None`` (after printing a
        message) when the title is not found or ``feature`` is invalid.
    """
    cleaned_movie_title = clean_movie_title(movie_title)

    # Work with row *positions*: the similarity vector is positional, while
    # df.index labels need not be 0..n-1.
    match_positions = np.flatnonzero((df['cleaned_title'] == cleaned_movie_title).to_numpy())
    if match_positions.size == 0:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return None

    if feature not in ('overview', 'genre', 'cast', 'director', 'combined'):
        print("Invalid feature specified.")
        return None

    query_position = int(match_positions[0])

    # Build the matrix from the DataFrame that was passed in, so the rows being
    # ranked always line up with the rows being returned.
    embedding_matrix = np.stack(df[f'{feature}_embedding'].to_numpy())
    input_embedding = embedding_matrix[query_position]

    # Compute cosine similarity and sort similar movies
    similarities = cosine_similarity([input_embedding], embedding_matrix)[0]
    ranked_positions = np.argsort(similarities)[::-1]

    # Drop the query explicitly. Skipping rank 0 is not enough: with ties or an
    # all-zero embedding the query is not guaranteed to be ranked first.
    ranked_positions = ranked_positions[ranked_positions != query_position][:top_n]

    similar_movies = df.iloc[ranked_positions]
    return similar_movies[['title', 'genre', 'rating']]

test = "inception"
# Example: recommend movies similar to the title stored in `test`, once per feature.
# Each call returns a DataFrame, or None after printing its own message.
recommended_movies_overview = recommend_similar_movies(test, df, feature='overview', top_n=3)
recommended_movies_genre = recommend_similar_movies(test , df, feature='genre', top_n=3)
recommended_movies_cast = recommend_similar_movies(test, df, feature='cast', top_n=3)
recommended_movies_director = recommend_similar_movies(test, df, feature='director', top_n=3)
recommended_movies_combined = recommend_similar_movies(test, df, feature='combined', top_n=3)

# Display recommendations (a heading is always printed; the table only when a
# result was returned)
print(f"Movies similar to {test} based on Overview:")
if recommended_movies_overview is not None:
    print(recommended_movies_overview)

print(f"\nMovies similar to {test}  based on Genre:")
if recommended_movies_genre is not None:
    print(recommended_movies_genre)

print(f"\nMovies similar to {test}  based on Cast:")
if recommended_movies_cast is not None:
    print(recommended_movies_cast)

print(f"\nMovies similar to {test}  based on Director:")
if recommended_movies_director is not None:
    print(recommended_movies_director)

print(f"\nMovies similar to {test}  based on Combined Features:")
if recommended_movies_combined is not None:
    print(recommended_movies_combined)




Movies similar to inception based on Overview:
              title                                       genre  rating
8431   The Mechanic                     Action, Crime, Thriller    3.25
4680  Our Man Flint  Action, Adventure, Comedy, Fantasy, Sci-Fi    3.20
4497      Hopscotch                           Adventure, Comedy    3.55

Movies similar to inception  based on Genre:
             title                                genre  rating
1199        Aliens  Action, Adventure, Sci-Fi, Thriller     4.2
2075  The Avengers  Action, Adventure, Sci-Fi, Thriller     1.9
2077  The Avengers  Action, Adventure, Sci-Fi, Thriller     4.0

Movies similar to inception  based on Cast:
                           title                                genre  rating
3066                   The Beach  Adventure, Drama, Romance, Thriller    3.30
2467  10 Things I Hate About You               Comedy, Drama, Romance    3.65
8082          500 Days of Summer               Comedy, Drama, Romance    3.85

Movie

## Handling movies that are missing from the dataset

In [17]:
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re
import requests

# Load the dataset
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
df = pd.read_csv(r"D:\abdo\AI\projects\recommender\needed features\movies_with_tags_rating.csv")

# Initialize DistilBERT model and tokenizer
# NOTE(review): nothing else in this cell calls `tokenizer` or `model` (the
# embeddings are loaded from .npy files below) -- confirm whether this load is
# still needed here.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Clean movie titles
def clean_movie_title(title):
    """Normalise a movie title for case-insensitive lookups.

    The title is converted to ``str``, lower-cased, stripped of any
    parenthesised segments (e.g. a trailing ``(1995)``), and has runs of
    whitespace collapsed to a single space.

    Args:
        title: Any value; it is passed through ``str()`` first.

    Returns:
        str: The normalised title with no leading/trailing whitespace.
    """
    # Substitute a space (not an empty string) for each parenthetical so the
    # neighbouring words are not fused, then collapse the leftover whitespace.
    without_parentheticals = re.sub(r'\s*\(.*?\)\s*', ' ', str(title).lower())
    return re.sub(r'\s+', ' ', without_parentheticals).strip()

# Apply title cleaning to the dataset
df['cleaned_title'] = df['title'].apply(clean_movie_title)

# Load precomputed embedding matrices
# Each '<feature>_embedding_matrix.npy' file is read from the current working
# directory and attached to df as one vector per row.
# NOTE(review): assumes the matrices' row order matches df's row order -- confirm.
embedding_features = ['overview', 'genre', 'cast', 'director', 'combined']
for feature in embedding_features:
    matrix = np.load(f'{feature}_embedding_matrix.npy')
    df[f'{feature}_embedding'] = list(matrix)

# Fetch similar movies from TMDb
def fetch_tmdb_recommendations(movie_title, top_n=3, api_key=None):
    """Look a title up on TMDb and return the movies TMDb lists as similar.

    Three requests are made: a title search, the "similar" list for the first
    search hit, and the genre list used to translate genre ids to names.

    Args:
        movie_title: Free-text title to search for.
        top_n: Maximum number of rows to return (default 3).
        api_key: TMDb API key. When omitted, the ``TMDB_API_KEY`` environment
            variable is used, then the key that was previously hard-coded here.

    Returns:
        pandas.DataFrame with ``title``, ``genre`` and ``rating`` columns (it
        may be empty), or ``None`` after printing a message when the search
        has no hits or a request fails.
    """
    import os  # local import: only this function needs it

    # NOTE(review): the literal below is a credential committed to the
    # notebook. It is kept only so existing runs keep working -- rotate it and
    # supply the key through TMDB_API_KEY (or `api_key`) instead.
    api_key = api_key or os.environ.get("TMDB_API_KEY") or "ddad317e776c8ec2f92ec52efe9d34f5"
    base_url = "https://api.themoviedb.org/3"

    def _get_json(path, **params):
        # Passing values through `params=` lets requests URL-encode them, so
        # titles containing '&', '#', '?' or spaces do not corrupt the query.
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(
            f"{base_url}{path}",
            params={"api_key": api_key, **params},
            timeout=10,
        )
        return response.json()

    try:
        search_response = _get_json("/search/movie", query=movie_title)
        search_results = search_response.get('results') if isinstance(search_response, dict) else None
        if not search_results:
            print(f"No results found for '{movie_title}' in TMDb. Response: {search_response}")
            return None

        movie_id = search_results[0]['id']
        recommendations_response = _get_json(f"/movie/{movie_id}/similar")
        genres_response = _get_json("/genre/movie/list")
    except (requests.RequestException, ValueError) as error:
        # Network failure, timeout, or a body that is not valid JSON.
        print(f"TMDb request for '{movie_title}' failed: {error}")
        return None

    # Error payloads carry no 'genres' / 'results' key; treat them as empty
    # instead of raising KeyError.
    genre_mapping = {genre['id']: genre['name'] for genre in genres_response.get('genres', [])}

    similar_movies = [
        {
            'title': movie['title'],
            'genre': ', '.join(
                genre_mapping.get(genre_id, "Unknown") for genre_id in movie.get('genre_ids', [])
            ),
            'rating': movie.get('vote_average', 'N/A'),
        }
        for movie in recommendations_response.get('results', [])
    ]

    # Keep only the first `top_n` entries, in the order TMDb returned them.
    return pd.DataFrame(similar_movies).head(top_n)

# Recommend similar movies
def recommend_similar_movies(movie_title, df, feature='overview', top_n=3):
    """Print recommendations for ``movie_title``.

    When the title is in ``df``, one table of the ``top_n`` most similar movies
    is printed for each of the five embedding features. Otherwise
    ``fetch_tmdb_recommendations`` is called and its result is printed.

    Args:
        movie_title: Title to look up; it is normalised with
            ``clean_movie_title`` and matched against ``df['cleaned_title']``.
            When several rows match, the first one is used.
        df: DataFrame holding ``cleaned_title``, ``title``, ``genre``,
            ``rating`` and the five ``'<feature>_embedding'`` columns.
        feature: Accepted for backward compatibility but not used; every
            feature is always reported.
        top_n: Maximum number of dataset recommendations per feature.

    Returns:
        None. All results are printed.
    """
    cleaned_movie_title = clean_movie_title(movie_title)

    # Work with row *positions*: the similarity vector is positional, while
    # df.index labels need not be 0..n-1.
    match_positions = np.flatnonzero((df['cleaned_title'] == cleaned_movie_title).to_numpy())

    if match_positions.size == 0:
        print(f"Movie '{movie_title}' not found in the dataset. Fetching recommendations from TMDb...")
        recommendations = fetch_tmdb_recommendations(movie_title)
        if recommendations is not None and not recommendations.empty:
            print(f"\nTop 3 movies similar to '{movie_title}' based on TMDb:")
            print(recommendations.to_string(index=False))
        else:
            print(f"No recommendations found for '{movie_title}' in TMDb.")
        return

    query_position = int(match_positions[0])

    # Recommend based on different features. The loop variable has its own
    # name so it no longer shadows the `feature` parameter.
    for feature_name in ('overview', 'genre', 'cast', 'director', 'combined'):
        embedding_matrix = np.stack(df[f'{feature_name}_embedding'].to_numpy())
        input_embedding = embedding_matrix[query_position]

        similarities = cosine_similarity([input_embedding], embedding_matrix)[0]
        ranked_positions = np.argsort(similarities)[::-1]

        # Drop the query explicitly. Skipping rank 0 is not enough: with ties
        # or an all-zero embedding the query need not be ranked first.
        ranked_positions = ranked_positions[ranked_positions != query_position][:top_n]

        similar_movies = df.iloc[ranked_positions]
        print(f"\nMovies similar to {movie_title} based on {feature_name.capitalize()}:")
        print(similar_movies[['title', 'genre', 'rating']].to_string(index=False))

# Test the recommendation system
# Prompts for a title, then prints either the dataset-based recommendations or
# the TMDb fallback; the function itself returns nothing.
test_movie = input('enter your favourite movie: ')
recommend_similar_movies(test_movie, df)




Movie 'me before you' not found in the dataset. Fetching recommendations from TMDb...

Top 3 movies similar to 'me before you' based on TMDb:
           title         genre  rating
       Priehrada         Drama   2.000
      Blood Knot         Drama   0.000
Summer in Berlin Comedy, Drama   6.557
