In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie dataset
data = pd.read_csv(r'D:\abdo\AI\projects\recommender\needed features\merged_data_new.csv')

# Convert data to a DataFrame
df = pd.DataFrame(data)

# Replace NaN values in the free-text columns with an empty string so that
# string concatenation below never produces NaN.
df['overview'] = df['overview'].fillna('')
df['reviews'] = df['reviews'].fillna('')
df['cast'] = df['cast'].fillna('')
df['genre'] = df['genre'].fillna('')
df['film_rate'] = df['film_rate'].fillna(0)  # Ensure film_rate column has no NaN values

# Capture the director text for the similarity features BEFORE inserting the
# human-readable placeholder: otherwise the placeholder words would be repeated
# into the features and make every movie without director data look alike.
_director_features = df['director'].fillna('')
df['director'] = df['director'].fillna('(no data about the director is found)')

# How many times each field is repeated in the combined text. Repetition raises
# the term frequency of that field's tokens and therefore its TF-IDF weight.
CAST_WEIGHT = 15
DIRECTOR_WEIGHT = 15
GENRE_WEIGHT = 20

# Combine 'title', 'overview', 'cast', 'director', 'genre', and 'reviews' for
# similarity comparison.
# - Each repeated field gets a trailing space before repetition; plain
#   `text * n` would glue the copies into one giant token ("DramaDrama...")
#   and the intended weighting would be lost.
# - A missing title is treated as empty text here (the 'title' column itself is
#   left untouched) so it does not turn the whole concatenation into NaN.
df['combined_features'] = (
    df['title'].fillna('') + ' '
    + df['overview'] + ' '
    + (df['cast'] + ' ') * CAST_WEIGHT
    + (_director_features + ' ') * DIRECTOR_WEIGHT
    + (df['genre'] + ' ') * GENRE_WEIGHT
    + df['reviews']
)

# Text Vectorization using TF-IDF (including bi-grams for better context)
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tfidf_matrix = tfidf.fit_transform(df['combined_features'].fillna(''))

# Calculate cosine similarity between all movies (row i / column j compares
# the i-th and j-th rows of df, by position)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Helper function to extract movie title
# Matches a trailing four-digit year in parentheses, e.g. " (1995)".
_YEAR_SUFFIX_RE = re.compile(r'\s*\(\d{4}\)$')


def extract_title(title):
    """Return a normalised form of ``title`` for case-insensitive matching.

    The value is converted to text, stripped of surrounding whitespace,
    lower-cased, and a trailing ``(YYYY)`` year suffix is removed.

    Args:
        title: The raw title; may be a string, a missing value (``None``/NaN),
            or another scalar such as a number.

    Returns:
        The normalised title, or ``''`` when ``title`` is missing.
    """
    if pd.isna(title):
        return ''
    # Strip first: the year pattern is anchored at the end of the string, so
    # trailing whitespace would otherwise keep the year from being removed.
    normalized = str(title).strip().lower()
    return _YEAR_SUFFIX_RE.sub('', normalized)

# Define the function to recommend movies
# Columns copied from the DataFrame into each recommendation dict.
_RECOMMENDATION_FIELDS = (
    'title',
    'film_rate',
    'imdbId',
    'cover_url',
    'director',
    'genre',
    'cast',
    'overview',
    'reviews',
)


def recommend_movies(movie_title, cosine_sim=cosine_sim, df=df, top_n=3, max_rating_diff=1):
    """Recommend movies similar to ``movie_title`` with a comparable rating.

    The title is normalised with ``extract_title`` and matched against the
    normalised ``df['title']`` values; the first matching row is the seed.
    Other rows are visited in order of decreasing similarity to the seed, and
    a row is kept when its ``film_rate`` differs from the seed's by at most
    ``max_rating_diff``.

    Args:
        movie_title: Title to look up (case-insensitive, optional "(YYYY)").
        cosine_sim: Square similarity matrix indexed by row POSITION in ``df``.
        df: DataFrame providing the columns in ``_RECOMMENDATION_FIELDS``.
        top_n: Maximum number of recommendations to return.
        max_rating_diff: Largest allowed absolute ``film_rate`` difference.

    Returns:
        A list of at most ``top_n`` dicts (keys: ``_RECOMMENDATION_FIELDS``),
        sorted by ``film_rate`` in descending order. The list is empty when
        the title is blank, is not found, or no row passes the rating filter.
    """
    movie_title_clean = extract_title(movie_title)
    if not movie_title_clean:
        # A blank query would otherwise match rows whose title is missing.
        return []

    # Normalise the titles once and reuse the result for the lookup.
    normalized_titles = df['title'].apply(extract_title)
    match_positions = (normalized_titles == movie_title_clean).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        return []  # Return an empty list if the movie is not found

    # Use the row POSITION, not the index label: cosine_sim rows and df.iloc
    # are both positional, so a non-default index must not leak in here.
    idx = int(match_positions[0])
    input_movie_rating = df['film_rate'].iloc[idx]

    # Rank every row by similarity to the seed, most similar first.
    ranked = sorted(enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True)

    recommended_movies = []
    for position, _similarity in ranked:
        if len(recommended_movies) >= top_n:
            break
        # Skip the seed explicitly; it is not guaranteed to be ranked first
        # when another row ties with it (e.g. duplicated entries).
        if position == idx:
            continue
        movie_data = df.iloc[position]
        if abs(movie_data['film_rate'] - input_movie_rating) > max_rating_diff:
            continue
        recommended_movies.append(
            {field: movie_data[field] for field in _RECOMMENDATION_FIELDS}
        )

    # Present the selected movies from highest to lowest rating.
    recommended_movies.sort(key=lambda movie: movie['film_rate'], reverse=True)
    return recommended_movies

# Example usage: look up recommendations for the title "Heat"
test = 'heat'
recommended_movies = recommend_movies(test)

# A non-empty result is printed movie by movie; an empty result is reported
# as the title not being in the dataset.
if recommended_movies:
    print(f"Movies recommended based on '{test}':")
    for movie in recommended_movies:
        # One print call, one line per argument, ending with a separator rule
        print(
            f"Title: {movie['title']}, Rating: {movie['film_rate']}",
            f"Director: {movie['director']}",
            f"Genre: {movie['genre']}",
            "-" * 80,
            sep="\n",
        )
else:
    print(f"'{test}' is not listed in our database.")


In [None]:
# Example usage: look up recommendations for the title "Clockers"
test = 'Clockers'
recommended_movies = recommend_movies(test)

# A non-empty result is printed movie by movie; an empty result is reported
# as the title not being in the dataset.
if recommended_movies:
    print(f"Movies recommended based on '{test}':")
    for movie in recommended_movies:
        # One print call, one line per argument, ending with a separator rule
        print(
            f"Title: {movie['title']}, Rating: {movie['film_rate']}",
            f"Director: {movie['director']}",
            f"Genre: {movie['genre']}",
            "-" * 80,
            sep="\n",
        )
else:
    print(f"'{test}' is not listed in our database.")