In [10]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the movie table from the CSV file on disk
data = pd.read_csv(r'D:\abdo\AI\projects\recommender\needed features\imdb_data.csv')

# Wrap the loaded table in a DataFrame
df = pd.DataFrame(data)

# Missing text in the feature columns becomes '' so the string concatenation
# below never yields NaN
for text_column in ('overview', 'reviews', 'cast'):
    df[text_column] = df[text_column].fillna('')

# One text blob per movie: overview, then cast, then reviews, space-separated
df['combined_features'] = df['overview'] + ' ' + df['cast'] + ' ' + df['reviews']

# TF-IDF over unigrams and bigrams, with English stop words removed
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features'].fillna(''))

# Pairwise cosine similarity: entry [i, j] compares row i with row j of tfidf_matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def recommend_movies(movie_title, cosine_sim=cosine_sim, df=df, top_n=3):
    """Return the movies most similar to ``movie_title``.

    Titles are compared after normalization (see ``extract_title``), so the
    lookup is case-insensitive and ignores a trailing ``(YYYY)`` year.

    Parameters
    ----------
    movie_title : str
        Title to look up in ``df['title']``.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` must correspond to the ``i``-th
        row (by position) of ``df``. The default is bound when this function
        is defined, so re-run the definition after recomputing the matrix.
    df : pandas.DataFrame
        Must contain ``'title'`` and ``'film_rate'`` columns. The default is
        bound at definition time as well.
    top_n : int, default 3
        Maximum number of recommendations to return.

    Returns
    -------
    list of tuple
        Up to ``top_n`` ``(title, film_rate)`` pairs in descending similarity
        order, never including the matched movie itself. An empty list is
        returned when the title is empty, is not found, or ``top_n <= 0``.
    """
    # Lazily capture everything from the start of the string up to (but not
    # including) the first ",<digits>." sequence, or up to the end of string.
    title_pattern = re.compile(r'^(.+?)(?=,\d+\.|$)')

    def extract_title(title):
        """Normalize a raw title: lowercase, drop a trailing year, trim."""
        if pd.isna(title):
            return ''
        # str() first so a non-string title does not crash on .lower()
        title = str(title).lower()
        title = re.sub(r'\s*\(\d{4}\)$', '', title)  # Remove a trailing "(YYYY)"
        match = title_pattern.search(title)
        if match:
            return match.group(1).strip()
        return title

    query = extract_title(movie_title)
    # An empty query would otherwise match every row whose title is missing.
    if not query or top_n <= 0:
        return []

    # Normalize the title column once and reuse it for the lookup.
    normalized_titles = df['title'].apply(extract_title)

    # Work with row *positions*, not index labels: cosine_sim rows are
    # positional, so a label from df.index would be wrong for any DataFrame
    # whose index is not the default 0..n-1 range.
    match_positions = (normalized_titles == query).to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        return []  # Movie not found
    movie_pos = int(match_positions[0])

    # Rank every movie by its similarity to the matched one (stable sort, so
    # ties keep their original row order).
    ranked = sorted(
        enumerate(cosine_sim[movie_pos]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    recommended_movies = []
    for other_pos, _similarity in ranked:
        # Skip the query movie explicitly; it is not guaranteed to sort first
        # when other rows tie with it.
        if other_pos == movie_pos:
            continue
        row = df.iloc[other_pos]
        recommended_movies.append((row['title'], row['film_rate']))
        if len(recommended_movies) == top_n:
            break

    return recommended_movies

# Example: recommend movies similar to "heat".
# recommend_movies lowercases titles before comparing, so the lowercase query is fine.

test='heat'
# recommend_movies returns a list of (title, film_rate) tuples; it is empty when the title is not found
recommended_movies_1 = recommend_movies(test)
print(f"Movies recommended based on {test}:")
for title, rating in recommended_movies_1:
    print(f"Title: {title}, Rating: {rating}")

Movies recommended based on heat:
Title: The Reader, Rating: 7.6
Title: Righteous Kill, Rating: 6.0
Title: Takers, Rating: 6.2


In [13]:
# Example: same lookup with a mixed-case, multi-word title.
# Note: this rebinds `test` and `recommended_movies_1` from the previous example.
test='Before and After'
recommended_movies_1 = recommend_movies(test)
print(f"Movies recommended based on {test}:")
for title, rating in recommended_movies_1:
    print(f"Title: {title}, Rating: {rating}")

Movies recommended based on Before and After:
Title: One True Thing, Rating: 6.9
Title: Like Crazy, Rating: 6.6
Title: Music of the Heart, Rating: 6.7
