# In [7]:
import pandas as pd
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Read the TMDB movie metadata CSV into a DataFrame
movies_df = pd.read_csv(filepath_or_buffer='tmdb_5000_movies.csv')

# Step 2: Data Preprocessing
# Helper function to extract relevant names from JSON-like strings
def extract_names(data):
    """
    Extract the 'name' values from a stringified list of dictionaries.

    Args:
        data: A string containing a Python-literal list of dictionaries,
            e.g. '[{"id": 28, "name": "Action"}]'.

    Returns:
        A list with the 'name' value of every dictionary entry that has one,
        in their original order. An empty list is returned when ``data``
        cannot be parsed (including non-string input such as NaN) or when it
        does not evaluate to a list or tuple.
    """
    try:
        parsed = ast.literal_eval(data)  # Convert string to list of dictionaries
    except (ValueError, SyntaxError, TypeError):
        return []

    # A well-formed literal of the wrong shape (e.g. "5" or a bare dict) must
    # not blow up the surrounding DataFrame.apply call.
    if not isinstance(parsed, (list, tuple)):
        return []

    # Skip entries that are not dictionaries or that lack a 'name' key instead
    # of failing on the first malformed element.
    return [
        item['name']
        for item in parsed
        if isinstance(item, dict) and 'name' in item
    ]

# Pull the plain name strings out of the JSON-like genre and keyword columns
movies_df['genres_processed'] = movies_df['genres'].map(extract_names)
movies_df['keywords_processed'] = movies_df['keywords'].map(extract_names)

# Build one text document per movie: genre names, keyword names, then overview
genre_text = movies_df['genres_processed'].map(' '.join)
keyword_text = movies_df['keywords_processed'].map(' '.join)
overview_text = movies_df['overview'].fillna('')  # Missing overviews become empty text
movies_df['combined_features'] = genre_text + ' ' + keyword_text + ' ' + overview_text

# Step 3: Turn each document into a TF-IDF vector, ignoring English stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_df['combined_features'])

# Step 4: Pairwise cosine similarity between every pair of movie vectors
cosine_sim = cosine_similarity(X=tfidf_matrix, Y=tfidf_matrix)

# Step 5: Define Recommendation Functions
def recommend_movies(title, cosine_sim=cosine_sim):
    """
    Recommend movies similar to the input title based on cosine similarity.

    Args:
        title: Movie title to look up. It is compared case-insensitively with
            ``movies_df['title']``; the first matching row is used.
        cosine_sim: Pairwise similarity matrix whose row/column positions
            correspond to the row positions of ``movies_df``.

    Returns:
        Up to 10 titles ordered from most to least similar, never including
        the matched row itself. When no title matches (or ``title`` is not a
        string) the single-element list
        ``["Movie not found. Please check the title."]`` is returned.
    """
    not_found = ["Movie not found. Please check the title."]
    if not isinstance(title, str):
        return not_found

    # Resolve the *positional* row of the first matching title, because both
    # cosine_sim and .iloc below are addressed by position, not index label.
    title_matches = movies_df['title'].str.lower() == title.lower()
    positions = title_matches.to_numpy().nonzero()[0]
    if len(positions) == 0:
        return not_found
    idx = int(positions[0])

    # Drop the query movie explicitly rather than assuming it sorts first:
    # a movie with an empty feature vector has self-similarity 0, and a
    # movie with identical features may tie with it.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]

    # list.sort is stable, so ties keep their original dataset order
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top_positions = [position for position, _ in candidates[:10]]

    # Return the titles of the top 10 similar movies
    return movies_df['title'].iloc[top_positions].tolist()

def recommend_based_on_ratings(user_ratings):
    """
    Recommend movies based on user ratings of existing movies.

    Every movie rated 4 or higher is used as a seed for ``recommend_movies``;
    the results are merged, de-duplicated, and stripped of any title the user
    has already rated.

    Args:
        user_ratings: Iterable of ``(title, rating)`` pairs.

    Returns:
        A list of recommended titles in first-seen order. Highly rated titles
        that are not present in ``movies_df`` are skipped, so the
        "Movie not found" message produced by ``recommend_movies`` never
        appears among the recommendations. The list is empty when no movie is
        rated 4 or higher.
    """
    # Materialise once so a one-shot iterator can be traversed twice
    ratings = list(user_ratings)

    # Filter for highly rated movies
    seeds = [movie for movie, rating in ratings if rating >= 4]
    if not seeds:
        return []

    # Titles are looked up case-insensitively, so compare them the same way
    # when removing movies the user has already rated.
    rated_keys = {str(movie).lower() for movie, _ in ratings}
    known_keys = set(movies_df['title'].str.lower())

    # A dict keeps insertion order, giving a deterministic result
    merged = {}
    for seed in seeds:
        if str(seed).lower() not in known_keys:
            # Unknown title: recommend_movies would return its error message,
            # which must not be treated as a movie.
            continue
        for candidate in recommend_movies(seed):
            if str(candidate).lower() not in rated_keys:
                merged.setdefault(candidate, None)

    return list(merged)

# Demo: runs only when the file is executed as a script
if __name__ == "__main__":
    # Content-based lookup for a single title
    print("Recommendations for 'Avatar':")
    avatar_recommendations = recommend_movies("Avatar")
    print(avatar_recommendations)

    # Recommendations driven by a small set of (title, rating) pairs
    user_ratings = [
        ("Inception", 5),
        ("Spectre", 1),
        ("The Dark Knight Rises", 1),
    ]
    print("\nRecommendations based on user ratings:")
    rating_recommendations = recommend_based_on_ratings(user_ratings)
    print(rating_recommendations)


# Recommendations for 'Avatar':
# ['Mission to Mars', 'Aliens', 'Moonraker', 'Alien³', 'Spaceballs', 'Lifeforce', 'Treasure Planet', 'Lockout', 'Alien', 'Planet of the Apes']
#
# Recommendations based on user ratings:
# ['Duplex', 'Kiss Kiss Bang Bang', 'Blood and Wine', 'Central Intelligence', 'Pitch Perfect 2', 'Cypher', 'The Count of Monte Cristo', 'Mission: Impossible - Rogue Nation', 'Crouching Tiger, Hidden Dragon', 'The Helix... Loaded']
