# Load and Explore Data

In [1]:
import pandas as pd

# Step 1.1: Read each CSV file into its own DataFrame
movies = pd.read_csv("movies.csv")
tags = pd.read_csv("tags.csv")
ratings = pd.read_csv("ratings.csv")
links = pd.read_csv("links.csv")
genome_tags = pd.read_csv("genome-tags.csv")
genome_scores = pd.read_csv("genome-scores.csv")

# Step 1.2: Print the first rows of every frame under its heading.
# Every heading except the first starts with a newline so the previews are
# separated by a blank line.
previews = (
    ("Movies dataset:", movies),
    ("\nTags dataset:", tags),
    ("\nRatings dataset:", ratings),
    ("\nLinks dataset:", links),
    ("\nGenome Tags dataset:", genome_tags),
    ("\nGenome Scores dataset:", genome_scores),
)
for heading, frame in previews:
    print(heading)
    print(frame.head())


Movies dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Tags dataset:
   userId  movieId            tag   timestamp
0      10      260   good vs evil  1430666558
1      10      260  Harrison Ford  1430666505
2      10      260         sci-fi  1430666538
3      14     1221      Al Pacino  1311600756
4      14     1221          mafia  1311600746

Ratings dataset:
   userId  movieId  rating   timestamp
0       1        1     4.0  1225734739
1 

# Merge Relevant Data

In [14]:
def _join_tags(tag_values):
    """Concatenate the non-null entries of a tag Series into one space-separated string."""
    return " ".join(tag_values.dropna().astype(str))


# Step 2.1: Attach title/genres to every user tag.
# The inner join keeps only tag rows whose movieId also appears in `movies`.
movies_tags = pd.merge(tags, movies, on="movieId", how="inner")

# Step 2.2: Collapse to one row per movie: all of its tags as a single string,
# plus the first title and genres seen for that movieId.
movie_tag_summary = (
    movies_tags
    .groupby("movieId")
    .agg({"tag": _join_tags, "title": "first", "genres": "first"})
    .reset_index()
)

# Step 2.3: Resolve genome tagIds to their tag text
genome_data = pd.merge(genome_scores, genome_tags, on="tagId", how="inner")

# Only genome tags whose relevance exceeds 0.5 are kept, then joined per movie
is_relevant = genome_data["relevance"] > 0.5
relevant_genome_tags = genome_data[is_relevant]
movie_genome_tags = (
    relevant_genome_tags
    .groupby("movieId")
    .agg({"tag": _join_tags})
    .reset_index()
)

# Step 2.4: Bring user tags, genome tags and genres together.
# Both sides carry a "tag" column, so after the merge they appear as
# "tag_x" (user tags) and "tag_y" (genome tags). The left join leaves tag_y
# missing for movies without relevant genome tags, hence the fillna("").
merged = movie_tag_summary.merge(movie_genome_tags, on="movieId", how="left")
user_tag_text = merged["tag_x"].fillna("")
genome_tag_text = merged["tag_y"].fillna("")
genre_text = merged["genres"].fillna("")
merged["all_tags"] = user_tag_text + " " + genome_tag_text + " " + genre_text

# Final dataset for recommendations: identifier, title and the combined text
final_data = merged[["movieId", "title", "all_tags"]].drop_duplicates()

# Show the result and persist it as CSV (without the index column)
print(f"Processed dataset shape: {final_data.shape}")
print(final_data.head())
final_data.to_csv('final_data.csv', index=False)
print("final_data.csv has been saved!")

Processed dataset shape: (53452, 3)
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                            all_tags  
0  animation friendship toys animation Disney Pix...  
1  animals based on a book fantasy magic board ga...  
2  sequel moldy old old age old men wedding old p...  
3  characters chick flick girl movie characters c...  
4  family pregnancy wedding 4th wall aging baby d...  
final_data.csv has been saved!


# Generate TF-IDF Embeddings

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 3.1: Create TF-IDF embeddings
# Missing tag text is replaced by an empty string before vectorizing.
tag_corpus = final_data['all_tags'].fillna('')
tfidf = TfidfVectorizer(stop_words='english')  # English stop words are excluded
tfidf_matrix = tfidf.fit_transform(tag_corpus)  # One row per row of final_data

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

# Step 3.2: Look at the first few vocabulary entries as a sanity check
feature_names = tfidf.get_feature_names_out()
print(f"Sample feature names: {feature_names[:10]}")


TF-IDF matrix shape: (53452, 51797)
Sample feature names: ['000' '007' '009' '01' '02' '03' '04' '05' '06' '07']


# Build the Recommendation Engine

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# Step 4.1: Compute Cosine Similarity
# Precompute cosine similarity between all movies (every row of tfidf_matrix
# against every other row).
# NOTE(review): the result has one entry per pair of movies. With the 53452
# rows reported for tfidf_matrix that is about 2.9e9 entries -- roughly 23 GB
# if stored as a dense float64 array (the scikit-learn default output is dense;
# dtype assumed, confirm). recommend_movies below does not read cosine_sim: it
# scores the query against tfidf_matrix directly. Confirm whether any other
# cell needs this matrix before paying for it.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Step 4.2: Create a Recommendation Function
def recommend_movies(query, top_n=5):
    """
    Recommend up to ``top_n`` movies whose tag text best matches a query.

    The query is projected into the fitted TF-IDF space (module-level ``tfidf``)
    and scored against every row of the module-level ``tfidf_matrix`` with
    cosine similarity. Row ``i`` of ``tfidf_matrix`` is looked up positionally
    as row ``i`` of ``final_data``, so the two must stay aligned.

    Parameters:
    - query: str, user's input query (e.g., movie title or description keywords)
    - top_n: int, maximum number of recommendations to return; values <= 0
      yield an empty result

    Returns:
    - recommendations: DataFrame with columns ``movieId``, ``title`` and
      ``similarity``, ordered from most to least similar. Movies with zero
      similarity are omitted, so fewer than ``top_n`` rows (possibly none) are
      returned when the query shares no vocabulary with the corpus.
    """
    # Transform the query into the TF-IDF space
    query_vector = tfidf.transform([query])

    # Compute similarity between the query and all movies
    query_similarity = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Rank best-first, then truncate. Clamping matters: slicing with
    # `[-top_n:]` and top_n == 0 would select every movie rather than none,
    # and a negative top_n would select almost all of them.
    limit = max(top_n, 0)
    ranked = query_similarity.argsort()[::-1][:limit]

    # A similarity of 0 means the query and the movie share no TF-IDF terms;
    # such rows are arbitrary filler rather than recommendations, so drop them.
    top_indices = ranked[query_similarity[ranked] > 0]

    # Build the result with assign() instead of assigning a column onto an
    # indexing result, which can trigger pandas' SettingWithCopyWarning.
    recommendations = final_data.iloc[top_indices][['movieId', 'title']].assign(
        similarity=query_similarity[top_indices]
    )
    return recommendations

# Step 4.3: Smoke-test the recommender with a free-text query
user_query = "Action movie with thrilling plot"
recommendations = recommend_movies(user_query, top_n=5)

# Heading and result table are printed on consecutive lines
print("Recommendations for your query:", recommendations, sep="\n")


Recommendations for your query:
       movieId                             title  similarity
37208   166784           Parole Violators (1994)    0.427110
16423    90600  Headhunters (Hodejegerne) (2011)    0.374388
19906   109578                   Non-Stop (2014)    0.355212
26396   132818       The Boss of Big Town (1942)    0.346965
51264   253316                      Dutch (2021)    0.315770
