In [7]:
import requests

API_KEY = "84fc0155e583f3f7dbb0cafbea8ad4ed"
TMDB_API_URL = "https://api.themoviedb.org/3"

def fetch_movie_details(tmdb_id):
    url = f"{TMDB_API_URL}/movie/{tmdb_id}?api_key={API_KEY}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    return None

def fetch_movie_credits(tmdb_id):
    url = f"{TMDB_API_URL}/movie/{tmdb_id}/credits?api_key={API_KEY}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    return None

In [8]:
import pandas as pd

# Load the CSV files
links = pd.read_csv("ml-32m/links.csv")
movies = pd.read_csv("ml-32m/movies.csv")
ratings = pd.read_csv("ml-32m/ratings.csv")
tags = pd.read_csv("ml-32m/tags.csv")

# Preview the data
print(movies.head())
print(links.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  
   movieId  imdbId   tmdbId
0        1  114709    862.0
1        2  113497   8844.0
2        3  113228  15602.0
3        4  114885  31357.0
4        5  113041  11862.0


In [9]:
movie_rating_counts = ratings['movieId'].value_counts()
top_movie_ids = movie_rating_counts.head(int(len(movie_rating_counts) * 0.10)).index

# Filter ratings to only include the top movies
filtered_ratings = ratings[ratings['movieId'].isin(top_movie_ids)]

# Save the filtered ratings
filtered_ratings.to_csv('filtered_ratings_top_movies.csv', index=False)

print(f"Filtered ratings to include top {len(top_movie_ids)} impactful movies.")

Filtered ratings to include top 8443 impactful movies.


In [10]:
user_rating_counts = filtered_ratings['userId'].value_counts()

# Select the top users
top_user_ids = user_rating_counts.head(500).index  # Adjust the number as needed

# Further filter ratings to include only the top users
final_filtered_ratings = filtered_ratings[filtered_ratings['userId'].isin(top_user_ids)]

# Save the final filtered ratings
final_filtered_ratings.to_csv('final_filtered_ratings.csv', index=False)

print(f"Filtered ratings to include top {len(top_user_ids)} active users.")

Filtered ratings to include top 500 active users.


In [11]:


# Filter tags for relevant movies and users
filtered_tags = tags[
    tags['movieId'].isin(final_filtered_ratings['movieId']) & 
    tags['userId'].isin(final_filtered_ratings['userId'])
]

# Save the filtered tags
filtered_tags.to_csv('final_filtered_tags.csv', index=False)

print("Filtered tags for relevant movies and users.")

Filtered tags for relevant movies and users.


In [12]:
impactful_movies = movies[movies['movieId'].isin(top_movie_ids)]

# Save filtered movies
impactful_movies.to_csv('impactful_movies.csv', index=False)

print(f"Identified {len(impactful_movies)} impactful movies.")

Identified 8443 impactful movies.


In [14]:
#Merging data if and only if the data has been linked to tmdb

movie_data = impactful_movies.merge(links, on="movieId", how="inner")
print(len(movie_data))

8443


In [15]:
movie_data = movie_data[movie_data['tmdbId'].notnull()]
movie_data['tmdbId'] = movie_data['tmdbId'].astype(int)

In [16]:
#enriching the data

import time

enriched_data = []

for _, row in movie_data.iterrows():
    tmdb_id = row['tmdbId']
    details = fetch_movie_details(tmdb_id)
    credits = fetch_movie_credits(tmdb_id)
    
    if details and credits:
        director = [person['name'] for person in credits['crew'] if person['job'] == 'Director']
        actors = [person['name'] for person in credits['cast'][:5]]  # Top 5 actors
        enriched_data.append({
            "movieId": row['movieId'],
            "title": row['title'],
            "genres": row['genres'],
            "tmdbId": tmdb_id,
            "director": director,
            "actors": actors,
            "tmdb_genres": [genre['name'] for genre in details['genres']],
            "release_date": details['release_date'],
            "runtime": details['runtime']
        })

    # Respect TMDb API rate limits
    time.sleep(0.2)  # Adjust if necessary

# Save the enriched data
enriched_df = pd.DataFrame(enriched_data)
enriched_df.to_csv("enriched_impactful_movies.csv", index=False)
print(enriched_df.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  tmdbId           director  \
0  Adventure|Animation|Children|Comedy|Fantasy     862    [John Lasseter]   
1                   Adventure|Children|Fantasy    8844     [Joe Johnston]   
2                               Comedy|Romance   15602    [Howard Deutch]   
3                         Comedy|Drama|Romance   31357  [Forest Whitaker]   
4                                       Comedy   11862    [Charles Shyer]   

                                              actors  \
0  [Tom Hanks, Tim Allen, Don Rickles, Jim Varney...   
1  [Robin Williams, Kirsten Dunst, Bradley Pierce...   
2  [Walter Matthau, Jack Lemmon, Ann-Margret, Sop...   
3  [Whitney 

In [17]:
len(enriched_data)

8389

In [20]:
from arango import ArangoClient
import pandas as pd

# Step 1: Connect to ArangoDB
client = ArangoClient()
db = client.db('movieRecommendations', username='root', password='rootPassword')

# Step 2: Read the enriched data from CSV
csv_file = "enriched_impactful_movies.csv"  # Path to your CSV file
movies_df = pd.read_csv(csv_file)

# Step 3: Access or Create the Movies Collection
if not db.has_collection('movies'):
    movies_collection = db.create_collection('movies')
else:
    movies_collection = db.collection('movies')

# Step 4: Load Data into the Collection
for _, row in movies_df.iterrows():
    document = {
        "_key": str(row['movieId']),  # Use movieId as the document key
        "title": row['title'],
        "genres": row['genres'].split('|'),  # Split genres into a list
        "director": row['director'].strip("[]").replace("'", "").split(', '),  # Convert string to list
        "actors": row['actors'].strip("[]").replace("'", "").split(', '),  # Convert string to list
        "tmdb_genres": row['tmdb_genres'].strip("[]").replace("'", "").split(', '),  # Convert string to list
        "release_date": row['release_date'],
        "runtime": row['runtime']
    }
    
    # Insert document into the collection
    try:
        movies_collection.insert(document)
    except Exception as e:
        print(f"Error inserting document {row['movieId']}: {e}")

print("Data loading completed.")

Data loading completed.


In [15]:
enriched_movies = pd.read_csv('enriched_movies_updated.csv')
valid_movie_ids = set(enriched_movies['movieId'])  # Get valid movie IDs

# Load and filter ratings.csv
ratings = pd.read_csv('ml-32m/ratings.csv')
filtered_ratings = ratings[ratings['movieId'].isin(valid_movie_ids)]

# Save the filtered ratings to a new CSV
filtered_ratings.to_csv('filtered_ratings.csv', index=False)

# Load and filter tags.csv
tags = pd.read_csv('ml-32m/tags.csv')
filtered_tags = tags[tags['movieId'].isin(valid_movie_ids)]

# Save the filtered tags to a new CSV
filtered_tags.to_csv('filtered_tags.csv', index=False)

print("Preprocessing completed. Filtered ratings and tags saved.")

Preprocessing completed. Filtered ratings and tags saved.


In [21]:
if not db.has_collection('ratings'):
    ratings_collection = db.create_collection('ratings')
else:
    ratings_collection = db.collection('ratings')

# Load filtered ratings data into the collection
filtered_ratings = pd.read_csv('final_filtered_ratings.csv')

for _, row in filtered_ratings.iterrows():
    document = {
        "_key": f"{row['userId']}_{row['movieId']}",  # Create a composite key
        "userId": row['userId'],
        "movieId": row['movieId'],
        "rating": row['rating'],
        "timestamp": row['timestamp']
    }
    
    try:
        ratings_collection.insert(document)
    except Exception as e:
        print(f"Error inserting rating document {row['userId']}_{row['movieId']}: {e}")

print("Ratings data loading completed.")

Ratings data loading completed.


In [7]:
from arango import ArangoClient
import pandas as pd

# Step 1: Connect to ArangoDB
client = ArangoClient()
db = client.db('movieRecommendations', username='root', password='rootPassword')

In [8]:
ratings = pd.read_csv('filtered_ratings_top_movies.csv')
edges = [
    {
        "_from": f"Users/{row['userId']}",
        "_to": f"Movies/{row['movieId']}",
        "rating": row['rating'],
        "timestamp": row['timestamp']
    }
    for _, row in ratings.iterrows()
]

# Insert edges into the 'Rated' collection


In [10]:
from arango import ArangoClient
import pandas as pd

# Step 1: Connect to ArangoDB
client = ArangoClient()
db = client.db('movieRecommendations', username='root', password='rootPassword')
rated_collection = db.collection('rated')


In [11]:
from math import ceil
chunk_size = 1000  # Adjust based on your system's capacity
num_chunks = ceil(len(edges) / chunk_size)

for i in range(num_chunks):
    start = i * chunk_size
    end = start + chunk_size
    chunk = edges[start:end]  # Get a subset of edges

    try:
        rated_collection.import_bulk(chunk)  # Import the chunk
        print(f"Chunk {i + 1}/{num_chunks} imported successfully")
    except Exception as e:
        print(f"Error importing chunk {i + 1}: {e}")

Chunk 1/30626 imported successfully
Chunk 2/30626 imported successfully
Chunk 3/30626 imported successfully
Chunk 4/30626 imported successfully
Chunk 5/30626 imported successfully
Chunk 6/30626 imported successfully
Chunk 7/30626 imported successfully
Chunk 8/30626 imported successfully
Chunk 9/30626 imported successfully
Chunk 10/30626 imported successfully
Chunk 11/30626 imported successfully
Chunk 12/30626 imported successfully
Chunk 13/30626 imported successfully
Chunk 14/30626 imported successfully
Chunk 15/30626 imported successfully
Chunk 16/30626 imported successfully
Chunk 17/30626 imported successfully
Chunk 18/30626 imported successfully
Chunk 19/30626 imported successfully
Chunk 20/30626 imported successfully
Chunk 21/30626 imported successfully
Chunk 22/30626 imported successfully
Chunk 23/30626 imported successfully
Chunk 24/30626 imported successfully
Chunk 25/30626 imported successfully
Chunk 26/30626 imported successfully
Chunk 27/30626 imported successfully
Chunk 28/3

In [23]:
import pandas as pd
from collections import defaultdict
import ast  # For safely evaluating stringified lists

# Step 1: Load Movie and Ratings Data
movies_df = pd.read_csv('enriched_impactful_movies.csv')  # Assuming movie data is in a CSV
ratings_df = pd.read_csv('filtered_ratings_top_movies.csv')  # Assuming ratings data is in a CSV

# Step 2: Preprocess movie data to split genres and convert stringified lists
def preprocess_movie_data(row):
    # Split genres by '|'
    row['genres'] = row['genres'].split('|') if isinstance(row['genres'], str) else []
    
    # Convert stringified lists for director and actors to actual lists
    row['director'] = ast.literal_eval(row['director']) if isinstance(row['director'], str) else []
    row['actors'] = ast.literal_eval(row['actors']) if isinstance(row['actors'], str) else []
    
    # Convert stringified tmdb_genres to actual lists
    row['tmdb_genres'] = ast.literal_eval(row['tmdb_genres']) if isinstance(row['tmdb_genres'], str) else []
    
    return row

# Apply preprocessing to the DataFrame
movies_df = movies_df.apply(preprocess_movie_data, axis=1)

# Create a mapping for quick access
movies_mapping = movies_df.set_index("movieId").to_dict(orient="index")

# Step 3: Initialize user profiles
MIN_RATING = 3
user_profiles = defaultdict(lambda: {"directors": defaultdict(int), "actors": defaultdict(int), "genres": defaultdict(int)})

# Step 4: Process the ratings data to aggregate directors, actors, and genres
for _, row in ratings_df.iterrows():
    user_id = row['userId']
    movie_id = row['movieId']
    rating = row['rating']

    # Apply the rating filter
    if rating >= MIN_RATING:
        # Fetch the corresponding movie details from the movies_mapping
        movie = movies_mapping.get(movie_id, None)
        if not movie:
            continue  # Skip if movie details are not found
        
        # Aggregate directors
        for director in movie.get("director", []):  # Use 'director' as per the movie data
            user_profiles[user_id]["directors"][director] += 1
        
        # Aggregate actors
        for actor in movie.get("actors", []):  # Use 'actors' as per the movie data
            user_profiles[user_id]["actors"][actor] += 1
        
        # Aggregate Genres
        for genre in movie.get("genres", []):  # Use processed genres
            user_profiles[user_id]["genres"][genre] += 1

In [24]:
print(user_profiles[ratings_df.iloc[0]["userId"]])

{'directors': defaultdict(<class 'int'>, {'Ang Lee': 2, 'Zhang Yimou': 1, 'Terry Gilliam': 1, 'Jafar Panahi': 1, 'Mel Gibson': 1, 'Martin Scorsese': 3, 'Gregg Araki': 1, 'Tom DiCillo': 1, 'Kevin Smith': 1, 'George Lucas': 1, 'Patrice Chéreau': 1, 'Krzysztof Kieślowski': 2, 'George Huang': 1, 'Steven Spielberg': 3, 'Ridley Scott': 1, 'Todd Solondz': 1, 'Jonathan Demme': 2, 'Claude Sautet': 1, 'John Sayles': 1, 'Annette Haywood-Carter': 1, 'Douglas McGrath': 1, 'Alfred Hitchcock': 2, 'Billy Wilder': 2, 'William Wyler': 2, 'Orson Welles': 1, 'Joseph L. Mankiewicz': 1, 'George Cukor': 1, 'Richard Brooks': 1, 'Mike Leigh': 1, 'Michael Winterbottom': 1, 'Oliver Stone': 1, 'Neil Jordan': 1, 'Daniel Vigne': 1, 'Giuseppe Tornatore': 1, 'Stanley Kubrick': 1, 'Anthony Minghella': 1, 'Irvin Kershner': 1, 'James Cameron': 2, 'Sidney Lumet': 1, 'Francis Ford Coppola': 2, 'Akira Kurosawa': 2, 'Miloš Forman': 1, 'Wolfgang Petersen': 1, 'George Roy Hill': 2, 'Hal Hartley': 1, 'Mike Nichols': 2, 'Rob Re

In [25]:
# Check if 'users' collection exists, create if not
if not db.has_collection('users'):
    db.create_collection('users')

users_collection = db.collection('users')
bulk_insert_data = []

for user_id, data in user_profiles.items():
    bulk_insert_data.append({
        "_key": str(user_id),
        "favorite_genres": sorted(data["genres"], key=data["genres"].get, reverse=True)[:5],
        "frequently_rated_directors": sorted(data["directors"], key=data["directors"].get, reverse=True)[:5],
        "frequently_rated_actors": sorted(data["actors"], key=data["actors"].get, reverse=True)[:5]
    })

chunk_size = 1000
print("Inserting user profiles into the database...")

for i in range(0, len(bulk_insert_data), chunk_size):
    try:
        # Use import_bulk or insert_many based on your ArangoDB driver support
        users_collection.import_bulk(bulk_insert_data[i:i+chunk_size])
        print(f"Inserted {i + len(bulk_insert_data[i:i+chunk_size])} profiles...")
    except Exception as e:
        print(f"Error inserting data at chunk starting index {i}: {str(e)}")

print(f"User profiles created and stored in the 'users' collection.")


Inserting user profiles into the database...
Inserted 1000 profiles...
Inserted 2000 profiles...
Inserted 3000 profiles...
Inserted 4000 profiles...
Inserted 5000 profiles...
Inserted 6000 profiles...
Inserted 7000 profiles...
Inserted 8000 profiles...
Inserted 9000 profiles...
Inserted 10000 profiles...
Inserted 11000 profiles...
Inserted 12000 profiles...
Inserted 13000 profiles...
Inserted 14000 profiles...
Inserted 15000 profiles...
Inserted 16000 profiles...
Inserted 17000 profiles...
Inserted 18000 profiles...
Inserted 19000 profiles...
Inserted 20000 profiles...
Inserted 21000 profiles...
Inserted 22000 profiles...
Inserted 23000 profiles...
Inserted 24000 profiles...
Inserted 25000 profiles...
Inserted 26000 profiles...
Inserted 27000 profiles...
Inserted 28000 profiles...
Inserted 29000 profiles...
Inserted 30000 profiles...
Inserted 31000 profiles...
Inserted 32000 profiles...
Inserted 33000 profiles...
Inserted 34000 profiles...
Inserted 35000 profiles...
Inserted 36000 prof

In [26]:
import pandas as pd
from collections import defaultdict

# Step 1: Connect to ArangoDB
from arango import ArangoClient

client = ArangoClient()
db = client.db('movieRecommendations', username='root', password='rootPassword')

users_collection = db.collection('users')
movies_collection = db.collection('movies')

# Step 2: Fetch User Profile
def get_user_profile(user_id):
    formatted_user_id = f"{float(user_id):.1f}"
    
    
    user_profile = users_collection.get(formatted_user_id)
    if not user_profile:
        print(f"User {user_id} not found in the database.")
        return None
    return user_profile

# Step 3: Find Matching Movies
def recommend_movies(user_id, top_n=10):
    user_profile = get_user_profile(user_id)
    if not user_profile:
        return []

    favorite_genres = set(user_profile.get('favorite_genres', []))
    favorite_directors = set(user_profile.get('frequently_rated_directors', []))
    favorite_actors = set(user_profile.get('frequently_rated_actors', []))

    # Query all movies
    movies_cursor = movies_collection.all()
    recommendations = []

    for movie in movies_cursor:
        score = 0

        # Genre matching
        movie_genres = set(movie.get('genres', []))
        genre_matches = len(favorite_genres & movie_genres)
        score += genre_matches * 2  # Weight for genre matches

        # Director matching
        movie_directors = set(movie.get('directors', []))
        director_matches = len(favorite_directors & movie_directors)
        score += director_matches * 3  # Weight for director matches

        # Actor matching
        movie_actors = set(movie.get('actors', []))
        actor_matches = len(favorite_actors & movie_actors)
        score += actor_matches * 1  # Weight for actor matches

        # Add to recommendations if score > 0
        if score > 0:
            recommendations.append({
                "movieId": movie["_key"],
                "title": movie["title"],
                "genres": movie.get("genres", []),
                "directors": movie.get("directors", []),
                "actors": movie.get("actors", []),
                "score": score
            })

    # Sort by score in descending order
    recommendations = sorted(recommendations, key=lambda x: x['score'], reverse=True)

    return recommendations[:top_n]

# Step 4: Test the Script
if __name__ == "__main__":
    test_user_id = "1000"  
    top_recommendations = recommend_movies(test_user_id, top_n=10)

    if top_recommendations:
        print(f"Top {len(top_recommendations)} recommendations for User {test_user_id}:")
        for idx, movie in enumerate(top_recommendations, start=1):
            print(f"{idx}. {movie['title']} (Score: {movie['score']})")
            print(f"   Genres: {', '.join(movie['genres'])}")
            print(f"   Directors: {', '.join(movie['directors'])}")
            print(f"   Actors: {', '.join(movie['actors'])}")
    else:
        print(f"No recommendations available for User {test_user_id}.")

Top 10 recommendations for User 1000:
1. Stunt Man, The (1980) (Score: 10)
   Genres: Action, Adventure, Comedy, Drama, Romance, Thriller
   Directors: 
   Actors: "Peter OToole", Steve Railsback, Barbara Hershey, Allen Garfield, Alex Rocco
2. National Treasure (2004) (Score: 10)
   Genres: Action, Adventure, Drama, Mystery, Thriller
   Directors: 
   Actors: Nicolas Cage, Diane Kruger, Justin Bartha, Sean Bean, Jon Voight
3. Hunting Party, The (2007) (Score: 10)
   Genres: Action, Adventure, Comedy, Drama, Thriller
   Directors: 
   Actors: Richard Gere, Terrence Howard, Jesse Eisenberg, James Brolin, Diane Kruger
4. Rubber (2010) (Score: 10)
   Genres: Action, Adventure, Comedy, Crime, Drama, Film-Noir, Horror, Mystery, Thriller, Western
   Directors: 
   Actors: Thomas F. Duffy, David Bowe, Stephen Spinella, Roxane Mesquida, Jack Plotnick
5. Getaway, The (1994) (Score: 9)
   Genres: Action, Adventure, Crime, Drama, Romance, Thriller
   Directors: 
   Actors: Alec Baldwin, Kim Basing