In [None]:
import pickle
import time
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from itertools import combinations

import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd
from imdb import IMDb

# Read the links and movies tables, then join them on their shared 'movieId' column
links_df = pd.read_csv('ml-latest-small/links.csv')
movies_df = pd.read_csv('ml-latest-small/movies.csv')
movies = links_df.merge(movies_df, on='movieId')

# Tunable settings
num_movies = len(movies)  # How many movies to process
num_threads = 5  # Worker threads used for concurrent requests

# Keep only the leading `num_movies` rows
sample_movies = movies.iloc[:num_movies]

print(f"{num_movies} movies in dataset")

100 movies in dataset


Load movie data

In [None]:
# Create the IMDb client once at module level; fetch_movie_data reads it as a
# global, so the same instance is used by every worker thread.
ia = IMDb()

# Define a function to fetch data for each movie
def fetch_movie_data(row, max_retries=3, retry_delay=1.0):
    """Fetch title, genres, cast, rating and year for one movie from IMDb.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'movieId', 'imdbId' and 'title'.
        max_retries: How many times to retry after the first failed attempt.
            The total number of attempts is ``max_retries + 1``.
        retry_delay: Base pause in seconds between attempts. The pause grows
            linearly with the attempt number; pass 0 to retry immediately.

    Returns:
        A dict with the keys 'movie_id', 'title', 'genre', 'actors' (a list of
        ``(person_id, name)`` tuples), 'rating' and 'year', or ``None`` when
        every attempt failed.
    """
    movie_id = row['movieId']
    imdb_id = str(row['imdbId']).zfill(7)  # IMDb IDs are zero-padded to 7 digits

    # A bounded loop replaces the former unbounded recursive retry, which could
    # only end in a RecursionError when a movie failed persistently.
    for attempt in range(max_retries + 1):
        try:
            # Retrieve movie data from IMDb
            movie = ia.get_movie(imdb_id)

            # Get all the actors in the movie
            cast = movie.get('cast', [])
            actors = [(actor.personID, actor['name']) for actor in cast]

            # Return movie data and associated actors
            return {
                'movie_id': movie_id,
                'title': movie.get('title'),
                'genre': movie.get('genres', []),
                'actors': actors,
                'rating': movie.get('rating'),
                'year': movie.get('year')
            }

        except Exception as exc:
            if attempt >= max_retries:
                print(
                    f"Giving up on movie: {row['title']} "
                    f"after {attempt + 1} attempts ({exc!r})"
                )
                return None

            print(
                f"Error loading movie: {row['title']} ({exc!r}). "
                f"Retrying ({attempt + 1}/{max_retries})..."
            )
            if retry_delay > 0:
                # Back off a little longer after each consecutive failure
                time.sleep(retry_delay * (attempt + 1))

    # Only reached when max_retries is negative, i.e. no attempt was made
    return None

# Fan the per-movie IMDb lookups out over a thread pool (no pause between requests)
movie_data_list = []
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Queue one lookup per row of the sample
    futures = [
        executor.submit(fetch_movie_data, movie_row)
        for _, movie_row in sample_movies.iterrows()
    ]

    # Gather results in completion order, keeping only truthy (successful) ones
    completed_results = (done.result() for done in as_completed(futures))
    movie_data_list.extend(data for data in completed_results if data)


Create graph

In [3]:
# Initialize a NetworkX graph whose nodes are movies
G = nx.Graph()

# Maps each actor, stored as a (person_id, name) tuple, to the ids of the
# movies that actor appears in
actor_to_movies = {}

# Add one node per movie and record which movies each actor appears in
for movie_data in movie_data_list:
    movie_id = movie_data['movie_id']
    actors = movie_data['actors']

    # Add movie node with attributes
    G.add_node(
        movie_id,
        type='movie',
        title=movie_data['title'],
        genre=movie_data['genre'],
        actors=actors,
        rating=movie_data['rating'],
        year=movie_data['year'],
    )

    # dict.fromkeys de-duplicates the cast while keeping its order: an actor
    # listed twice for one movie must not register that movie twice, which
    # would otherwise produce a self-loop edge below.
    for actor in dict.fromkeys(actors):
        actor_to_movies.setdefault(actor, []).append(movie_id)

# Create edges between movies that share at least one actor.
# The loop variable is named `movie_ids` (not `movies`) so that it does not
# overwrite the `movies` DataFrame defined in the loading cell.
for actor, movie_ids in actor_to_movies.items():
    G.add_edges_from(
        (movie1, movie2)
        for movie1, movie2 in combinations(movie_ids, 2)
        if movie1 != movie2  # never link a movie to itself
    )

# Check graph summary
print("Number of movie nodes:", G.number_of_nodes())
print("Total edges in the graph:", G.number_of_edges())

Number of movie nodes: 9703
Total edges in the graph: 1183130


In [4]:
# Drop every node that has no edges (degree 0)
isolated_nodes = list(nx.isolates(G))
G.remove_nodes_from(isolated_nodes)

Save graph to a pickle file

In [8]:
# Persist the graph to disk using the highest pickle protocol available
with open('Movie_network.gpickle', mode='wb') as graph_file:
    pickle.dump(G, graph_file, protocol=pickle.HIGHEST_PROTOCOL)