In [1]:
%%capture
# pip install fa2_modified

In [2]:
%%capture
# pip install powerlaw

In [3]:
# from fa2_modified import ForceAtlas2
from itertools import combinations
import json
import math
import matplotlib as mpl
from matplotlib import colormaps
import matplotlib.patheffects as pe
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import os
import pandas as pd
# import powerlaw
import random
import re
from statistics import mean, median, mode
from tqdm import tqdm
import urllib.request

# The Data

The data used to build this network consists of three _.csv_ files:

- _TMDB\_scraped\_features.csv_ (7.1 Mb)
- _TMDB\_scraped\_actors.csv_ (10.1 Mb)
- _WIKI\_scraped\_pages.csv_ (119 Mb)

### Downloading the data of the network

In [4]:
# Locate the data files relative to the current working directory
DATA_DIR = os.path.abspath("")

TMDB_SCRAPED_FEATURES_PATH = os.path.join(DATA_DIR, "TMDB_scraped_features.csv")
TMDB_SCRAPED_ACTORS_PATH = os.path.join(DATA_DIR, "TMDB_scraped_actors.csv")
WIKI_SCRAPED_PAGES_PATH = os.path.join(DATA_DIR, "WIKI_scraped_pages.csv")

# Print the resolved paths, one per line
for csv_path in (
    TMDB_SCRAPED_FEATURES_PATH,
    TMDB_SCRAPED_ACTORS_PATH,
    WIKI_SCRAPED_PAGES_PATH,
):
    print(csv_path)

C:\Users\Aleksandar\School\02805_Final_project\code\TMDB_scraped_features.csv
C:\Users\Aleksandar\School\02805_Final_project\code\TMDB_scraped_actors.csv
C:\Users\Aleksandar\School\02805_Final_project\code\WIKI_scraped_pages.csv


In [5]:
# Load each scraped csv-file into its own DataFrame
# (features first, then actors, then the Wikipedia pages)
features_df, actors_df, wiki_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        TMDB_SCRAPED_FEATURES_PATH,
        TMDB_SCRAPED_ACTORS_PATH,
        WIKI_SCRAPED_PAGES_PATH,
    )
)

Here are some examples to show the content of the DataFrames:

### Features DataFrame

In [6]:
# Preview a single row to show which columns each feature carries
features_df.head(1)

Unnamed: 0,feature_id,title,original_language,overview,cast,feature_popularity,vote_count,vote_average,release_date,genre_ids,poster_path,backdrop_path,adult,page,wikidata_id
0,278,The Shawshank Redemption,en,Imprisoned in the 1940s for the double murder ...,"[192, 504, 2141, 2555, 4029, 5063, 6573, 6574,...",180.106,27250,8.7,1994-09-23,"[18, 80]",/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg,/zfbjgQE1uSd9wiPTX4VzsLi0rGG.jpg,False,1,Q172241


Since the features will be used to create edges/links in a network later on, it is crucial that every feature has a cast starring at least two actors.

Thus, the dataframe is cleaned, just in case.

In [7]:
# Keep a handle on the uncleaned DataFrame so that its shape can be
# compared with the cleaned one after filtering
features_df_old = features_df

In [8]:
def _cast_size(raw_cast):
    """Return the number of actor ids stored in one ``cast`` cell.

    ``pd.read_csv`` loads list-like columns as plain text (e.g. "[192, 504]"),
    so calling ``len`` on the raw cell counts characters instead of actors and
    lets "[]" or "[5]" slip through a ``>= 2`` filter. Text cells are therefore
    decoded first; cells that already hold a sequence are measured directly.
    """
    if isinstance(raw_cast, str):
        raw_cast = json.loads(raw_cast)
    return len(raw_cast)


# Drop features without any cast information ...
features_df = features_df.dropna(subset=["cast"])

# ... and keep only those starring at least two actors.
# The "cast" column itself is left in its stored format.
features_df = features_df[features_df["cast"].apply(_cast_size) >= 2]

print(f"Before cleaning: {features_df_old.shape}")
print(f"After cleaning : {features_df.shape}")

Before cleaning: (9750, 15)
After cleaning : (9735, 15)


### Actors example

In [9]:
# Report the dimensions of the actors table
n_actor_rows, n_actor_cols = actors_df.shape
print(f"The actors_df contains:\n{n_actor_rows} rows\n{n_actor_cols} columns")

The actors_df contains:
188378 rows
6 columns


In [10]:
# Preview a single row to show which columns each actor carries
actors_df.head(1)

Unnamed: 0,actor_id,original_name,actor_popularity,gender,adult,profile_image_path
0,1,George Lucas,16.91,2,False,/WCSZzWdtPmdRxH9LUCVi2JPCSJ.jpg


### Wikipedia example

In [11]:
# Report the dimensions of the Wikipedia table
n_wiki_rows, n_wiki_cols = wiki_df.shape
print(f"The wiki_df contains:\n{n_wiki_rows} rows\n{n_wiki_cols} columns")

The wiki_df contains:
9438 rows
6 columns


In [12]:
# Preview a single row to show which columns each Wikipedia page carries
wiki_df.head(1)

Unnamed: 0,feature_id,title,url,page_content,content_size,wikidata_id
0,2,Ariel,https://en.wikipedia.org/wiki/Ariel_(film),Ariel is a 1988 Finnish drama film directed an...,3346,Q658627


# The Actor Network

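The Actor network is generated from the actors and features DataFrames: every actor becomes a node, and two actors are connected by one edge for each feature in which they both appear in the cast.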

In [13]:
# Initialize a NetworkX MultiGraph to store the Actor-network.
# A MultiGraph permits parallel edges, so the same pair of actors
# can be linked once for every feature they share.
G = nx.MultiGraph()

## Adding the Actors as Nodes

In [14]:
# Walk through the actors dataframe; every row becomes one node keyed
# by its actor_id, with the remaining columns stored as node attributes
for _, actor in tqdm(
    actors_df.iterrows(),
    total=len(actors_df),
    desc="Generating nodes in network",
):
    G.add_node(actor["actor_id"], **actor.drop("actor_id").to_dict())

Generating nodes in network: 100%|████████████| 188378/188378 [00:49<00:00, 3776.47it/s]


In [15]:
# Number of actors currently in the network
print("Nodes:", G.number_of_nodes())

Nodes: 188378


## Adding the Features as Edges

In [None]:
# Traverse the features dataframe and add, for each feature,
# one edge between every pair of actors in its cast list.
progress_bar = tqdm(features_df.iterrows(), total=len(features_df), desc="Generating edges in network")

for idx, row in progress_bar:

    # Load movie's cast list.
    # pd.read_csv stores the list as text (e.g. "[192, 504]"); iterating over
    # that text would pair up single characters instead of actor ids, so it
    # is decoded first. Casts that are already sequences are used as-is.
    actor_ids = row["cast"]
    if isinstance(actor_ids, str):
        actor_ids = json.loads(actor_ids)

    # Get movie data to store in edge
    feature_attributes = {
        "feature_id": row["feature_id"],
        "title": row["title"],
        "vote_average": row["vote_average"],
        "feature_popularity": row["feature_popularity"],
        "vote_count": row["vote_count"],
        "release_date": row["release_date"],
        "genre_ids": row["genre_ids"],
    }

    # Create all pairs of actor_ids (combinations of 2);
    # a cast with fewer than two actors simply yields no pairs.
    for actor1, actor2 in combinations(actor_ids, 2):
        G.add_edge(actor1, actor2, **feature_attributes)

In [None]:
# Number of co-starring links currently in the network
print("Edges:", G.number_of_edges())

## Storing the network graph as .gexf-file
For ease of use, the graph is saved as a .gexf-file. This ensures consistency and efficiency when running the notebook.

In [None]:
# Destination of the serialized graph, resolved with os.path.abspath("")
# in the same way as the csv paths
NETWORK_GRAPH_PATH = os.path.join(os.path.abspath(""), "actor_multigraph.gexf")

# Write the actor network (nodes, edges and their attributes) to disk in GEXF format
nx.write_gexf(G, NETWORK_GRAPH_PATH)

# Visualizing the network using networkX

In [None]:
# Create ForceAtlas2 configuration.
# The top-level import of ForceAtlas2 is commented out, so referencing the
# name directly raises a NameError on a fresh kernel. The optional dependency
# is imported here behind a guard so that the notebook still runs end-to-end
# when fa2_modified is not installed.
try:
    from fa2_modified import ForceAtlas2
except ImportError:
    ForceAtlas2 = None

if ForceAtlas2 is None:
    # No layout engine available: leave fa2 unset instead of crashing
    fa2 = None
    print("fa2_modified is not installed - skipping the ForceAtlas2 configuration.")
else:
    fa2 = ForceAtlas2(
        # Behavior alternatives
        outboundAttractionDistribution=True,  # Dissuade hubs
        linLogMode=False,  # NOT IMPLEMENTED
        adjustSizes=False,  # Prevent overlap (NOT IMPLEMENTED)
        edgeWeightInfluence=4.0,

        # Performance
        jitterTolerance=8.0,  # Tolerance
        barnesHutOptimize=True,
        barnesHutTheta=1.2,
        multiThreaded=False,  # NOT IMPLEMENTED

        # Tuning
        scalingRatio=2.0,
        strongGravityMode=False,
        gravity=0.1,

        # Log
        verbose=True
        )

In [None]:
# Running the algorithm to get node positions
# positions = fa2.forceatlas2_networkx_layout(G, pos=None, iterations=200)

In [None]:
# # Visualize the graph
# fig, ax = plt.subplots(figsize=(16, 9))

# pos = nx.spring_layout(G, seed = SEED)

In [None]:
# # Draw the nodes without path effects
# nodes = nx.draw_networkx_nodes(
#     G,
#     pos,
#     node_shape='*',
#     node_size=75,
#     node_color='gold',
#     edgecolors='black',  # Add a black edge color for the outline
#     linewidths=0.5,      # Control the outline thickness
#     ax=ax,
# )

In [None]:
# # Add the stroke effect to the entire node collection
# nodes.set_path_effects([
#     pe.Stroke(linewidth=0.3, foreground="black"),  # Add black outline
#     pe.Normal(),                                  # Normal rendering
# ])

In [None]:
# # Draw the edges
# nx.draw_networkx_edges(
#     G,
#     pos,
#     edge_color='white',
#     width=0.5,
#     alpha=0.7
# )

# ax.set_facecolor('black')
# ax.axis('off')
# fig.set_facecolor('black')

# # save_plot_as(file_name="Actors_network_clean", format_name="pdf")
# # save_plot_as(file_name="Actors_network_clean", format_name="png")

# plt.show()