## P02: Degrees of Separation (BFS)

In [21]:
from classes.Queue import Queue
from classes.Color import Color
import pandas as pd
import numpy as np

CINEMAR_DIR = "datasets/"

In [22]:
class CinemaDataset():
    people: np.ndarray
    movie: np.ndarray
    stars: np.ndarray

    def __init__(self, size:str):
        self.people = pd.read_csv(f"{CINEMAR_DIR}{size.lower()}/people.csv").to_numpy()
        self.movie = pd.read_csv(f"{CINEMAR_DIR}{size.lower()}/movies.csv").to_numpy()
        self.stars = pd.read_csv(f"{CINEMAR_DIR}{size.lower()}/stars.csv").to_numpy()

In [32]:
size = input("Select dataset size (small/large):")
data = CinemaDataset(size)

In [33]:
print(f"PEOPLE SHAPE: {data.people.shape}")
print(f"MOVIE SHAPE: {data.movie.shape}")
print(f"STARS SHAPE: {data.stars.shape}")

PEOPLE SHAPE: (1044499, 3)
MOVIE SHAPE: (344276, 3)
STARS SHAPE: (1189594, 2)


In [25]:
# This two actors supposedly have 2 Degrees of Separation
name1, name2 = "Robin Wright", "Bill Paxton"

In [7]:
# Get Actor/Actress PeopleID
mask = np.isin(element=data.people[:,1], test_elements=np.array([name1, name2]))
name1_ID, name2_ID = data.people[mask][0,0], data.people[mask][1,0]
print(f"Name1's PeopleID: {name1_ID}")
print(f"Name2's PeopleID: {name2_ID}")

a = np.isin(element=data.people[:,1], test_elements=np.array([name1]))
data.people[a][0,:][0]

Name1's PeopleID: 200
Name2's PeopleID: 705


705

In [8]:
# Get Actor/Actress Movies given its PeopleID
movies_index = np.where(data.stars[:,0] == name1_ID)[0].flatten()
movies = data.stars[movies_index,1]
print(f"Movie's ID from PeopleID #2:\n{movies}")

# Get Actors/Actresses from Name2_ID's Movies
movies_idx = np.where(data.stars[:,1] == movies[0])[0].flatten()
costars_id = [data.stars[idx,0] for idx in movies_idx]
print(costars_id)

mask = np.isin(element=data.people[:,0], test_elements=costars_id)
costars_records = data.people[mask]
costars_records

Movie's ID from PeopleID #2:
[112384]
[102, 158, 200, 641]


array([[102, 'Kevin Bacon', 1958],
       [158, 'Tom Hanks', 1956],
       [200, 'Bill Paxton', 1955],
       [641, 'Gary Sinise', 1955]], dtype=object)

In [34]:
def degrees_of_separation(Q: Queue, name1: str, name2: str, data:CinemaDataset):
    # Dict that holds graph information
    ds = {
        'status': False,                    # Tells whether target_x has a solution
        'came_from': {},                    # Keeps track of shortest path's nodes
        'cost_so_far': {},                  # Stores node's C(x) 
        'iterations':0,                     # Iterations passed until shortest path is found
        'visited_m':[],                     # Stores visited movies
        'shortest_path': [],                # Node-by-node path to shortest route
        'degrees': 0}                       # Degrees of Separations

    # Get Actor's PeopleID
    mask = np.isin(element=data.people[:,1], test_elements=np.array([name1, name2]))
    init_x, target_x = data.people[mask][0,0], data.people[mask][1,0]

    # BFS Start
    ds['came_from'][init_x] = None; ds['cost_so_far'][init_x] = 0
    Q.insert(init_x)

    while Q.size() != 0:
        # Gets current state
        x = Q.get_first(); ds['iterations'] += 1

        if x == target_x:
            # Write relevant Dijstra's results to graph dict
            ds['degrees'] = ds['cost_so_far'][x]; ds['shortest_path'].append(x); ds['status'] = True
            while x != init_x:
                prev_x = ds['came_from'][x]
                ds['shortest_path'].append(prev_x)
                x = prev_x
            ds['shortest_path'].reverse()
            return ds
        
        # Get Actor's Movies
        movies_index = np.where(data.stars[:,0] == x)[0].flatten()
        movies = data.stars[movies_index,1]

        for u in movies:
            if u in ds['visited_m']: continue
            # Get actors from movie
            actors_idx = np.where(data.stars[:,1] == u)[0].flatten()
            costars_id = [data.stars[idx,0] for idx in actors_idx]; costars_id.remove(x)

            for id in costars_id:
                ds['came_from'][id] = x
                ds['cost_so_far'][id] = ds['cost_so_far'][x] + 1 
                Q.insert(id)
            ds['visited_m'].append(u)

    return ds

In [37]:
p1 = input("Enter actor/actress Name 1: ")
p2 = input("Enter actor/actress Name 2: ")
ds = degrees_of_separation(Queue(), p1, p2, data)
print(f"Name 1: {p1}\nName 2: {p2}")
print(f"Couple of Actors has solution? : {Color.BOLD}{Color.GREEN if {ds['status']} else Color.RED} {ds['status']}{Color.END}")
print(f"Iterations to find shortest path: {Color.BOLD}{ds['iterations']}{Color.END}")
print(f"Node-by-Node Path: {f'{Color.CYAN} -> {Color.END}'.join([f'{Color.BOLD}{x}{Color.END}' for x in ds['shortest_path']])}")
print(f"Path Length: {Color.BOLD}{len(ds['shortest_path'])}{Color.END}")
print(f"Degrees of Separation: {Color.BOLD}{ds['degrees']}{Color.END}")

Name 1: Emma Watson
Name 2: Jennifer Lawrence
Couple of Actors has solution? : [1m[92m True[0m
Iterations to find shortest path: [1m3188[0m
Node-by-Node Path: [1m914612[0m[96m -> [0m[1m705356[0m[96m -> [0m[1m300712[0m[96m -> [0m[1m662160[0m[96m -> [0m[1m674781[0m[96m -> [0m[1m602[0m[96m -> [0m[1m329[0m[96m -> [0m[1m424216[0m[96m -> [0m[1m1275259[0m[96m -> [0m[1m1303[0m[96m -> [0m[1m995[0m[96m -> [0m[1m197[0m[96m -> [0m[1m138[0m[96m -> [0m[1m674782[0m[96m -> [0m[1m357979[0m[96m -> [0m[1m1872[0m[96m -> [0m[1m1721[0m[96m -> [0m[1m940158[0m[96m -> [0m[1m2225369[0m
Path Length: [1m19[0m
Degrees of Separation: [1m18[0m
