# Code to Scrape the Acting Landscape of the Most Popular Movies

## Import Required Libraries

In [1]:
import csv
import re
import requests
import threading
import json
from bs4 import BeautifulSoup
import urllib.request
import pandas as pd
import networkx as nx
import community as louvain
import matplotlib.pyplot as plt
from pyvis.network import Network
from matplotlib import pylab

## Define Static Variables

In [2]:
# the two halves of an IMDb full-credits URL; a numeric title id sits between them
# (compare the hard-coded test URL further down)
preurl = "https://www.imdb.com/title/tt"
posturl = "/fullcredits?ref_=tt_cl_sm"

# empty edge table with one column per end of an actor-to-actor link
network_table = pd.DataFrame(columns=["Actor1", "Actor2"])
network_table

Unnamed: 0,Actor1,Actor2


## Test Custom Functions

In [3]:
# scraping helpers live in the project's Utils module
from Utils import getURLs, retrieveCast, filterList, combinedRetrieve

# fixture: the full-credits page for a single title
test_url = "https://www.imdb.com/title/tt0076759/fullcredits?ref_=tt_cl_sm"
# NOTE: this is a second name for network_table, not a copy of it
test_table = network_table

# exercise retrieveCast on the fixture page and display the list it returns
test_list = retrieveCast(test_url)
test_list

Scraping cast list from: Star Wars (1977) 



[nan,
 'Mark Hamill',
 'Harrison Ford',
 'Carrie Fisher',
 'Peter Cushing',
 'Alec Guinness',
 'Anthony Daniels',
 'Kenny Baker',
 'Peter Mayhew',
 'David Prowse',
 'Phil Brown',
 'Shelagh Fraser',
 'Jack Purvis',
 'Alex McCrindle',
 'Eddie Byrne',
 'Drewe Henley',
 'Denis Lawson',
 'Garrick Hagon',
 'Jack Klaff',
 'William Hootkins',
 'Angus MacInnes',
 'Jeremy Sinden',
 'Graham Ashley',
 'Don Henderson',
 'Richard LeParmentier',
 'Leslie Schofield',
 'Rest of cast listed alphabetically:',
 'David Ankrum',
 'Mark Anthony Austin',
 'Scott Beach',
 'Lightning Bear',
 'Jon Berg',
 'Doug Beswick',
 'Paul Blake',
 'Janice Burchette',
 'Ted Burnett',
 'John Chapman',
 'Gilda Cohen',
 'Tim Condren',
 'Barry Copping',
 'Alfie Curtis',
 'Robert Davies',
 'Maria De Aragon',
 'Barbie Denham',
 'Frazer Diamond',
 'Peter Diamond',
 'Warwick Diamond',
 'Sadie Eden',
 'Kim Falkinburg',
 'Harry Fielder',
 'Anthony Forrest',
 'Ted Gagliano',
 'Salo Gardner',
 'Steve Gawley',
 'Barry Gnome',
 'Rusty Go

In [4]:
# Exercise filterList on the cast list returned by retrieveCast; the result is
# displayed as a table with Actor1 / Actor2 columns.
# NOTE(review): presumably one row per ordered pair of cast members -- confirm in Utils.
filterList(test_list)

Unnamed: 0,Actor1,Actor2
0,Mark Hamill,Harrison Ford
0,Mark Hamill,Carrie Fisher
0,Mark Hamill,Peter Cushing
0,Mark Hamill,Alec Guinness
0,Mark Hamill,Anthony Daniels
...,...,...
0,Fred Wood,Larry Ward
0,Fred Wood,Diana Sadley Way
0,Fred Wood,Harold Weed
0,Fred Wood,Bill Weston


## Retrieve Sample URLs

In [5]:
# Disabled on purpose: the URL list is read back from Data/sample_urls.json when
# the notebook is re-run. Uncomment to fetch a fresh list with getURLs().
#sample_urls = getURLs()

Save the URLs in JSON format

In [6]:
# One-off save step, kept for reference.
# NOTE(review): as written it targets ./sample_urls.json rather than the
# Data/sample_urls.json path read below, and it never closes out_file --
# adjust both before re-enabling.
#out_file = open("sample_urls.json", "w")
#json.dump(sample_urls, out_file)

# normal path on a re-run: load the URL list cached by an earlier scrape
with open("Data/sample_urls.json") as url_file:
    sample_urls = list(json.load(url_file))
sample_urls

['https://www.imdb.com/title/tt0111161/fullcredits?ref_=tt_cl_sm',
 'https://www.imdb.com/title/tt0068646/fullcredits?ref_=tt_cl_sm',
 'https://www.imdb.com/title/tt0252487/fullcredits?ref_=tt_cl_sm',
 'https://www.imdb.com/title/tt0108052/fullcredits?ref_=tt_cl_sm',
 'https://www.imdb.com/title/tt0468569/fullcredits?ref_=tt_cl_sm',
 'https://www.imdb.com/title/tt0050083/fullcredits?ref_=tt_cl_sm',
 'https://www.imdb.com/title/tt0071562/fullcredits?ref_=tt_cl_sm',
 'https://www.imdb.com/title/tt0167260/fullcredits?ref_=tt_cl_sm',
 'https://www.imdb.com/title/tt5354160/fullcredits?ref_=tt_cl_sm',
 'https://www.imdb.com/title/tt0110912/fullcredits?ref_=tt_cl_sm',
 'https://www.imdb.com/title/tt7466810/fullcredits?ref_=tt_cl_sm',
 'https://www.imdb.com/title/tt1375666/fullcredits?ref_=tt_cl_sm',
 'https://www.imdb.com/title/tt0137523/fullcredits?ref_=tt_cl_sm',
 'https://www.imdb.com/title/tt0120737/fullcredits?ref_=tt_cl_sm',
 'https://www.imdb.com/title/tt0109830/fullcredits?ref_=tt_cl_

## Run Multi-threaded Scraping Loop

Run a multithreaded script to scrape cast data from the IMDB URLs.

In [7]:
# Disabled scraping loop, kept for reference.
# It works through sample_urls in batches of at most 20: one thread per URL runs
# combinedRetrieve(url, dataframes), and every thread of a batch is joined before
# the next batch is started. sample_urls is consumed as the loop runs.
# NOTE(review): presumably each combinedRetrieve call appends its result to the
# shared dataframes list -- confirm in Utils.
#dataframes = []
#
#while len(sample_urls) > 0:
#    
#    threads = []
#    
#    if len(sample_urls) < 20:
#        batch_urls = sample_urls
#        sample_urls = []
#    else:
#        batch_urls = sample_urls[:20]
#        del sample_urls[:20]
#        
#    for url in batch_urls:
#        t = threading.Thread(target = combinedRetrieve, args = [url, dataframes])
#        threads.append(t)
#    
#    for i in range(0, len(batch_urls)):
#        threads[i].start()
#        
#    for i in range(0, len(batch_urls)):
#        threads[i].join()

## Aggregate into Final Network Table

With the dataframes of individual relationships in hand, we must then combine them into a single aggregated table of actors and co-stars.

In [8]:
# Disabled aggregation step, kept for reference: it stacks the scraped frames,
# tags every row with value = 1 and sums per (Actor1, Actor2) pair, so "value"
# ends up as the number of rows in which that pair appears. The result is then
# written to Data/Actor_Data.csv.
#network_table = pd.concat(dataframes)
#network_table["value"] = 1
#final_network_table = network_table.groupby(["Actor1","Actor2"], sort = False, as_index = False).sum()

#final_network_table.to_csv("Data/Actor_Data.csv", index = False)

In [9]:
# the aggregated edge list is cached on disk, so a re-run loads it instead of scraping again
final_network_table = pd.read_csv("Data/Actor_Data.csv")

# preview with the most frequently paired actors first
final_network_table.sort_values(by="value", ascending=False)

Unnamed: 0,Actor1,Actor2,value
1213337,Elton LeBlanc,Cynthia LeBlanc,36
1213234,Cynthia LeBlanc,Elton LeBlanc,36
284798,Mickie McGowan,Sherry Lynn,29
284875,Sherry Lynn,Mickie McGowan,29
284799,Mickie McGowan,Jack Angel,25
...,...,...,...
10552774,David J Biscoe,Gary Thompsett,1
10552773,David J Biscoe,Shaq Taylor,1
10552772,David J Biscoe,Corinne Swallow,1
10552770,David J Biscoe,Carl Robinson,1


## Network Visualisation

In [10]:
# Undirected co-star graph: actors are nodes, each table row is an edge, and the
# row's "value" is kept as the edge attribute "value". The table lists a pair in
# both directions (A,B and B,A); in an undirected graph those are the same edge.
G = nx.from_pandas_edgelist(
    final_network_table,
    source="Actor1",
    target="Actor2",
    edge_attr="value",
    create_using=nx.Graph(),
)

## Calculate Centrality Measures

### Degree Centrality

In [11]:
# Degree centrality for every actor in G, held as a {name: score} dict and
# displayed in full as the cell output.
degree_dict = nx.degree_centrality(G)
degree_dict

{'Martin Balsam': 0.0038381038420317263,
 'John Fiedler': 0.0017675478219882949,
 'Lee J. Cobb': 0.0005050136634252271,
 'E.G. Marshall': 0.0029571355625010523,
 'Jack Klugman': 0.0005947938702563786,
 'Edward Binns': 0.0017114351927188251,
 'Jack Warden': 0.0061892230084225055,
 'Henry Fonda': 0.0027382963083501205,
 'Joseph Sweeney': 0.0002973969351281893,
 'Ed Begley': 0.00044890103415575746,
 'George Voskovec': 0.00039839966781323474,
 'Robert Webber': 0.0009483034346540375,
 'Rudy Bond': 0.0012681454214900148,
 'Tom Gorman': 9.539146975809845e-05,
 'James Kelly': 9.539146975809845e-05,
 'Billy Nelson': 9.539146975809845e-05,
 'John Savoca': 9.539146975809845e-05,
 'Walter Stocker': 9.539146975809845e-05,
 'Doug E. Doug': 0.0007967993356264695,
 'Flex Alexander': 0.0008080218614803634,
 'Shaun Baker': 7.294641805031059e-05,
 'Lorraine Toussaint': 0.00366415469129637,
 'Yunoka Doyle': 7.294641805031059e-05,
 'Jason Bose Smith': 0.000533069978059962,
 'Sullivan Walker': 0.00074068670

In [12]:
# one row per actor (the index holds the name) with a single "centrality" column
degree_df = pd.DataFrame.from_dict(
    degree_dict,
    orient="index",
    columns=["centrality"],
)

# persist the scores; the index (actor names) is written as the first CSV column
degree_df.to_csv("Centrality Measures/Centrality.csv")

### Betweenness Centrality

In [None]:
# Exact betweenness needs a shortest-path search from every node, which is not
# practical here (the edge table has on the order of ten million rows), so the
# scores are estimated from a random sample of source nodes instead.
# Set BETWEENNESS_PIVOTS = None to run the exact computation.
BETWEENNESS_PIVOTS = 500
BETWEENNESS_SEED = 42  # fixed so the sampled estimate is the same on every run

if BETWEENNESS_PIVOTS is None:
    pivot_count = None
else:
    # networkx requires the sample size to be no larger than the node count
    pivot_count = min(BETWEENNESS_PIVOTS, G.number_of_nodes())

betweenness_dict = nx.betweenness_centrality(G, k=pivot_count, seed=BETWEENNESS_SEED)
betweenness_dict

In [None]:
# one row per actor (the index holds the name) with a single "centrality" column
betweenness_df = pd.DataFrame.from_dict(
    betweenness_dict,
    orient="index",
    columns=["centrality"],
)

# persist the scores; the index (actor names) is written as the first CSV column
betweenness_df.to_csv("Centrality Measures/Betweenness.csv")

### Closeness Centrality

In [None]:
# Closeness centrality for every node of G; the result is displayed in full as
# the cell output.
# NOTE(review): this presumably runs a shortest-path search from every node, so
# expect a very long run on the full graph -- confirm before executing.
closeness_dict = nx.closeness_centrality(G)
closeness_dict

In [None]:
# one row per actor (the index holds the name) with a single "centrality" column
closeness_df = pd.DataFrame.from_dict(
    closeness_dict,
    orient="index",
    columns=["centrality"],
)

# persist the scores; the index (actor names) is written as the first CSV column
closeness_df.to_csv("Centrality Measures/Closeness.csv")

## Visualise Sub-graphs

As the complete graph is so large, we will have to break it into communities of interest for visualisation. Then we can visualise the nodes of each community and save the PNG images.

In [None]:
# Louvain community detection is randomised: without a fixed seed the numeric
# community ids can change between runs, so hard-coded ids and the
# community_<id>.png file names would point at different groups each time.
COMMUNITY_SEED = 42
partition = louvain.best_partition(G, random_state=COMMUNITY_SEED)
# NOTE(review): best_partition reads edge weights from the "weight" attribute by
# default, while this graph stores co-star counts under "value", so the counts
# are presumably ignored here -- pass weight="value" if they should be used.


def visualise_subgraph(community, colour, seed=COMMUNITY_SEED):
    """Draw one community of ``G`` and save it as ``Images/community_<id>.png``.

    Parameters
    ----------
    community : int
        Community id, i.e. one of the values of the module-level ``partition``.
    colour : str
        Matplotlib colour used for the nodes.
    seed : int, optional
        Seed for the spring layout, so a given community is laid out the same
        way on every run.

    Returns
    -------
    None
        The figure is written to disk. When no node belongs to ``community``
        a message is printed and nothing is drawn or saved.
    """
    nodes_to_visualise = [
        node for node, community_id in partition.items() if community_id == community
    ]
    if not nodes_to_visualise:
        # an unknown id would otherwise produce a blank, misleading image
        print("No nodes found for community " + str(community) + "; nothing saved.")
        return

    subgraph = G.subgraph(nodes_to_visualise)

    # k sets the preferred spacing between nodes in the spring layout
    pos = nx.spring_layout(subgraph, k=0.1, seed=seed)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.axis('off')
    nx.draw_networkx_nodes(subgraph, pos, ax=ax, node_size=5, node_color=colour, alpha=0.8)
    nx.draw_networkx_edges(subgraph, pos, ax=ax, alpha=0.2)
    fig.savefig("Images/community_" + str(community) + ".png", transparent=False)

In [None]:
# community id -> node colour, for the five communities that get their own image:
# 1 blue, 2 red, 3 green, 4 orange, 5 purple
# NOTE(review): python-louvain appears to number communities from 0, so community 0
# is not drawn here -- confirm that skipping it is intentional.
communities = dict(enumerate(['blue', 'red', 'green', 'orange', 'purple'], start=1))

# render and save one image per selected community
for community, colour in communities.items():
    visualise_subgraph(community, colour)