# Celebrity social network analysis


## 1. Extract all the links from website.

In [2]:
import requests
import dill
from bs4 import BeautifulSoup
import re
import itertools
import networkx as nx
import heapq  

In [2]:
page =requests.get('https://web.archive.org/web/20150913224145/http://www.newyorksocialdiary.com/party-pictures') 

soup=BeautifulSoup(page.text, 'lxml')
print(soup.prettify())

<!DOCTYPE html>
<!--[if IEMobile 7]><html class="no-js ie iem7" lang="en" dir="ltr"><![endif]-->
<!--[if lte IE 6]><html class="no-js ie lt-ie9 lt-ie8 lt-ie7" lang="en" dir="ltr"><![endif]-->
<!--[if (IE 7)&(!IEMobile)]><html class="no-js ie lt-ie9 lt-ie8" lang="en" dir="ltr"><![endif]-->
<!--[if IE 8]><html class="no-js ie lt-ie9" lang="en" dir="ltr"><![endif]-->
<!--[if (gte IE 9)|(gt IEMobile 7)]><html class="no-js ie" lang="en" dir="ltr" prefix="fb: http://ogp.me/ns/fb# og: http://ogp.me/ns# article: http://ogp.me/ns/article# book: http://ogp.me/ns/book# profile: http://ogp.me/ns/profile# video: http://ogp.me/ns/video# product: http://ogp.me/ns/product#"><![endif]-->
<!--[if !IE]><!-->
<html class="no-js" dir="ltr" lang="en" prefix="fb: http://ogp.me/ns/fb# og: http://ogp.me/ns# article: http://ogp.me/ns/article# book: http://ogp.me/ns/book# profile: http://ogp.me/ns/profile# video: http://ogp.me/ns/video# product: http://ogp.me/ns/product#">
 <!--<![endif]-->
 <head>
  <script src

In [4]:
#urls are in the <div class="views-row">

links=soup.select('div.views-row')

link=links[0].prettify()
print(link)

<div class="views-row views-row-1 views-row-odd views-row-first">
 <span class="views-field views-field-title">
  <span class="field-content">
   <a href="/web/20150913224145/http://www.newyorksocialdiary.com/party-pictures/2015/kicks-offs-sing-offs-and-pro-ams">
    Kicks offs, sing offs, and pro ams
   </a>
  </span>
 </span>
 <span class="views-field views-field-created">
  <span class="field-content">
   Friday, September 11, 2015
  </span>
 </span>
</div>



In [6]:
#function to get the exact url from <a href=>
def get_link(el):
    """
    Return urls
    Args:
    el: html element
    Return:
    url: url link
    date: url link created date
    """
    url=el.select('a')[0]['href']
    return url


#get all the urls for one party page

def get_links(response):
    """
    Returns a list of urls.
    Args:
    response: request context
    Return:
    a list of url, date pairs.
    """
    soup = BeautifulSoup(response.text, "lxml")
    links= soup.select('div.views-row')
    return [get_link(el) for el in links]

In [7]:
get_links(page)[0:3]

['/web/20150913224145/http://www.newyorksocialdiary.com/party-pictures/2015/kicks-offs-sing-offs-and-pro-ams',
 '/web/20150913224145/http://www.newyorksocialdiary.com/party-pictures/2015/grand-finale-of-the-hampton-classic-horse-show',
 '/web/20150913224145/http://www.newyorksocialdiary.com/party-pictures/2015/riders-spectators-horses-and-more']

In [8]:
#Now parse all of the party urls from all the pages.
#26 pages, each page about 50 party links.

from requests_futures.sessions import FuturesSession
LIMIT = 26
url = "https://web.archive.org/web/20150913224145/http://www.newyorksocialdiary.com/party-pictures"
def get_page_args(i):
    return {"url": url,
            "params": {"page": i}}
session = FuturesSession(max_workers=5)
link_list=[link 
         for future in [session.get(**get_page_args(i)) for i in range(LIMIT)]
         for link in get_links(future.result())]

In [9]:
len(link_list)

1223

In [10]:
#save the link_list in case of restarting the notebook.
dill.dump(link_list, open('link_list.pkd', 'wb'))

## 2. Parse photo captions from all the photos.

In [27]:
#function to parse photo captions from a page.
def get_captions(path):
    """
    Return photo captions from a url link page.
    Args:
    path: url link
    return: a list of strings
    """
    path="https://web.archive.org"+path
    response=requests.get(path)
    soup=BeautifulSoup(response.text)
    photocaptions=soup.select('.photocaption')
    captions=[]
    for caption in photocaptions:
        captions.append(caption.text)
    return captions

#parse photo captions from all the pages.
caption_all=[]
for link in link_list:
    caption_all=caption_all+get_captions(link)

#store all the captions as .pkd file    
dill.dump(caption_all, open('caption_all.pkd', 'wb'))

In [29]:
len(caption_all)

128925

## 3. Parse names in the photo captions

In [50]:
#function to parse names in the photo captions.
def names_in_captions(caption_all, skipset):
    """
    Process the photo captions to get a list of lists. In the sublist, it contains names appear in the same photo.
    Args:
    captions_all: a list of strings
    skipset: customed stop words
    Return:
    a list of string lists 
    """
    nameout=[]
    
    splitstr='(?:, |(?<!\\w)and |with )+' #
    splitre=re.compile(splitstr)
    
    skipstr='(?:\\n| )+|\(.*\)'
    for s in skipset:
        skipstr+='|'+s+' '
    skipre=re.compile(skipstr)
    
    for content in caption_all:
        names_after_skip=skipre.sub(' ', content)
        namesin=[]
        split_names=splitre.split(names_after_skip)
        for name in filter(None,split_names):
            name=name.strip()
            if name=='' or name in skipset:
                continue
            namesin.append(name)
        #to parse "John and Florence White" to get "John White" and "Florence White" seperatively
        for i in range(len(namesin)-1):
            if re.search(r'\s',namesin[i+1]) and not re.search(r'\s',namesin[i]):
                namesin[i]=re.sub(r'\w+',namesin[i],namesin[i+1],1)
        if namesin==[]:
            continue
        nameout.append(namesin)
    return nameout

In [51]:
skip={'family', 'families', 'Dr.', 'Mayor', 'MD', 'M.D.', 'friend', 'friends', 'Jr.', 'Sr.', 'Mr.', 'Ms.', 'Mrs.', 'PhD', 'Ph.D.', 'Chair', 'Co-chair', 'Co-chairs', 'President', 'CEO', 'Guest', 'guest', 'Guests', 'guests', 'children', 'child', 'his wife', 'wife', 'her husband', 'husband', 'Hospital for Special Surgery'}
names_in_captions=names_in_captions(caption_all, skip)
dill.dump(names_in_captions[1:], open('names_in_captions.pkd', 'wb'))

In [5]:
names_in_captions=dill.load(open('names_in_captions.pkd', 'rb'))

In [6]:
len(names_in_captions)

107617

## 4. Social network of celebrities 

In [7]:
#Celebrities appear in the same photo are considered knowing each other and have connections.
#function to build a social network.
def socialNetwork(namelists):
    """
    process the relation between names appeared in the same photo captions.
    Args:
    namelists: a list of string lists, sublist has celebrity names appeared in the same photo.
    Return:
    a list of tuples: first element is celebrity name, second is number of how many friends the celebrity knows.
    """
    G=nx.Graph()
    for namelist in namelists:
        edges=itertools.combinations(namelist, 2)
        for edge in edges:
            u=edge[0]
            v=edge[1]
            if G.has_edge(u,v):
                G[u][v]['weight']+=1
            else:
                G.add_edge(u, v, weight=1)
    return G

socialNetwork=socialNetwork(names_in_captions)

celebrity_network=list(socialNetwork.degree(socialNetwork.nodes, weight='weight'))
heapq.nlargest(10, celebrity_network, key=lambda x: x[1])

[('Jean Shafiroff', 769),
 ('Gillian Miniter', 539),
 ('Mark Gilbertson', 529),
 ('Alexandra Lebenthal', 438),
 ('Geoffrey Bradfield', 398),
 ('Yaz Hernandez', 363),
 ('Eleanora Kennedy', 349),
 ('Kamie Lightburn', 334),
 ('Somers Farkas', 331),
 ('Debbie Bancroft', 330)]

The most popular celebrity is Jean Shafiroff, who is an American philanthropist, author and socialite, and serves on philanthropic boards in New York City and Southampton.


![title](JeanShafiroff.jpg)

A similar way to determine popularity is to look at their [PageRank](http://en.wikipedia.org/wiki/PageRank). PageRank was developed by Google to rank web pages in their search engine results. It works by counting the numbers and quality of links to a page to determine a rough estimate of how important the website is. A basic tutorial can be found on [YouTube](https://www.youtube.com/watch?v=qxEkY8OScYY).

In [8]:
pagerank=nx.pagerank(socialNetwork)

In [10]:
top10=[(k, pagerank[k]) for k in heapq.nlargest(10, pagerank, key=lambda x: pagerank[x])]
top10

[('Jean Shafiroff', 0.0006711864516026716),
 ('Mark Gilbertson', 0.0004791469675286675),
 ('Gillian Miniter', 0.00040401808163877893),
 ('Geoffrey Bradfield', 0.00036342809842313743),
 ('Alexandra Lebenthal', 0.00034888804325300046),
 ('Yaz Hernandez', 0.000308131240746186),
 ('Andrew Saffir', 0.00029937461963079674),
 ('Kamie Lightburn', 0.00029328308624227426),
 ('Sharon Bush', 0.00028523043792204095),
 ('Eleanora Kennedy', 0.00027831370818464735)]

## 5. Best Friends

In [64]:
#two people who appear in the most photos are considered best friends.

best_friends=[(e[:2],e[2]['weight']) for e in heapq.nlargest(10, socialNetwork.edges(data=True), key=lambda x: x[2]['weight'])]

best_friends

[(('Gillian Miniter', 'Sylvester Miniter'), 117),
 (('Bonnie Comley', 'Stewart Lane'), 79),
 (('Jamee Gregory', 'Peter Gregory'), 73),
 (('Geoffrey Bradfield', 'Roric Tobin'), 64),
 (('Andrew Saffir', 'Daniel Benedict'), 64),
 (('Donald Tober', 'Barbara Tober'), 55),
 (('Jonathan Farkas', 'Somers Farkas'), 53),
 (('Jean Shafiroff', 'Martin Shafiroff'), 51),
 (('Alexandra Lebenthal', 'Jay Diamond'), 46),
 (('Michael Kennedy', 'Eleanora Kennedy'), 46)]

It appears that top ten best friends are either couples or business partners.