In [18]:
from py2neo import Graph
import pandas as pd
import numpy as np

from datetime import date
import datetime as dt
import time
import re

from bs4 import BeautifulSoup

import progressbar

pd.set_option('display.max_columns', None)

## Web scraping

In [2]:
# download chromedriver executable from https://chromedriver.chromium.org/home

# add chromedriver path
import sys
path = '/Users/catarina/Projects/bin'
sys.path.append(path)


# 755 is the default numerical permission for files in usr/bin
# chromedriver needs a numerical permission equivalent to or greater than 755
import os
os.chmod(path,755) #664


# # configure webdriver to use browser
from selenium import webdriver

# driver = webdriver.Firefox(executable_path = path + '/geckodriver')
# driver = webdriver.Chrome(executable_path = path + '/chromedriver')

# driver.close()

In [142]:
def get_soup_content(website):
    
    driver = webdriver.Firefox(executable_path = path + '/geckodriver')
    
    driver.get(website)
    
    last_height = driver.execute_script("return document.body.scrollHeight")
    
    # scroll page until the end
    
    while True:
        
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        
        time.sleep(5)
        
        new_height = driver.execute_script("return document.body.scrollHeight")        
        
        if new_height == last_height:
            break
            
        last_height = new_height
        
        
    content = driver.page_source
    
    soup = BeautifulSoup(content)
    
    return soup, driver




def extract_personal_info(soup):
    
    info = {}
    
    has_age = soup.find("td", text="Age")
    if has_age:
        info['age'] = has_age.find_next_sibling("td").text.strip().split(' ')[0]
        
        
    has_birthday = soup.find("td", text="Birthday")
    if has_birthday:
        info['birthday'] = has_birthday.find_next_sibling("td").text.strip()
        
        
    has_height = soup.find("td", text="Height")
    if has_height:
        info['height_cm'] = has_height.find_next_sibling("td").text.strip().split('(')[1].split(' ')[0]
        
        
    has_weight = soup.find("td", text="Weight")
    if has_weight:
        info['weight_kg'] = has_weight.find_next_sibling("td").text.strip().split('(')[1].split(' ')[0]
        
        
    has_eye_color = soup.find("td", text="Eye Color")
    if has_eye_color:
        info['eye_color'] = has_eye_color.find_next_sibling("td").text.strip().lower()
        
    
    has_hair_color = soup.find("td", text="Hair Color")
    if has_hair_color:
        info['hair_color'] = has_hair_color.find_next_sibling("td").text.strip().lower()      
        
        
    has_sign = soup.find("td", text="Zodiac Sign")
    if has_sign:
        info['sign'] = has_sign.find_next_sibling("td").text.strip().lower()
        

    has_sexuality = soup.find("td", text="Sexuality")
    if has_sexuality:
        info['sexuality'] = has_sexuality.find_next_sibling("td").text.strip().lower()
        
        
    has_ethnicity = soup.find("td", text="Ethnicity")
    if has_ethnicity:
        info['ethnicity'] = has_ethnicity.find_next_sibling("td").text.strip().lower()
        
        
    has_nationality = soup.find("td", text="Nationality")
    if has_nationality:
        info['nationality'] = has_nationality.find_next_sibling("td").text.strip().lower()
        
        
    has_occupation = soup.find("td", text="Occupation")
    if has_occupation:
        info['occupation'] = has_occupation.find_next_sibling("td").text.strip().lower()
        

    has_religion = soup.find("td", text="Religion")
    if has_religion:
        info['religion'] = has_religion.find_next_sibling("td").text.strip().lower()
        
    
    return info


def extract_dating_history(soup):

    relationships = []

    for match in soup.findAll('td', text = re.compile('\t\tRelationship\n|\t\tEncounter\n|\t\tMarried\n')):
       
        is_rumour = match.find_next_sibling("td").text.strip() == 'R'
    
        if is_rumour:
            continue
        
        name = match.find_previous_sibling("td").text.strip()
        start = match.find_next_sibling("td").find_next_sibling("td")
        end = start.find_next_sibling("td")
        duration = end.find_next_sibling("td").text.strip()
        match_url = match.find_previous_sibling("td").find("a")['href']

        relationships.append({'name': name, 
                              'type': match.text.strip(),
                              'start': start.text.strip(), 
                              'end': end.text.strip(), 
                              'duration': duration,
                              'url': match_url
                             })

    return relationships



def extract_person_data(url):

    soup, driver = get_soup_content(url)
    
    driver.close()

    i = extract_personal_info(soup)

    r = extract_dating_history(soup)

    i['name'] = url_to_name(url)

    i['relationships'] = r

    d = pd.DataFrame.from_dict(i, orient = 'index').transpose()
    
    return d


In [140]:
def get_n_most_popular_celebrities(driver, n):

    # scroll page until listing at least 1000 celebrities

    while True:
        
        first_height = driver.execute_script("return document.body.scrollHeight")

        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        
        time.sleep(5)
        
        last_height = driver.execute_script("return document.body.scrollHeight")

        content = driver.page_source

        soup = BeautifulSoup(content)

        last_item = soup.findAll('i', attrs={'class':'icon-chart-line'})[-1]

        nr_items = int(last_item.find_next_sibling("span").text.replace(',',''))

        if (nr_items >= n) or (first_height == last_height):
            
            break


    # extract url for details on each celebrity

    urls = []

    for item in soup.findAll('li', attrs={'class':'ff-grid-box ff-list'}):
        urls.append(item.find('a')['href'])


    return urls, nr_items




def url_to_name(url):
    return (' ').join([w.capitalize() for w in (url.split('/')[-1].split('-'))])


def name_to_url(name):
    return name.split('(')[0].strip().lower().replace(' ','-')    




def build_dataset(urls, df):
    
    if len(df) > 0:
        
        final_df = df.copy()
        
    else:
    
        final_df = pd.DataFrame(columns = ['age', 'birthday', 'height_cm', 'weight_kg', 'eye_color', 
                                           'hair_color', 'sign', 'sexuality', 'ethnicity', 'nationality',
                                           'occupation', 'religion', 'name', 'relationships'])
    
    
    bar = progressbar.ProgressBar(maxval=len(urls), 
                                  widgets=[progressbar.Bar('=', '[', ']'), ' ', 
                                           progressbar.Percentage()])
    
    bar.start()
    
    # get information from all urls
    
    for idx, url in enumerate(urls):
        
        try:
        
            df = extract_person_data(url)

            if df.loc[0,'name'] not in final_df.name.unique():

                final_df = pd.concat([final_df, df] , axis = 0, ignore_index = True)


            # extract info from all the relationships of this celebrity if they are not already stored

            for relationship in df.relationships[0]:

                if relationship['name'] in final_df.name.unique():
                    continue

                df = extract_person_data(relationship['url'])
                final_df = pd.concat([final_df, df] , axis = 0, ignore_index = True)     


        except Exception as ex:
            
            print(ex)
            
            bar.finish()
            
            return final_df
        
        
        bar.update(idx + 1)
        
    
    bar.finish()       
        
    return final_df



### Web scrape urls pages of the most popular celebrities

In [35]:
# scrolling down is not always activated - manually ensure it is and restart webdriver if needed

driver = webdriver.Firefox(executable_path = path + '/geckodriver')

website = "https://www.whosdatedwho.com/popular"

driver.get(website)

content = driver.page_source

In [11]:
# run web scraping code

urls, nr_items = get_n_most_popular_celebrities(driver, n = 1000)
print(nr_items)

1056


In [16]:
# save data

import csv

with open("popular_urls.csv","w") as f:
    wr = csv.writer(f,delimiter="\n")
    wr.writerow(urls)

### Web scrape personal info on each celebrity

In [143]:
# extract information from all celebrities in the urls list and from the people their were involved with

data = build_dataset(urls, pd.DataFrame())



In [145]:
# save data

data.to_csv('relationships.csv', index = False)

## Data cleaning and processing

In [280]:
data = pd.read_csv('relationships.csv')

In [319]:
data_ = data.drop_duplicates().reset_index(drop = True)

In [315]:
### Data correction

data_.loc[data_.name == 'Rafael Cebrian','birthday'] = '15th October, 1989'
data_.loc[data_.name == 'Ryan Press','birthday'] = '24th October, 1979'
data_.loc[data_.name == 'Jessica Vargas','birthday'] = '23th October, 1995'
data_.loc[data_.name == 'Dave Gardner','birthday'] = '17th September, 1976'
data_.loc[data_.name == 'Tommy Alastra','birthday'] = '14th February, 1976'
data_.loc[data_.name == 'Karolyn Pho','birthday'] = '19th January, 1998'
data_.loc[data_.name == 'Bonita','birthday'] = '12th December, 1995'
data_.loc[data_.name == 'Kaitlin Najjar','birthday'] = '23rd May, 1995'
data_.loc[data_.name == 'Natt Weller','birthday'] = '10th May, 1995'
data_.loc[data_.name == 'Kaitlin Najjar','birthday'] = '10th May, 1995'
data_.loc[data_.name == 'Sophie Coady','birthday'] = '10th November, 1991'
data_.loc[data_.name == 'Victor Turpin','birthday'] = '4th March, 1982'
data_.loc[data_.name == 'Jack Street','birthday'] = '25th September, 1988'
data_.loc[data_.name == 'Viktoria Alexeeva','birthday'] = '13th April, 1995'
data_.loc[data_.name == 'Hayes Hargrove','birthday'] = '20th December, 1979'
data_.loc[data_.name == 'Cisco Rosado','birthday'] = '29th June, 1979'
data_.loc[data_.name == 'Cherie Thibodeaux','birthday'] = '11th September, 1975'

In [308]:
def process_datetime(date):
    
    try:
    
        date_components = date.replace(',','').split(' ')

        date_components = ([date_components[0].replace('st', '').replace('nd', '').replace('rd', '').replace('th', '')]
                           + date_components[1:])

        date = "-".join(date_components)
        
    except:
        
        return np.nan
    
    return date
    

In [443]:
def zodiac_sign(day, month): 
    # checks month and date within the valid range 
    # of a specified zodiac 
    if month == 12: 
        return 'sagittarius' if (day < 22) else 'capricorn'

    elif month == 1: 
        return 'capricorn' if (day < 20) else 'aquarius'

    elif month == 2: 
        return 'aquarius' if (day < 19) else 'pisces'

    elif month == 3: 
        return 'pisces' if (day < 21) else 'aries'

    elif month == 4: 
        return 'aries' if (day < 20) else 'taurus'

    elif month == 5: 
        return 'taurus' if (day < 21) else 'gemini'

    elif month == 6: 
        return 'gemini' if (day < 21) else 'cancer'

    elif month == 7: 
        return 'cancer' if (day < 23) else 'leo'

    elif month == 8: 
        return 'leo' if (day < 23) else 'virgo'

    elif month == 9: 
        return 'virgo' if (day < 23) else 'libra'

    elif month == 10: 
        return 'libra' if (day < 23) else 'scorpio'

    elif month == 11: 
        return 'scorpio' if (day < 22) else 'sagittarius'
    
    elif day == np.nan or month == np.nan:
        return np.nan
    
    
    
def calculate_age(birthdate): 
    
    if pd.isnull(birthdate):
        return np.nan

    year = birthdate.year
    month = str(birthdate.month)
    day = str(birthdate.day)
    
    age = 2020 - year + int(pd.to_datetime('2020-'+month+'-'+day) <= pd.to_datetime(date.today()))
    
    return age


In [444]:
data_1 = data_.copy()
data_1['birthday'] = data_1['birthday'].apply(process_datetime)
data_1['birthday'] = pd.to_datetime(data_1['birthday'], format = '%d-%B-%Y')

data_1['sign'] = data_1.birthday.apply(lambda bd: zodiac_sign(bd.day, bd.month))

data_1['age'] = data_1.birthday.apply(calculate_age)

data_1['n_relationships'] = proc_data.relationships.apply(lambda l: len(eval(l)))

data_1 = data_1.rename(columns = {'birthday': 'date_of_birth'})

In [484]:
proc_data = data_1.copy()

## Create graph data model

In [528]:
# Connect to local database 'Kaggle Movie Database' and upload data
# extracted from https://www.kaggle.com/rounakbanik/the-movies-dataset

graph = Graph("bolt://localhost:7687", auth=("neo4j", "gossip"))

In [519]:
graph.run("CREATE CONSTRAINT UniquePersonNameConstraint ON (p:Person) ASSERT p.name IS UNIQUE")

In [520]:
# Check creation of constraints
graph.run("CALL db.constraints()").data()

[{'name': 'UniquePersonNameConstraint',
  'description': 'CONSTRAINT ON ( person:Person ) ASSERT (person.name) IS UNIQUE',
  'details': "Constraint( id=2, name='UniquePersonNameConstraint', type='UNIQUENESS', schema=(:Person {name}), ownedIndex=1 )"}]

### Import Person nodes

In [541]:
for index, row in proc_data.iterrows():
    graph.run('''
        MERGE (p:Person {name:$name})
            ON CREATE SET
                  p.name = $name,
                  p.dateOfBirth = $date_of_birth,
                  p.age = toInteger($age),
                  p.heightCm = toFloat($height_cm),
                  p.weightKg = toFloat($weight_kg),
                  p.eyeColor = $eye_color,
                  p.hairColor = $hair_color,
                  p.sign = $sign,
                  p.sexuality = $sexuality,
                  p.ethnicity = $ethnicity,
                  p.nationality = $nationality,
                  p.occupation = $occupation,
                  p.religion = $religion,
                  p.n_relationships = toInteger($n_relationships)
                  ''', 
        parameters = {
          'name': row['name'],
          'date_of_birth': str(row.date_of_birth),
          'age': row.age,
          'height_cm': row.height_cm,
          'weight_kg': row.weight_kg,
          'eye_color': row.eye_color,
          'hair_color': row.hair_color,
          'sign': row.sign,
          'sexuality': row.sexuality,
          'ethnicity': row.ethnicity,
          'nationality': row.nationality,
          'occupation': row.occupation,
          'religion': row.religion,
          'n_relationships': row.n_relationships
        })
    
    
# check creation of movie nodes

graph.run('match (p:Person) return count(p)').data()

[{'count(p)': 5226}]

### Import relationships

In [542]:
for index, row in proc_data.iterrows():
    graph.run('''
        WITH apoc.convert.fromJsonList($relationships) AS relationships
        UNWIND relationships AS relationships_map
        WITH relationships_map['name'] AS p2_name,
             relationships_map['type'] AS type,
             relationships_map['start'] AS start,
             relationships_map['end'] AS end,
             relationships_map['duration'] AS duration

        
        MATCH (p1:Person {name:$p1_name}), (p2:Person {name:p2_name})
        
        MERGE (p1)-[r:RELATIONSHIP]-(p2)
            ON CREATE SET
                r.type = type,
                r.start = start,
                r.end = end,
                r.duration = duration
                  ''', 
              
        parameters = {
          'p1_name': row['name'],
          'relationships': row.relationships
        })
    
    
# check creation of movie nodes

graph.run('MATCH (:Person)-[r:RELATIONSHIP]-(:Person) RETURN COUNT(r)').data()

[{'COUNT(r)': 14076}]

## Data analytics

People that shared more relationships

In [558]:
query = '''
MATCH p = (p1:Person)-[r1:RELATIONSHIP]-(p2:Person)-[r2:RELATIONSHIP]-(p3:Person)
WHERE p1<>p3 and p1.name < p3.name //and p1.age <= 35 and p3.age <= 35
WITH p1, p3, count(distinct(p)) AS c
RETURN p1.name, p3.name, c ORDER BY c DESC LIMIT 10
'''

graph.run(query).data()

[{'p1.name': 'Ava Gardner', 'p3.name': 'Lana Turner', 'c': 24},
 {'p1.name': 'Jack Nicholson', 'p3.name': 'Warren Beatty', 'c': 12},
 {'p1.name': 'Lindsay Lohan', 'p3.name': 'Paris Hilton', 'c': 10},
 {'p1.name': 'Ryan O Neal', 'p3.name': 'Warren Beatty', 'c': 10},
 {'p1.name': 'Mick Jagger', 'p3.name': 'Warren Beatty', 'c': 10},
 {'p1.name': 'Frank Sinatra', 'p3.name': 'John F Kennedy', 'c': 9},
 {'p1.name': 'Lana Turner', 'p3.name': 'Marlene Dietrich', 'c': 9},
 {'p1.name': 'Ava Gardner', 'p3.name': 'Marilyn Monroe', 'c': 9},
 {'p1.name': 'Robert Evans', 'p3.name': 'Warren Beatty', 'c': 9},
 {'p1.name': 'David Bowie', 'p3.name': 'Mick Jagger', 'c': 8}]

People up to 40 years old that shared more relationships

In [560]:
query = '''
MATCH p = (p1:Person)-[r1:RELATIONSHIP]-(p2:Person)-[r2:RELATIONSHIP]-(p3:Person)
WHERE p1<>p3 and p1.name < p3.name and p1.age <= 40 and p3.age <= 40
WITH p1, p3, count(distinct(p)) AS c
RETURN p1.name, p3.name, c ORDER BY c DESC LIMIT 10
'''

graph.run(query).data()

[{'p1.name': 'Lindsay Lohan', 'p3.name': 'Paris Hilton', 'c': 10},
 {'p1.name': 'Aubrey Graham', 'p3.name': 'James Harden', 'c': 6},
 {'p1.name': 'Lil Wayne', 'p3.name': 'Soulja Boy', 'c': 6},
 {'p1.name': 'Shad Moss', 'p3.name': 'Yung Berg', 'c': 5},
 {'p1.name': 'Lindsay Lohan', 'p3.name': 'Sienna Miller', 'c': 5},
 {'p1.name': 'Jenna Shea', 'p3.name': 'Kat Stacks', 'c': 5},
 {'p1.name': 'Rob Kardashian', 'p3.name': 'Soulja Boy', 'c': 5},
 {'p1.name': 'French Montana', 'p3.name': 'James Harden', 'c': 4},
 {'p1.name': 'Alexis Sky', 'p3.name': 'Blac Chyna', 'c': 4},
 {'p1.name': 'James Harden', 'p3.name': 'Lil Wayne', 'c': 4}]

In [563]:
# visualize nodes

query = '''
MATCH (p1:Person {name:$name1})-[r1:RELATIONSHIP]-(p3)-[r2:RELATIONSHIP]-(p2:Person {name:$name2}) 
return p1, p2, p3, r1, r2
'''

graph.run(query, parameters = {'name1': 'Rihanna', 'name2': 'Rita Ora'})

<py2neo.database.Cursor at 0x12504d438>

In [None]:
# how many squares of data?

query = '''
MATCH (p1:Person)-[:RELATIONSHIP]-(p2:Person)-[:RELATIONSHIP]-(p3:Person)-[:RELATIONSHIP]-(p4:Person)-[:RELATIONSHIP]-(p1) 
WHERE p1 <> p3 and p2 <> p4 and p1.name < p3.name and p2.name < p4.name
return p1, p2, p3, p4 limit 10
'''

### Graph algorithms

#### Community

In [565]:
query = '''
CALL gds.graph.create(
    'myGraph',
    'Person',
    {
        RELATIONSHIP: {
            orientation: 'UNDIRECTED'
        }
    }
)
'''

graph.run(query).data()

[{'nodeProjection': {'Person': {'properties': {}, 'label': 'Person'}},
  'relationshipProjection': {'RELATIONSHIP': {'orientation': 'UNDIRECTED',
    'aggregation': 'DEFAULT',
    'type': 'RELATIONSHIP',
    'properties': {}}},
  'graphName': 'myGraph',
  'nodeCount': 5226,
  'relationshipCount': 14076,
  'createMillis': 70}]

In [566]:
query = '''
CALL gds.louvain.write.estimate('myGraph', { writeProperty: 'community' })
YIELD nodeCount, relationshipCount, bytesMin, bytesMax, requiredMemory
'''

graph.run(query).data()

[{'nodeCount': 5226,
  'relationshipCount': 14076,
  'bytesMin': 340049,
  'bytesMax': 1903232,
  'requiredMemory': '[332 KiB ... 1858 KiB]'}]

In [567]:
query = '''
CALL gds.louvain.write('myGraph', { writeProperty: 'community' })
YIELD communityCount, modularity, modularities
'''

graph.run(query).data()

[{'communityCount': 289,
  'modularity': 0.7795516802748462,
  'modularities': [0.543530377173047,
   0.7367617725107102,
   0.7644527200652552,
   0.7795516802748462]}]

In [572]:
query = '''
CALL gds.triangleCount.stream('myGraph')
YIELD nodeId, triangleCount
WITH nodeId, triangleCount WHERE triangleCount > 0
RETURN gds.util.asNode(nodeId).name AS name, triangleCount
ORDER BY triangleCount DESC
'''

graph.run(query).data()

[{'name': 'David Bowie', 'triangleCount': 5},
 {'name': 'Missy Elliott', 'triangleCount': 3},
 {'name': 'Da Brat', 'triangleCount': 3},
 {'name': 'Rocco Siffredi', 'triangleCount': 3},
 {'name': 'Jesse Jane', 'triangleCount': 3},
 {'name': 'Marilyn Monroe', 'triangleCount': 2},
 {'name': 'Joan Crawford', 'triangleCount': 2},
 {'name': 'Marlon Brando', 'triangleCount': 2},
 {'name': 'Howard Hughes', 'triangleCount': 2},
 {'name': 'Gianna Michaels', 'triangleCount': 2},
 {'name': 'Britney Stevens', 'triangleCount': 2},
 {'name': 'Belladonna', 'triangleCount': 2},
 {'name': 'Joan Jett', 'triangleCount': 2},
 {'name': 'Trina', 'triangleCount': 2},
 {'name': 'Lori Mattox', 'triangleCount': 2},
 {'name': 'Queenie', 'triangleCount': 2},
 {'name': 'Mick Ronson', 'triangleCount': 2},
 {'name': 'Iggy Pop', 'triangleCount': 2},
 {'name': 'Errol Flynn', 'triangleCount': 2},
 {'name': 'Laurence Olivier', 'triangleCount': 2},
 {'name': 'Karrine Steffans', 'triangleCount': 2},
 {'name': 'Kristen Stew