# Facebook Profile Page Scraping

The purpose of this notebook is to collect and structure the public information found on a public Facebook profile page, using Selenium and BeautifulSoup.

In [1]:
# Import packages and methods for analyzing html code from websites
import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString

# Import packages and methods for automating website navigation with selenium and chromedriver
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

# Process output as json file
import json

## Configure parameters

In [10]:
# Total number of posts to collect from the profile
NUM_POSTS = 50

# Url of the public Facebook profile to scrape
URL = "https://www.facebook.com/stevejobsfilm"

# Location of the chromedriver executable on the local machine
CHROMEDRIVER_PATH = "/Users/alex/Downloads/chromedriver"

## Define support functions for notebook

In [11]:
# Perform click action on a button and report whether it succeeded
def click_expand_button(button):
    """Click ``button`` and return whether the click went through.

    Parameters
    ----------
    button : object
        Any object exposing a ``click()`` method (a Selenium element in this notebook).

    Returns
    -------
    bool
        True when ``button.click()`` returned normally, False when it raised
        any ``Exception``.
    """
    try:
        button.click()
    # Catch Exception instead of using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit still stop the notebook when requested.
    except Exception:
        return False
    return True

# Perform hover action over date element in a post with a shared related post to load its url
def show_post_url(i):
    """Hover over the date link of the shared post embedded in the i-th post.

    Relies on the module-level ``driver`` (the running chromedriver session).

    Parameters
    ----------
    i : int
        Position of the post inside the list of post containers found in the feed.

    Returns
    -------
    bool
        True when the date link was located and hovered, False when any step
        raised an ``Exception`` (element not found, index out of range, ...).
    """
    try:
        # Locate the feed container, then the i-th post inside it
        feed = driver.find_element(By.CSS_SELECTOR, 'div.x9f619.x1n2onr6.x1ja2u2z.xeuugli.x1iyjqo2.xs83m0k.x1xmf6yo.x1emribx.x1e56ztr.x1i64zmx.xjl7jj.x19h7ccj.x65f84u')
        post = feed.find_elements(By.CSS_SELECTOR, 'div.x9f619.x1n2onr6.x1ja2u2z.x1jx94hy.x1qpq9i9.xdney7k.xu5ydu1.xt3gfkd.xh8yej3.x6ikm8r.x10wlt62.xquyuld')[i]

        # Identify date element within a shared post
        shared_header = post.find_element(By.CSS_SELECTOR, 'div.x1a8lsjc.x1swvt13.x1pi30zi')
        date_link = shared_header.find_element(By.CSS_SELECTOR, 'a.x1i10hfl.xjbqb8w.x1ejq31n.xd10rxx.x1sy0etr.x17r0tee.x972fbf.xcfux6l.x1qhh985.xm0m39n.x9f619.x1ypdohk.xt0psk2.xe8uvvx.xdj266r.x11i5rnm.xat24cr.x1mh8g0r.xexx8yu.x4uap5.x18d9i69.xkhd6sd.x16tdsg8.x1hl2dhg.xggy1nq.x1a2a7pz.x1sur9pj.xkrqix3.xi81zsa')

        # If element is located, move browser to element
        ActionChains(driver).move_to_element(date_link).perform()

        # Wait a second for html to update url
        time.sleep(1)

        return True
    # Catch Exception instead of using a bare ``except`` so that
    # KeyboardInterrupt / SystemExit are not silently turned into "no shared post".
    except Exception:
        return False

## Run scraping process

In [12]:
# Set up headless execution
chrome_options = Options()
chrome_options.add_argument("--headless=new")

# Initialize chromedriver and maximize window.
# The chromedriver location is handed over through a Service object: the
# `executable_path` keyword of webdriver.Chrome was deprecated in Selenium 4.0
# and removed in Selenium 4.10, where passing it raises a TypeError.
driver = webdriver.Chrome(service = Service(CHROMEDRIVER_PATH), options = chrome_options)
driver.maximize_window()

# Load Facebook profile url and close sign-up popup
driver.get(URL)
time.sleep(1)
driver.find_element(By.CSS_SELECTOR,"div.x92rtbv.x10l6tqk.x1tk7jg1.x1vjfegm").click()
time.sleep(1)

# Hide sign-up banner at the bottom of the website
driver.execute_script('document.querySelector("div.x78zum5.xdt5ytf.x2lah0s.x193iq5w.x2bj2ny.x1ey2m1c.xayqjjm.x9f619.xds687c.x1xy6bms.xn6708d.x1s14bel.x1ye3gou.xixxii4.x17qophe.x1u8a7rm").style.visibility = "hidden";')

# ------ PROCESS 1: Load all required posts ------

# CSS selector matching one loaded post container
post_selector = "div.x1yztbdb.x1n2onr6.xh8yej3.x1ja2u2z"

# Define control variables for monitoring post loading cycle
last_amount_posts = 0
frozen_counter = 0

# Go down to the bottom of the webpage until it reaches the number of posts required
while len(driver.find_elements(By.CSS_SELECTOR, post_selector)) < NUM_POSTS:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
    time.sleep(2)

    # Count the loaded posts once per iteration so the printed value and the
    # value used for the stall check are guaranteed to be the same
    amount_posts = len(driver.find_elements(By.CSS_SELECTOR, post_selector))
    print("{} posts loaded".format(amount_posts))

    # If no new posts were loaded, increase frozen counter by 1
    if amount_posts <= last_amount_posts:
        frozen_counter += 1

        # If no new posts were loaded after 3 consecutive attempts, break cycle
        if frozen_counter == 3:
            break

    # Store new amount of posts loaded and forget earlier stalls: only
    # consecutive failed attempts should end the loading cycle
    else:
        last_amount_posts = amount_posts
        frozen_counter = 0

# Return back to the top of the webpage to prepare for next process
driver.execute_script("window.scrollTo(0, 0)")

# ------ PROCESS 2: Expand text from all the posts ------
# Store object with the css selector identifier for "show more" button
str_loadmore_button= "div.x1i10hfl.xjbqb8w.x1ejq31n.xd10rxx.x1sy0etr.x17r0tee.x972fbf.xcfux6l.x1qhh985.xm0m39n.x9f619.x1ypdohk.xt0psk2.xe8uvvx.xdj266r.x11i5rnm.xat24cr.x1mh8g0r.xexx8yu.x4uap5.x18d9i69.xkhd6sd.x16tdsg8.x1hl2dhg.xggy1nq.x1a2a7pz.x1sur9pj.xkrqix3.xzsf02u.x1s688f"

# Iterate for all posts in website
for i in range(len(driver.find_elements(By.CSS_SELECTOR, "div.x1yztbdb.x1n2onr6.xh8yej3.x1ja2u2z"))):

    # Load post as a selenium element (the post list is queried again on every iteration)
    actual_post = driver.find_elements(By.CSS_SELECTOR, "div.x1yztbdb.x1n2onr6.xh8yej3.x1ja2u2z")[i]

    # Look for all "show more" buttons
    list_loadmore_button = actual_post.find_elements(By.CSS_SELECTOR, str_loadmore_button)
    print("post {0} has {1} load buttons".format(i, len(list_loadmore_button)))

    # Only the first button found (if any) is clicked. The click goes through
    # click_expand_button so that a button that cannot be clicked is skipped
    # instead of aborting the whole scraping run.
    for button in list_loadmore_button[:1]:
        if click_expand_button(button):
            # Give the page a moment to render the expanded text
            time.sleep(1)

# ------ PROCESS 3: Load url of shared posts ------
# Positions of the posts found to embed a shared post
list_shared_url_posts = []

# ActionChains instance bound to the running chromedriver session
actions = ActionChains(driver)

# Walk through every post currently loaded in the website, by position
total_loaded_posts = len(driver.find_elements(By.CSS_SELECTOR, "div.x1yztbdb.x1n2onr6.xh8yej3.x1ja2u2z"))
for post_position in range(total_loaded_posts):

    # show_post_url returns False when the date element of a shared post could
    # not be hovered for this post; such posts are not recorded
    if not show_post_url(post_position):
        continue

    # Report the post and remember its position as holding a shared publication
    print("post {} contains shared post".format(post_position+1))
    list_shared_url_posts.append(post_position)

# ------ STRUCTURE INFORMATION WITH BEAUTIFULSOUP ------
# Parse the html currently rendered in chromedriver, then end the chromedriver session
web_soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()

# Profile name: first element following the profile <h1> heading
name_heading = web_soup.find("h1", class_="html-h1 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x1vvkbs x1heor9g x1qlqyl8 x1pd3egz x1a2a7pz")
str_name = name_heading.next_element.strip()

# Followed / followers counters are read from the same container.
# Calling the element returned by next_element relies on bs4 treating a
# called Tag as find_all(); index 1 is used for "followed", index 0 for "followers".
counters_container = web_soup.find("div", class_="x9f619 x1n2onr6 x1ja2u2z x78zum5 xdt5ytf x2lah0s x193iq5w x1cy8zhl xyamay9")
counter_tags = counters_container.next_element()
str_followed = counter_tags[1].next_element.strip()
str_followers = counter_tags[0].next_element.strip().replace("\xa0", " ").replace(".", "")

# Profile picture url, read from the <image> nested in the profile <svg>
profile_svg = web_soup.find("svg", class_="x3ajldb")
str_image_url = profile_svg.find("image")["xlink:href"]

# Collect the posts found inside the feed container, keeping at most NUM_POSTS of them
feed_container = web_soup.find("div", class_="x9f619 x1n2onr6 x1ja2u2z xeuugli x1iyjqo2 xs83m0k x1xmf6yo x1emribx x1e56ztr x1i64zmx xjl7jj x19h7ccj x65f84u")
list_all_post = feed_container.find_all("div", class_="x78zum5 x1n2onr6 xh8yej3")[:NUM_POSTS]

# LOOP THROUGH ALL POSTS TO SCRAPE INFORMATION
# Initialize empty list that will contain posts information
list_str_posts = []

# Class identifier of the span that wraps the text of a post
str_text_span_class = "x193iq5w xeuugli x13faqbe x1vvkbs xlh3980 xvmahel x1n0sxbx x1lliihq x1s928wv xhkezso x1gmr53x x1cpjm7i x1fgarty x1943h6x x4zkp8e x3x7a5m x6prxxf xvq8zen xo1l8bm xzsf02u x1yc453h"

# Iterate through all posts, keeping track of each post position
for int_post, post in enumerate(list_all_post):

    # Search the text spans of the post once and reuse the result below
    list_text_spans = post.find_all("span", class_=str_text_span_class)

    # Store text if text element is located, otherwise indicate that no text is available
    if len(list_text_spans) > 0:

        # Collect all paragraphs in post text (only the first text span is used)
        list_all_paragraphs = list_text_spans[0].find_all("div", class_="xat24cr")

        # Initialize empty string to collect all available text
        str_post = ""

        # Iterate through all found paragraphs
        for paragraph in list_all_paragraphs:

            # Get all elements from paragraphs and iterate
            proc_paragraph = paragraph.find("div").contents

            for element in proc_paragraph:

                # If element contains only string, append it directly
                if type(element) == NavigableString:
                    str_post += element

                # Otherwise, if tag element contains a link, append the element
                # found two steps after it in the parse tree
                # NOTE(review): this assumes that element is a text node — confirm
                elif len(element.find_all("a",class_="x1i10hfl")):
                    str_post += element.next_element.next_element

            str_post += "\n\n"
    else:
        str_post = "NO TEXT AVAILABLE"

    # If iterating post was identified to contain a shared post, get the url for that shared post.
    # `post` is already the int_post-th post of the feed container, so the
    # link is searched inside it instead of re-scanning the whole document.
    if int_post in list_shared_url_posts:
        str_shared_url = post.find_all('div', class_ = 'x78zum5 xdt5ytf xz62fqu x16ldp7u')[2].find("a",class_="x1i10hfl xjbqb8w x1ejq31n xd10rxx x1sy0etr x17r0tee x972fbf xcfux6l x1qhh985 xm0m39n x9f619 x1ypdohk xt0psk2 xe8uvvx xdj266r x11i5rnm xat24cr x1mh8g0r xexx8yu x4uap5 x18d9i69 xkhd6sd x16tdsg8 x1hl2dhg xggy1nq x1a2a7pz x1sur9pj xkrqix3 xi81zsa xo1l8bm")["href"]
    else:
        str_shared_url = ""

    # Look for all images, using class identifiers for single images and multiple images within a post
    list_mult_img = post.find_all("img",class_ ="xz74otr x1ey2m1c xds687c x5yr21d x10l6tqk x17qophe x13vifvy xh8yej3")
    list_sing_img = post.find_all("img",class_ ="x1ey2m1c xds687c x5yr21d x10l6tqk x17qophe x13vifvy xh8yej3 xl1xv1r")

    # Urls of all found images: multiple-image matches first, then single-image
    # matches (an empty match list simply contributes no url)
    list_url_img = [img["src"] for img in list_mult_img] + [img["src"] for img in list_sing_img]

    # Store dictionary with info scraped from post
    dict_post = {"text" : str_post, "shared_post_url" : str_shared_url, "img_url" : list_url_img}

    # Append dictionary into general list with post information
    list_str_posts.append(dict_post)

# Gather the profile fields and the collected posts (at most NUM_POSTS) in one dictionary
dict_output = {
    "name": str_name,
    "followers": str_followers,
    "followed": str_followed,
    "image_url": str_image_url,
    "recent_posts": list_str_posts[:NUM_POSTS],
}

# Serialize the dictionary as a json string indented with 4 spaces
json_object = json.dumps(dict_output, indent=4)

# Save the json string as result.json in the working directory
with open("result.json", "w") as outfile:
    outfile.write(json_object)

  


7 posts loaded
10 posts loaded
14 posts loaded
19 posts loaded
22 posts loaded
25 posts loaded
29 posts loaded
33 posts loaded
37 posts loaded
40 posts loaded
44 posts loaded
47 posts loaded
51 posts loaded
post 0 has 0 load buttons
post 1 has 1 load buttons
post 2 has 0 load buttons
post 3 has 1 load buttons
post 4 has 0 load buttons
post 5 has 0 load buttons
post 6 has 0 load buttons
post 7 has 0 load buttons
post 8 has 0 load buttons
post 9 has 0 load buttons
post 10 has 0 load buttons
post 11 has 1 load buttons
post 12 has 0 load buttons
post 13 has 0 load buttons
post 14 has 0 load buttons
post 15 has 0 load buttons
post 16 has 0 load buttons
post 17 has 0 load buttons
post 18 has 0 load buttons
post 19 has 0 load buttons
post 20 has 0 load buttons
post 21 has 0 load buttons
post 22 has 0 load buttons
post 23 has 0 load buttons
post 24 has 0 load buttons
post 25 has 0 load buttons
post 26 has 0 load buttons
post 27 has 0 load buttons
post 28 has 0 load buttons
post 29 has 0 load b