In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from facebook_scraper import get_posts

from pprint import pprint

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def crawl_all_post_from_page(page_url):
    COOKIE_PATH = "facebook.com_cookies.txt"
    PAGES_NUMBER = 10

    # Get all post from page
    post_list = []
    for post in get_posts(page_url,
                          options={"comments": True, "reactors": True, "reactions": True,
                                   "sharers": True, "comment_reactions": True,
                                   "allow_extra_requests": True, "progress": True},
                          extra_info=True, pages=PAGES_NUMBER, cookies=COOKIE_PATH):
        post_list.append(post)

    return post_list

In [3]:
def ranking_member(post_list):
    # Ranking top members have the most reacts, shares, comments and posts
    post_ranking = {}
    react_ranking = {}
    comment_ranking = {}
    share_ranking = {}

    for post in post_list:
        # Post count
        if (post_ranking.get(post['username'])):
            post_ranking[post['username']]['post_counter'] += 1
        else:
            post_ranking[post['username']] = {
                'link': post['user_url'],
                'post_counter': 1,
            }

        # Reaction count
        if (post.get('reactors')):
            for reactor in post['reactors']:
                if (react_ranking.get(reactor['name'])):
                    react_ranking[reactor['name']]['react_counter'] += 1
                else:
                    react_ranking[reactor['name']] = {
                        'link': reactor['link'],
                        'react_counter': 1,
                    }
        
        if (post.get('comments_full')):
            for comment in post['comments_full']:
                if (comment.get('comment_reactors')):
                    for comment_reactor in comment['comment_reactors']:
                        if (react_ranking.get(comment_reactor['name'])):
                            react_ranking[comment_reactor['name']]['react_counter'] += 1
                        else:
                            react_ranking[comment_reactor['name']] = {
                                'link': comment_reactor['link'],
                                'react_counter': 1,
                            }
                
                if (comment.get('replies')):
                    for reply in comment['replies']:
                        if (reply.get('comment_reactors')):
                            for comment_reactor in reply['comment_reactors']:
                                if (react_ranking.get(comment_reactor['name'])):
                                    react_ranking[comment_reactor['name']]['react_counter'] += 1
                                else:
                                    react_ranking[comment_reactor['name']] = {
                                        'link': comment_reactor['link'],
                                        'react_counter': 1,
                                    }

        # Comment count
        if (post.get('comments_full')):
            for comment in post['comments_full']:
                if (comment_ranking.get(comment['commenter_name'])):
                    comment_ranking[comment['commenter_name']]['comment_counter'] += 1
                else:
                    comment_ranking[comment['commenter_name']] = {
                        'link': comment['commenter_url'],
                        'comment_counter': 1,
                    }
                
                if (comment.get('replies')):
                    for reply in comment['replies']:
                        if (comment_ranking.get(reply['commenter_name'])):
                            comment_ranking[reply['commenter_name']]['comment_counter'] += 1
                        else:
                            comment_ranking[reply['commenter_name']] = {
                                'link': reply['commenter_url'],
                                'comment_counter': 1,
                            }
            
        # Share count
        if (post.get('sharers')):
            for sharer in post['sharers']:
                if (share_ranking.get(sharer['name'])):
                    share_ranking[sharer['name']]['share_counter'] += 1
                else:
                    share_ranking[sharer['name']] = {
                        'link': sharer['link'],
                        'share_counter': 1,
                    }

    post_ranking = sorted(post_ranking.items(), key=lambda item:item[1]['post_counter'], reverse=True)
    react_ranking = sorted(react_ranking.items(), key=lambda item:item[1]['react_counter'], reverse=True)
    comment_ranking = sorted(comment_ranking.items(), key=lambda item:item[1]['comment_counter'], reverse=True)
    share_ranking = sorted(share_ranking.items(), key=lambda item:item[1]['share_counter'], reverse=True)

    top = int(input('\nHow many result do you want to show? Top: '))

    print('\nPost ranking:')
    pprint(post_ranking[:top])
    print('\nReact ranking:')
    pprint(react_ranking[:top])
    print('\nComment ranking:')
    pprint(comment_ranking[:top])
    print('\nShare ranking:')
    pprint(share_ranking[:top])

    return True

In [4]:
def ranking_comments(post_list, post_text_filter=''):
    # Ranking top comments have the most reacts, replies
    result = []

    for post in post_list:
        if (post_text_filter.lower() in post['post_text'].lower()):
            most_reacted_comments = []
            most_replied_comments = []
            
            max_react = 0
            max_reply = 0

            if (post.get('comments_full')):
                for comment in post['comments_full']:
                    # Count comment's reaction
                    if (comment.get('comment_reaction_count')):
                        comment_info = {
                            'comment_url': comment['comment_url'],
                            'commenter_name': comment['commenter_name'],
                            'commenter_url': comment['commenter_url'],
                            'comment_text': comment['comment_text'],
                            'comment_time': comment['comment_time'],
                            'comment_reactions': comment['comment_reactions'],
                            'comment_reaction_count': comment['comment_reaction_count']
                        }

                        if (comment['comment_reaction_count'] > max_react):
                            max_react = comment['comment_reaction_count']
                            most_reacted_comments = [comment_info]
                        
                        if (comment['comment_reaction_count'] == max_react):
                            most_reacted_comments.append(comment_info)
                    
                    # Count reply comment's reaction
                    if (comment.get('replies')):
                        for reply in comment['replies']:
                            reply_info = {
                                'comment_url': reply['comment_url'],
                                'commenter_name': reply['commenter_name'],
                                'commenter_url': reply['commenter_url'],
                                'comment_text': reply['comment_text'],
                                'comment_time': reply['comment_time'],
                                'comment_reactions': reply['comment_reactions'],
                                'comment_reaction_count': reply['comment_reaction_count']
                            }

                            if (reply.get('comment_reaction_count')):
                                if (reply['comment_reaction_count'] > max_react):
                                    max_react = reply['comment_reaction_count']
                                    most_reacted_comments = [reply_info]
                                
                                if (reply['comment_reaction_count'] == max_react):
                                    most_reacted_comments.append(reply_info)

                    # Count reply of comment
                    if (comment.get('replies')):
                        if (len(comment['replies']) > max_reply):
                            max_reply = len(comment['replies'])
                            most_replied_comments = [{
                                'comment_url': comment['comment_url'],
                                'commenter_name': comment['commenter_name'],
                                'commenter_url': comment['commenter_url'],
                                'comment_text': comment['comment_text'],
                                'comment_time': comment['comment_time'],
                                'comment_reply_count': len(comment['replies'])
                            }]

                        if (len(comment['replies']) == max_reply):
                            most_replied_comments.append({
                                'comment_url': comment['comment_url'],
                                'commenter_name': comment['commenter_name'],
                                'commenter_url': comment['commenter_url'],
                                'comment_text': comment['comment_text'],
                                'comment_time': comment['comment_time'],
                                'comment_reply_count': len(comment['replies'])
                            })
                                    
            result.append({
                'post_url': 'https://facebook.com' + post['links'][0]['link'],
                'post_text': post['post_text'],
                'username': post['username'],
                'user_url': post['user_url'],
                'post_time': post['time'],
                'most_reacted_comments': most_reacted_comments,
                'most_replied_comments': most_replied_comments,
            })
    
    pprint(result)

    return True

In [5]:
def ranking_posts(post_list):
    result = []

    for post in post_list:
        react_counter = post['reaction_count']
        comment_counter = post['comments']
        share_counter = post['shares']

        # Count reaction in comments
        for comment in post['comments_full']:
            if (comment.get('comment_reaction_count')):
                react_counter += comment['comment_reaction_count']

            # Count reaction in replies
            if (comment.get('replies')):
                for reply in comment['replies']:
                    if (reply.get('comment_reaction_count')):
                        react_counter += reply['comment_reaction_count']

        # Count reply comments
        for comment in post['comments_full']:
            if (comment.get('replies')):
                comment_counter += len(comment['replies'])

        result.append({
            'post_url': 'https://facebook.com' + post['links'][0]['link'],
            'post_text': post['post_text'],
            'username': post['username'],
            'user_url': post['user_url'],
            'post_time': post['time'],
            'react_counter': react_counter,
            'comment_counter': comment_counter,
            'share_counter': share_counter,
        })

    react_ranking = sorted(result, key=lambda item:item['react_counter'], reverse=True)
    comment_ranking = sorted(result, key=lambda item:item['comment_counter'], reverse=True)
    share_ranking = sorted(result, key=lambda item:item['share_counter'], reverse=True)

    top = int(input('\nHow many result do you want to show? Top: '))

    print('\nReact ranking:')
    pprint(react_ranking[:top])
    print('\nComment ranking:')
    pprint(comment_ranking[:top])
    print('\nShare ranking:')
    pprint(share_ranking[:top])

    return True

In [6]:
def crawl_comments_by_regex(post_url, comment_regex):
    return ''

In [7]:
def slice_host(url):
    host = 'https://www.facebook.com/'
    pos = url.find(host)
    return url[len(host):]

In [8]:
def main(post_list):
    print('''
Please choose what you want to do:
  1. Ranking top users have the most posts, reacts, comments, shares
  2. Ranking top comments have the most reacts, replies
  3. Ranking top posts have the most reacts, comments, shares
  4. Crawl all comments of a post followed by regex
    ''')

    option = int(input('Your option: '))

    if option == 1:
        ranking_member(post_list)

    if option == 2:
        post_text_filter = input('Input post\'s text to filter some specific posts (leave empty to load all): ')
        ranking_comments(post_list, post_text_filter)

    if option == 3:
        ranking_posts(post_list)

    if option == 4:
        post_url = input('URL of post: ')
        comment_regex = input(
            'Describe the comment\'s regex you want to crawl: ')
        crawl_comments_by_regex(post_url, comment_regex)

In [9]:
print('\nWelcome to Facebook Crawler App!\n')

page_url = input('URL of your page: ')

print('\nCrawling all post data at page: ', page_url)
print('This action may take a while...')

post_list = crawl_all_post_from_page(slice_host(page_url))

print('\nCrawl all data completed!')


Welcome to Facebook Crawler App!


Crawling all post data at page:  https://www.facebook.com/mxnhat1610
This action may take a while...


  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1000000000.0 [00:00<?, ?it/s]
  0%|          | 0/1


Crawl all data completed!


In [11]:
while True:
    main(post_list)

    endApp = input('\nContinue using app? (Y/N) ')
    if endApp == 'N' or endApp == 'n':
        break


Please choose what you want to do:
  1. Ranking top users have the most posts, reacts, comments, shares
  2. Ranking top comments have the most reacts, replies
  3. Ranking top posts have the most reacts, comments, shares
  4. Crawl all comments of a post followed by regex
    

Post ranking:
[('Xuân Nhật',
  {'link': 'https://facebook.com/mxnhat1610?lst=100004843053382%3A100007582690558%3A1670349317&eav=AfYycciFmARqCoUo8uR6ogH_0pnVKi0qhNqDSgLSWI2zkhsItGLtKpg9qKkPNFYMMsA&refid=17&_ft_=encrypted_tracking_data.0AY8RmjkKpM_1y9Xglll6Jr5yuhUcLyoGB77r70fe7ltTLZBM2dUBShArOOFaYYkYfG8M2WNBHjIYUCg7zuxoxIVwqBr6yDm5frcUyySNXBYtCFLNUOXScw4HzWOlgadbWasRCFIDvnyar56zD7r_pzqEBBUd-1llxKGCuzmt8KxpgfHJ5TeIZoTT1lAC285B_tp8EV97seZrI3AX86czidwvZKyq9SUG2uo1d_P_6pQ85MglWhZNQdwwOMHolPa-6xvpvRcGgrVHVBjKhVLKCqnMBHUhZaJsGmVTqkgVMYihMPiY5YfHS4Ar8pgcJ2Wl2cW_9mpI-sUV6CdPQVNUanGGocCY0KnG7KCr-wHV4WdwDvwfDk4gE4gDTEkI2yGZSqoaFgUMReClpuD4O6SU_64MT5v9f781AAROiIbLUlhPQIbzgxxsYnJ16fpt7mZdWpUFrbMYxbMxv2m0o5c3mb0E9-f80prVMu-D

ValueError: invalid literal for int() with base 10: ''