In [1]:
import urllib.request
import json
import datetime
import csv
import time

In [40]:
# Facebook Graph API app credentials and the page whose comments get scraped.
app_id = "..."
app_secret = "..."  # DO NOT SHARE WITH ANYONE!
page_id = "atleticopr"

# The token sent with every request is the app id and the app secret
# joined by a pipe character.
access_token = "|".join((app_id, app_secret))

In [3]:
def request_until_succeed(url, retry_delay=5):
    """Fetch `url`, retrying until the server answers with HTTP 200.

    Parameters
    ----------
    url : str
        Address to request.
    retry_delay : int or float, optional
        Seconds to wait before every retry (default 5).

    Returns
    -------
    str or None
        The response body decoded as UTF-8, or None when the request fails
        with HTTP status 400. A 400 is treated as permanent, so the function
        gives up immediately instead of sleeping and retrying.

    Notes
    -----
    Any other exception raised while opening or reading the response is
    printed and the request is retried indefinitely. A response whose status
    is not 200 is also retried, after the same delay.
    """
    req = urllib.request.Request(url)

    while True:
        body = None
        try:
            # The context manager closes the connection even when reading fails.
            with urllib.request.urlopen(req) as response:
                if response.getcode() == 200:
                    body = response.read()
        except Exception as e:
            print(e)
            print("Error for URL %s: %s" % (url, datetime.datetime.now()))

            # HTTPError exposes the status as `.code`; checking it is more
            # precise than searching for '400' in the error message.
            if getattr(e, 'code', None) == 400:
                return None

        if body is not None:
            # Decode outside the try block so a malformed payload surfaces as
            # an error instead of triggering endless retries.
            return body.decode('utf-8')

        print("Retrying.")
        time.sleep(retry_delay)

In [52]:
# Needed to write tricky unicode correctly to csv
from unidecode import unidecode

def unicode_normalize(text):
    """Replace typographic punctuation with ASCII, then run `unidecode`.

    Curly single quotes (U+2018, U+2019) become an apostrophe, curly double
    quotes (U+201C, U+201D) become a straight double quote, and the
    non-breaking space (U+00A0) becomes a regular space. The translated text
    is then passed to `unidecode`.

    Parameters
    ----------
    text : str
        Text to normalize.

    Returns
    -------
    Whatever `unidecode` returns for the translated text.
    NOTE(review): presumably an ASCII-only str -- confirm against the
    unidecode documentation.
    """
    # str.translate returns a new string; the result has to be kept, otherwise
    # the mapping below has no effect at all.
    ascii_punctuation = text.translate({
        0x2018: 0x27, 0x2019: 0x27,   # curly single quotes -> '
        0x201C: 0x22, 0x201D: 0x22,   # curly double quotes -> "
        0xa0: 0x20,                   # non-breaking space -> space
    })

    return unidecode(ascii_punctuation)


In [43]:
def getFacebookCommentFeedData(status_id, access_token, num_comments):
    """Request one page of comments for `status_id` from the Graph API.

    Parameters
    ----------
    status_id
        Id of the object whose `/comments` edge is queried.
    access_token
        Token appended to the query string.
    num_comments
        Value sent as the `limit` query parameter.

    Returns
    -------
    The JSON-decoded response, or None when `request_until_succeed`
    returns None.
    """
    # Endpoint, requested fields and query options, in request order.
    url_parts = (
        "https://graph.facebook.com/v2.6",
        "/%s/comments" % status_id,
        "?fields=id,message,like_count,created_time,comments,from,attachment",
        "&order=chronological&limit=%s&access_token=%s" % \
            (num_comments, access_token),
    )
    raw_response = request_until_succeed("".join(url_parts))

    # Nothing to parse when the request helper gave up.
    if raw_response is None:
        return None
    return json.loads(raw_response)

In [53]:
def processFacebookComment(comment, status_id, parent_id = ''):
    """Flatten one comment dictionary into a tuple ready for `csv.writer`.

    Parameters
    ----------
    comment : dict
        Must contain 'id' and 'created_time' (formatted as
        '%Y-%m-%dT%H:%M:%S+0000'). 'message', 'from', 'like_count' and
        'attachment' are optional.
    status_id
        Id of the status the comment belongs to; passed through unchanged.
    parent_id
        Id of the parent comment for replies, '' for top-level comments;
        passed through unchanged.

    Returns
    -------
    tuple
        (comment_id, status_id, parent_id, comment_message, comment_author,
        comment_published, comment_likes)

    Raises
    ------
    KeyError
        If 'id' or 'created_time' is missing, or an attachment has no 'type'.
    ValueError
        If 'created_time' does not match the expected format.
    """
    comment_id = comment['id']

    # Optional fields fall back to neutral values instead of raising.
    comment_message = '' if 'message' not in comment else \
            unicode_normalize(comment['message'])
    author_name = (comment.get('from') or {}).get('name')
    comment_author = '' if author_name is None else \
            unicode_normalize(author_name)
    comment_likes = comment.get('like_count', 0)

    if 'attachment' in comment:
        attach_tag = "[[%s]]" % comment['attachment']['type'].upper()
        # Keep the written text and append the tag; a comment that consists
        # only of an attachment is represented by the tag alone.
        if comment_message == '':
            comment_message = attach_tag
        else:
            comment_message = comment_message + " " + attach_tag

    # created_time is parsed as UTC, shifted by a fixed +1 hour (labelled
    # "Paris" by the original author; no daylight-saving adjustment is made)
    # and rendered in a layout spreadsheet programs parse easily.
    comment_published = datetime.datetime.strptime(
            comment['created_time'], '%Y-%m-%dT%H:%M:%S+0000')
    comment_published = comment_published + datetime.timedelta(hours=+1)
    comment_published = comment_published.strftime('%Y-%m-%d %H:%M:%S')

    return (comment_id, status_id, parent_id, comment_message, comment_author,
            comment_published, comment_likes)

In [57]:
def _reportCommentProgress(num_processed):
    """Print a timestamped progress line when the count is a multiple of 1000."""
    if num_processed % 1000 == 0:
        print("%s Comments Processed: %s" %
              (num_processed, datetime.datetime.now()))


def _iterFacebookCommentPages(first_page):
    """Yield `first_page` and every page reachable through paging['next'].

    Iteration ends when a page is None (the request helper gave up), when a
    page carries no 'next' link, or when fetching the next link returns None.
    """
    page = first_page
    while page is not None:
        yield page
        next_url = page.get('paging', {}).get('next')
        if next_url is None:
            return
        raw_page = request_until_succeed(next_url)
        page = None if raw_page is None else json.loads(raw_page)


def scrapeFacebookPageFeedComments(page_id, access_token,
                                   data_dir='/home/teresas/csv_files/facebook'):
    """Write every comment and reply of a page's statuses to a CSV file.

    Reads the 'status_id' column of `<data_dir>/<page_id>_facebook_statuses.csv`
    and, for each status, fetches its comments (and the replies of every
    comment that has a 'comments' entry), writing one row per comment to
    `<data_dir>/<page_id>_facebook_comments.csv`. Progress is printed every
    1000 processed comments and a summary is printed at the end.

    Parameters
    ----------
    page_id : str
        Prefix of the input and output file names.
    access_token : str
        Token forwarded to `getFacebookCommentFeedData`.
    data_dir : str, optional
        Directory holding both CSV files. Defaults to the location that was
        previously hard-coded.

    Notes
    -----
    A comment feed that cannot be fetched (the helpers return None) is
    skipped instead of aborting the whole scrape.
    """
    comments_path = '%s/%s_facebook_comments.csv' % (data_dir, page_id)
    statuses_path = '%s/%s_facebook_statuses.csv' % (data_dir, page_id)

    with open(comments_path, 'w', newline='', encoding='utf-8') as file:
        w = csv.writer(file)
        w.writerow(["comment_id", "status_id", "parent_id", "comment_message",
                    "comment_author", "comment_published", "comment_likes"])

        num_processed = 0   # running total of comments and replies written
        scrape_starttime = datetime.datetime.now()

        print("Scraping %s Comments From Posts: %s\n" % \
                (page_id, scrape_starttime))

        with open(statuses_path, 'r', encoding='utf-8') as csvfile:
            for status in csv.DictReader(csvfile):
                status_id = status['status_id']
                first_page = getFacebookCommentFeedData(
                        status_id, access_token, 100)

                for page in _iterFacebookCommentPages(first_page):
                    for comment in page.get('data', []):
                        w.writerow(processFacebookComment(comment, status_id))

                        # A 'comments' entry signals replies; they are fetched
                        # through the comment's own feed.
                        if 'comments' in comment:
                            first_reply_page = getFacebookCommentFeedData(
                                    comment['id'], access_token, 100)

                            for reply_page in _iterFacebookCommentPages(
                                    first_reply_page):
                                for subcomment in reply_page.get('data', []):
                                    w.writerow(processFacebookComment(
                                            subcomment, status_id,
                                            comment['id']))
                                    num_processed += 1
                                    _reportCommentProgress(num_processed)

                        # Replies are counted first, then the comment itself.
                        num_processed += 1
                        _reportCommentProgress(num_processed)

        print("\nDone!\n%s Comments Processed in %s" %
              (num_processed, datetime.datetime.now() - scrape_starttime))

# Start the scrape only when this code runs as the main program, using the
# page id and access token defined earlier in the notebook.
if __name__ == '__main__':
    scrapeFacebookPageFeedComments(page_id=page_id,
                                   access_token=access_token)

Scraping atleticopr Comments From Posts: 2017-03-02 12:58:41.807450

1000 Comments Processed: 2017-03-02 12:58:52.641286
2000 Comments Processed: 2017-03-02 12:59:00.306170
3000 Comments Processed: 2017-03-02 12:59:07.086626
4000 Comments Processed: 2017-03-02 12:59:13.520392
5000 Comments Processed: 2017-03-02 12:59:21.111422
6000 Comments Processed: 2017-03-02 12:59:29.474618
7000 Comments Processed: 2017-03-02 12:59:38.932316
8000 Comments Processed: 2017-03-02 12:59:47.197967
9000 Comments Processed: 2017-03-02 12:59:53.654167
10000 Comments Processed: 2017-03-02 13:00:02.102691
11000 Comments Processed: 2017-03-02 13:00:09.546576
12000 Comments Processed: 2017-03-02 13:00:16.168362
13000 Comments Processed: 2017-03-02 13:00:26.602888
14000 Comments Processed: 2017-03-02 13:00:32.842055
15000 Comments Processed: 2017-03-02 13:00:38.847703
16000 Comments Processed: 2017-03-02 13:00:45.566140
17000 Comments Processed: 2017-03-02 13:00:52.791828
18000 Comments Processed: 2017-03-02 13

In [62]:
import pandas as pd
# Let text columns show up to 100 characters before pandas truncates them.
pd.options.display.max_colwidth = 100

# Load the comments CSV produced by the scraper.
df_comm = pd.read_csv(
    '/home/teresas/csv_files/facebook/atleticopr_facebook_comments.csv'
)

In [63]:
# Preview only the first rows: the frame holds every scraped comment together
# with the commenters' names, so the whole table should not end up in the
# saved notebook output.
df_comm.head(10)

Unnamed: 0,comment_id,status_id,parent_id,comment_message,comment_author,comment_published,comment_likes
0,1413061688715813_1413063952048920,181724431849551_1413061688715813,,"Mesmo com o sub23 nao e motivo pra nao massacrar as paquitas, no minimo um 3x0.",Fagner Henrike Santos,2017-02-28 16:37:12,4
1,1413061688715813_1413065428715439,181724431849551_1413061688715813,,DALHE FURACAO PARA CIMA DOS COXINHAS.,Nilton Roberto Antunes,2017-02-28 16:39:35,0
2,1413061688715813_1413065528715429,181724431849551_1413061688715813,,Andre Uruguay,Felipe Rafael Andrade,2017-02-28 16:39:49,1
3,1413061688715813_1413066102048705,181724431849551_1413061688715813,,Gabriel Paes,Thais Coser,2017-02-28 16:40:36,0
4,1413061688715813_1413066562048659,181724431849551_1413061688715813,,Luiz Henrique os malaaas,Matheus Olivieri,2017-02-28 16:41:45,0
5,1413061688715813_1413067188715263,181724431849551_1413061688715813,,All aboard!!!,Joao Roque,2017-02-28 16:43:18,1
6,1413061688715813_1413073652047950,181724431849551_1413061688715813,1413061688715813_1413067188715263,Golpista!!!,Romualdo Ferreira,2017-02-28 16:54:44,1
7,1413061688715813_1413069202048395,181724431849551_1413061688715813,,Boa Atletico!,Julio Jiordano De Melo,2017-02-28 16:47:03,0
8,1413061688715813_1413069268715055,181724431849551_1413061688715813,,Estarei ligado como sempre diretamente aqui da Italia! Inicio 00:00 Atletico 2-0 Coritiba,Bruno De Souza,2017-02-28 16:47:14,98
9,1413061688715813_1413077855380863,181724431849551_1413061688715813,1413061688715813_1413069268715055,"Ai irmao, olha o comentario sem nocao ai de baixo kkkk",Adilson Moreira,2017-02-28 17:02:31,2
