In [1]:
import requests
import re
import json
from bs4 import BeautifulSoup
import time
import datetime
import pandas as pd
import numpy as np
import urllib


In [2]:
def get_pageid(pageurl):
    resp = requests.get(pageurl)
    pageid = re.findall('page_id=(.*?)"',resp.text)[0]
    return pageid



In [3]:
def parse_content(data):
    df = []
    soup = BeautifulSoup(data['domops'][0][3]['__html'], 'lxml')
    # post
    for ele in soup.findAll('div', {'class':'userContentWrapper'}):
        try:
            df.append([
                ele.find('img')['aria-label'], #name
                ele.find('div', {'data-testid':'story-subtitle'})['id'], # ID
                ele.find('abbr')['data-utime'], # time
                ''.join([i.text for i in ele.find('div', {'data-testid':'post_message'}).findAll('p')]), # content
                ele.find('a')['href'].split('?')[0] # link
                    ])
        except:
            pass  
    df = pd.DataFrame(data=df, columns=['NAME', 'ID', 'TIME', 'CONTENT', 'LINK'])
    df['PAGEID'] = df['ID'].apply(lambda x: re.split(r'_|;',x)[2])
    df['POSTID'] = df['ID'].apply(lambda x: re.split(r'_|;',x)[3])
    df['TIME'] = df['TIME'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
    df = df.drop('ID',axis=1)
    return df



In [4]:
def get_reaction(data):
    df = []
    # posts
    for ele in data['jsmods']['pre_display_requires']:
        try:
            df.append([
                ele[3][1]['__bbox']['variables']['storyID'], # storyID
                ele[3][1]['__bbox']['result']['data']['feedback']['display_comments_count']['count'],  # display_comments_count
                ele[3][1]['__bbox']['result']['data']['feedback']['comment_count']['total_count'], # total_comments_count
                ele[3][1]['__bbox']['result']['data']['feedback']['reaction_count']['count'], # reaction_count
                ele[3][1]['__bbox']['result']['data']['feedback']['share_count']['count'], # share_count
                ele[3][1]['__bbox']['result']['data']['feedback']['top_reactions']['edges'], # reactions
            ])
        except:
            pass
    # vidoes
    for ele in data['jsmods']['require']:
        try:
            df.append([
                'S:_I'+ele[3][2]['feedbacktarget']['actorid']+':'+ele[3][2]['feedbacktarget']['targetfbid'], # storyID
                ele[3][2]['feedbacktarget']['commentcount'], # display_comments_count
                ele[3][2]['feedbacktarget']['commentcount'], # total_comments_count
                ele[3][2]['feedbacktarget']['likecount'], # likecount
                ele[3][2]['feedbacktarget']['sharecount'], # sharecount
                [] # reactions
            ])
        except:
            pass
    df = pd.DataFrame(df, columns=['storyID','display_comments_count', 'total_comments_count', 'reaction_count', 'share_count', 'reactions'])
    df['storyID'] = df['storyID'].apply(lambda x: re.sub('S:_I', '',x))
    df['PAGEID'] = df['storyID'].apply(lambda x: re.split(r':',x)[0])
    df['POSTID'] = df['storyID'].apply(lambda x: re.split(r':',x)[1])
    # 
    def get_reactions(reactname, reactions):
        for react in reactions:
            if reactname in str(react):
                return react['reaction_count']
        return 0
    df['LIKE'] = df['reactions'].apply(lambda x: get_reactions('LIKE', x))
    df['LOVE'] = df['reactions'].apply(lambda x: get_reactions('LOVE', x))
    df['HAHA'] = df['reactions'].apply(lambda x: get_reactions('HAHA', x))
    df['SUPPORT'] = df['reactions'].apply(lambda x: get_reactions('SUPPORT', x))
    df['WOW'] = df['reactions'].apply(lambda x: get_reactions('WOW', x))
    df['ANGER'] = df['reactions'].apply(lambda x: get_reactions('ANGER', x))
    df['SORRY'] = df['reactions'].apply(lambda x: get_reactions('SORRY', x))
    # for vidoe's tpye post
    df['LIKE'] = np.select(condlist = [df['reactions'].apply(lambda x: len(x)==0)], 
                           choicelist=[df['reaction_count']], 
                           default=df['LIKE'])
    return df



In [5]:
def Crawl_PagePosts(pageurl, until_date='2019-01-01'):
    pageid = get_pageid(pageurl) 
    content_df = [] # post
    feedback_df = [] # reactions
    timeline_cursor = ''
    max_date =  datetime.datetime.now()
    break_times = 0
    # request date and break loop when reach the goal 
    while max_date >= datetime.datetime.strptime(until_date, '%Y-%m-%d'):
        # request params
        url = 'https://www.facebook.com/pages_reaction_units/more/'
        params = {'page_id': pageid,
                  'cursor': str({"timeline_cursor":timeline_cursor,
                                 "timeline_section_cursor":'{}',
                                 "has_next_page":'true'}), 
                  'surface': 'www_pages_home',
                  'unit_count': 20,
                  '__a': '1'}
        try:
            resp = requests.get(url, params=params)
            data = json.loads(re.sub(r'for \(;;\);','',resp.text))
            # contesnts：poster's name, poster's ID, post ID, time, content
            ndf = parse_content(data=data)
            content_df.append(ndf)
            # reactions
            ndf1 = get_reaction(data=data)
            feedback_df.append(ndf1)
            # update request params
            max_date = ndf['TIME'].max()
            print('TimeStamp: {}.'.format(ndf['TIME'].max()))
            timeline_cursor = re.findall(r'timeline_cursor\\u002522\\u00253A\\u002522(.*?)\\u002522\\u00252C\\u002522timeline_section_cursor',resp.text)[0]
            # break times to zero
            break_times = 0
        except:
            break_times += 1
            print('break_times:', break_times)
        time.sleep(2)
        if break_times > 10:
            break
    # join content and reactions
    content_df = pd.concat(content_df, ignore_index=True)
    feedback_df = pd.concat(feedback_df, ignore_index=True)
    df = pd.merge(left=content_df, right=feedback_df, how='left', on=['PAGEID', 'POSTID'])
    df = df.loc[:,['NAME', 'TIME', 'CONTENT', 'PAGEID', 'POSTID', 'display_comments_count', 'total_comments_count', 'reaction_count', 'share_count', 'LIKE', 'LOVE', 'HAHA', 'SUPPORT', 'WOW', 'ANGER', 'SORRY']]
    df = df.rename(columns={'display_comments_count':'DISPLAYCOMMENTS', 'total_comments_count':'TOTAL_COMMENTS', 'reaction_count':'REACTIONS','share_count':'SHARES'})
    df['UPDATETIME'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")      
    print('There are {} posts in DataFrame.'.format(str(df.shape[0])))
    return df



In [6]:
pageurl= 'https://www.facebook.com/diudiu333'
facebook_crawler.Crawl_PagePosts(pageurl=pageurl, until_date='2021-01-01')

NameError: name 'facebook_crawler' is not defined

In [7]:
pageurl= 'https://www.facebook.com/diudiu333'
Crawl_PagePosts(pageurl=pageurl, until_date='2021-01-01')

break_times: 1
break_times: 2
break_times: 3
break_times: 4
break_times: 5
break_times: 6
break_times: 7
break_times: 8
break_times: 9
break_times: 10
break_times: 11


ValueError: No objects to concatenate

In [8]:
pageurl= 'https://www.facebook.com/diudiu333'
Crawl_PagePosts(pageurl=pageurl, until_date='2021-01-01')

break_times: 1
break_times: 2
break_times: 3
break_times: 4
break_times: 5


KeyboardInterrupt: 

In [9]:
pageid = get_pageid(pageurl) 

In [10]:
pageid

'1723714034327589'

In [11]:
    content_df = [] # post
    feedback_df = [] # reactions
    timeline_cursor = ''
    max_date =  datetime.datetime.now()
    break_times = 0

In [12]:
url = 'https://www.facebook.com/pages_reaction_units/more/'

In [13]:
params = {'page_id': pageid,
                  'cursor': str({"timeline_cursor":timeline_cursor,
                                 "timeline_section_cursor":'{}',
                                 "has_next_page":'true'}), 
                  'surface': 'www_pages_home',
                  'unit_count': 20,
                  '__a': '1'}

In [14]:
resp = requests.get(url, params=params)

In [15]:
data = json.loads(re.sub(r'for \(;;\);','',resp.text))

In [16]:
data

,
       'DGWRequestStreamClient',
       'MqttLongPollingRunner',
       'KeyframesRenderer',
       'KeyframesEnvironment',
       'FBKeyframesLoggedSession',
       'KeyframesAssetDecoder',
       'Animation'],
      'r': ['KLBGjev',
       'uYlxyUe',
       'QS4c2DS',
       'wbC2fzj',
       'n5dQXA3',
       'jfBnfiP',
       'Ii4JCVd',
       'KQ+loWk',
       '/hd+9hh']},
     'be': 1},
    'FeedStoryUFICommentActorLinkBadges.react': {'r': ['Ry1XBlB',
      'aaqwwwI',
      'K59t8sA',
      '7cWmW3B',
      'Ox/JKrB',
      'cK+qKNE',
      '1IdIMmu',
      'SluwQx/',
      'KUDPaun',
      'hKY0QKT',
      'KufrNF/',
      'Ik6Ht2B'],
     'rds': {'m': ['ContextualConfig',
       'BladeRunnerClient',
       'SkywalkerUtils',
       'FleetBeaconSubscriptionLauncher',
       'BanzaiScuba_DEPRECATED',
       'DGWRequestStreamClient',
       'MqttLongPollingRunner'],
      'r': ['wbC2fzj', 'QS4c2DS', 'n5dQXA3']},
     'be': 1},
    'FeedStoryUFICommentBody.react': {'r': ['Ry1XBlB'

In [17]:
ndf = parse_content(data=data)

IndexError: list index out of range

In [18]:
ndf

NameError: name 'ndf' is not defined

In [19]:
df = []

In [20]:
soup = BeautifulSoup(data['domops'][0][3]['__html'], 'lxml')

In [21]:
for ele in soup.findAll('div', {'class':'userContentWrapper'}):
        try:
            df.append([
                ele.find('img')['aria-label'], #name
                ele.find('div', {'data-testid':'story-subtitle'})['id'], # ID
                ele.find('abbr')['data-utime'], # time
                ''.join([i.text for i in ele.find('div', {'data-testid':'post_message'}).findAll('p')]), # content
                ele.find('a')['href'].split('?')[0] # link
                    ])
        except:
            pass  

In [22]:
df = pd.DataFrame(data=df, columns=['NAME', 'ID', 'TIME', 'CONTENT', 'LINK'])

In [23]:
df

Unnamed: 0,NAME,ID,TIME,CONTENT,LINK
0,丟丟妹,feed_subtitle_1723714034327589;788936008456629;;9,1623414472,今天丟丟砸重本 送很大 來根丟丟補貨囉 💎💎💎💎💎💎 任何問題請直接來電：022266003...,https://www.facebook.com/diudiu333/
1,丟丟妹,feed_subtitle_1723714034327589;211523144148343;;9,1623241746,防疫期間 丟丟想說 聽我說 ❤️❤️❤️❤️❤️❤️ 有任何問題請直接來電：0222660...,https://www.facebook.com/diudiu333/
2,曉鈞兒,feed_subtitle_1723714034327589:-81169956646116...,1621682425,今日晚餐 #蕃茄炒蛋 #蒜炒小白菜 #紅蘿蔔烘蛋 #蜜汁佐肉⋯⋯ #酥烤雞腿排 #鈞兒廚房...,https://www.facebook.com/people/%E6%9B%89%E9%8...
3,丟丟妹,feed_subtitle_1723714034327589;430086242327939...,1620830988,【暫停自取服務公告】 為配合政府防疫措施，自5/13起，暫停自取服務， 造成您的不便，懇請見...,https://www.facebook.com/diudiu333/
4,丟丟妹,feed_subtitle_1723714034327589;428116058524957...,1620286461,你身上有她💋的香水味... 是我鼻子👃犯的罪... 王董是否能化險為夷呢⁉😱,https://www.facebook.com/diudiu333/
5,丟丟妹,feed_subtitle_1723714034327589;427780081891888...,1620195257,不會因為他人 抹滅我爸爸他們的辛苦 不會因為我是鄉下討海人的孩子感到自卑 因為我有這樣的爸...,https://www.facebook.com/diudiu333/
6,丟丟妹,feed_subtitle_1723714034327589;276119734236443;;9,1619615963,今天董事長換人當 ， 神秘嘉賓 丟丟讓你猜猜看❤️ 😎😎😎😎😎😎😎😎😎 有任何問題請直接來電...,https://www.facebook.com/diudiu333/
7,丟丟妹,feed_subtitle_1723714034327589;423920332944530...,1619088394,☆☆☆Mercedes-Benz A180 銷售所得全數捐贈☆☆☆ 【說到做到！】信守承諾是...,https://www.facebook.com/diudiu333/
8,丟丟妹,feed_subtitle_1723714034327589;422635474406349...,1618722437,丟妹：你敢說 丟妹敢做 不要說送東西 你們要叫我們幹甚麼事 我都做得出來... 謝董：三萬次...,https://www.facebook.com/diudiu333/
9,丟丟妹,feed_subtitle_1723714034327589;422152334121330...,1618578842,給我的攻擊越大 就越強壯 誰叫我是 丟丟❤️ 給你們 傷眼一下🤨 有任何疑難雜症可以撥打02...,https://www.facebook.com/diudiu333/


In [24]:
df['ID'].apply(lambda x: re.split(r'_|;',x)[2])

0                          1723714034327589
1                          1723714034327589
2     1723714034327589:-8116995664611601863
3                          1723714034327589
4                          1723714034327589
5                          1723714034327589
6                          1723714034327589
7                          1723714034327589
8                          1723714034327589
9                          1723714034327589
10                         1723714034327589
11                         1723714034327589
12                         1723714034327589
13                         1723714034327589
14                         1723714034327589
15                         1723714034327589
16                         1723714034327589
17                         1723714034327589
18                         1723714034327589
Name: ID, dtype: object

In [25]:
df['ID'][1]

'feed_subtitle_1723714034327589;211523144148343;;9'

In [26]:
df['ID'][2]

'feed_subtitle_1723714034327589:-8116995664611601863'

In [27]:
df['ID'][3]

'feed_subtitle_1723714034327589;4300862423279391;;9'

In [28]:
df['PAGEID'] = df['ID'].apply(lambda x: re.split(r'_|;|-',x)[2])

In [29]:
df['POSTID'] = df['ID'].apply(lambda x: re.split(r'_|;|-',x)[3])

In [30]:
df

Unnamed: 0,NAME,ID,TIME,CONTENT,LINK,PAGEID,POSTID
0,丟丟妹,feed_subtitle_1723714034327589;788936008456629;;9,1623414472,今天丟丟砸重本 送很大 來根丟丟補貨囉 💎💎💎💎💎💎 任何問題請直接來電：022266003...,https://www.facebook.com/diudiu333/,1723714034327589,788936008456629
1,丟丟妹,feed_subtitle_1723714034327589;211523144148343;;9,1623241746,防疫期間 丟丟想說 聽我說 ❤️❤️❤️❤️❤️❤️ 有任何問題請直接來電：0222660...,https://www.facebook.com/diudiu333/,1723714034327589,211523144148343
2,曉鈞兒,feed_subtitle_1723714034327589:-81169956646116...,1621682425,今日晚餐 #蕃茄炒蛋 #蒜炒小白菜 #紅蘿蔔烘蛋 #蜜汁佐肉⋯⋯ #酥烤雞腿排 #鈞兒廚房...,https://www.facebook.com/people/%E6%9B%89%E9%8...,1723714034327589:,8116995664611601863
3,丟丟妹,feed_subtitle_1723714034327589;430086242327939...,1620830988,【暫停自取服務公告】 為配合政府防疫措施，自5/13起，暫停自取服務， 造成您的不便，懇請見...,https://www.facebook.com/diudiu333/,1723714034327589,4300862423279391
4,丟丟妹,feed_subtitle_1723714034327589;428116058524957...,1620286461,你身上有她💋的香水味... 是我鼻子👃犯的罪... 王董是否能化險為夷呢⁉😱,https://www.facebook.com/diudiu333/,1723714034327589,4281160585249575
5,丟丟妹,feed_subtitle_1723714034327589;427780081891888...,1620195257,不會因為他人 抹滅我爸爸他們的辛苦 不會因為我是鄉下討海人的孩子感到自卑 因為我有這樣的爸...,https://www.facebook.com/diudiu333/,1723714034327589,4277800818918885
6,丟丟妹,feed_subtitle_1723714034327589;276119734236443;;9,1619615963,今天董事長換人當 ， 神秘嘉賓 丟丟讓你猜猜看❤️ 😎😎😎😎😎😎😎😎😎 有任何問題請直接來電...,https://www.facebook.com/diudiu333/,1723714034327589,276119734236443
7,丟丟妹,feed_subtitle_1723714034327589;423920332944530...,1619088394,☆☆☆Mercedes-Benz A180 銷售所得全數捐贈☆☆☆ 【說到做到！】信守承諾是...,https://www.facebook.com/diudiu333/,1723714034327589,4239203329445301
8,丟丟妹,feed_subtitle_1723714034327589;422635474406349...,1618722437,丟妹：你敢說 丟妹敢做 不要說送東西 你們要叫我們幹甚麼事 我都做得出來... 謝董：三萬次...,https://www.facebook.com/diudiu333/,1723714034327589,4226354744063493
9,丟丟妹,feed_subtitle_1723714034327589;422152334121330...,1618578842,給我的攻擊越大 就越強壯 誰叫我是 丟丟❤️ 給你們 傷眼一下🤨 有任何疑難雜症可以撥打02...,https://www.facebook.com/diudiu333/,1723714034327589,4221523341213300


In [31]:
df['PAGEID'] = df['ID'].apply(lambda x: re.split(r'_|;|-|:',x)[2])

In [32]:
df['POSTID'] = df['ID'].apply(lambda x: re.split(r'_|;|-|:',x)[3])

In [33]:
df['TIME'] = df['TIME'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)))

In [34]:
df

Unnamed: 0,NAME,ID,TIME,CONTENT,LINK,PAGEID,POSTID
0,丟丟妹,feed_subtitle_1723714034327589;788936008456629;;9,2021-06-11 20:27:52,今天丟丟砸重本 送很大 來根丟丟補貨囉 💎💎💎💎💎💎 任何問題請直接來電：022266003...,https://www.facebook.com/diudiu333/,1723714034327589,788936008456629.0
1,丟丟妹,feed_subtitle_1723714034327589;211523144148343;;9,2021-06-09 20:29:06,防疫期間 丟丟想說 聽我說 ❤️❤️❤️❤️❤️❤️ 有任何問題請直接來電：0222660...,https://www.facebook.com/diudiu333/,1723714034327589,211523144148343.0
2,曉鈞兒,feed_subtitle_1723714034327589:-81169956646116...,2021-05-22 19:20:25,今日晚餐 #蕃茄炒蛋 #蒜炒小白菜 #紅蘿蔔烘蛋 #蜜汁佐肉⋯⋯ #酥烤雞腿排 #鈞兒廚房...,https://www.facebook.com/people/%E6%9B%89%E9%8...,1723714034327589,
3,丟丟妹,feed_subtitle_1723714034327589;430086242327939...,2021-05-12 22:49:48,【暫停自取服務公告】 為配合政府防疫措施，自5/13起，暫停自取服務， 造成您的不便，懇請見...,https://www.facebook.com/diudiu333/,1723714034327589,4300862423279391.0
4,丟丟妹,feed_subtitle_1723714034327589;428116058524957...,2021-05-06 15:34:21,你身上有她💋的香水味... 是我鼻子👃犯的罪... 王董是否能化險為夷呢⁉😱,https://www.facebook.com/diudiu333/,1723714034327589,4281160585249575.0
5,丟丟妹,feed_subtitle_1723714034327589;427780081891888...,2021-05-05 14:14:17,不會因為他人 抹滅我爸爸他們的辛苦 不會因為我是鄉下討海人的孩子感到自卑 因為我有這樣的爸...,https://www.facebook.com/diudiu333/,1723714034327589,4277800818918885.0
6,丟丟妹,feed_subtitle_1723714034327589;276119734236443;;9,2021-04-28 21:19:23,今天董事長換人當 ， 神秘嘉賓 丟丟讓你猜猜看❤️ 😎😎😎😎😎😎😎😎😎 有任何問題請直接來電...,https://www.facebook.com/diudiu333/,1723714034327589,276119734236443.0
7,丟丟妹,feed_subtitle_1723714034327589;423920332944530...,2021-04-22 18:46:34,☆☆☆Mercedes-Benz A180 銷售所得全數捐贈☆☆☆ 【說到做到！】信守承諾是...,https://www.facebook.com/diudiu333/,1723714034327589,4239203329445301.0
8,丟丟妹,feed_subtitle_1723714034327589;422635474406349...,2021-04-18 13:07:17,丟妹：你敢說 丟妹敢做 不要說送東西 你們要叫我們幹甚麼事 我都做得出來... 謝董：三萬次...,https://www.facebook.com/diudiu333/,1723714034327589,4226354744063493.0
9,丟丟妹,feed_subtitle_1723714034327589;422152334121330...,2021-04-16 21:14:02,給我的攻擊越大 就越強壯 誰叫我是 丟丟❤️ 給你們 傷眼一下🤨 有任何疑難雜症可以撥打02...,https://www.facebook.com/diudiu333/,1723714034327589,4221523341213300.0


In [35]:
df['POSTID'] = df['ID'].apply(lambda x: re.split(r'_|;|:-',x)[3])

In [36]:
df

Unnamed: 0,NAME,ID,TIME,CONTENT,LINK,PAGEID,POSTID
0,丟丟妹,feed_subtitle_1723714034327589;788936008456629;;9,2021-06-11 20:27:52,今天丟丟砸重本 送很大 來根丟丟補貨囉 💎💎💎💎💎💎 任何問題請直接來電：022266003...,https://www.facebook.com/diudiu333/,1723714034327589,788936008456629
1,丟丟妹,feed_subtitle_1723714034327589;211523144148343;;9,2021-06-09 20:29:06,防疫期間 丟丟想說 聽我說 ❤️❤️❤️❤️❤️❤️ 有任何問題請直接來電：0222660...,https://www.facebook.com/diudiu333/,1723714034327589,211523144148343
2,曉鈞兒,feed_subtitle_1723714034327589:-81169956646116...,2021-05-22 19:20:25,今日晚餐 #蕃茄炒蛋 #蒜炒小白菜 #紅蘿蔔烘蛋 #蜜汁佐肉⋯⋯ #酥烤雞腿排 #鈞兒廚房...,https://www.facebook.com/people/%E6%9B%89%E9%8...,1723714034327589,8116995664611601863
3,丟丟妹,feed_subtitle_1723714034327589;430086242327939...,2021-05-12 22:49:48,【暫停自取服務公告】 為配合政府防疫措施，自5/13起，暫停自取服務， 造成您的不便，懇請見...,https://www.facebook.com/diudiu333/,1723714034327589,4300862423279391
4,丟丟妹,feed_subtitle_1723714034327589;428116058524957...,2021-05-06 15:34:21,你身上有她💋的香水味... 是我鼻子👃犯的罪... 王董是否能化險為夷呢⁉😱,https://www.facebook.com/diudiu333/,1723714034327589,4281160585249575
5,丟丟妹,feed_subtitle_1723714034327589;427780081891888...,2021-05-05 14:14:17,不會因為他人 抹滅我爸爸他們的辛苦 不會因為我是鄉下討海人的孩子感到自卑 因為我有這樣的爸...,https://www.facebook.com/diudiu333/,1723714034327589,4277800818918885
6,丟丟妹,feed_subtitle_1723714034327589;276119734236443;;9,2021-04-28 21:19:23,今天董事長換人當 ， 神秘嘉賓 丟丟讓你猜猜看❤️ 😎😎😎😎😎😎😎😎😎 有任何問題請直接來電...,https://www.facebook.com/diudiu333/,1723714034327589,276119734236443
7,丟丟妹,feed_subtitle_1723714034327589;423920332944530...,2021-04-22 18:46:34,☆☆☆Mercedes-Benz A180 銷售所得全數捐贈☆☆☆ 【說到做到！】信守承諾是...,https://www.facebook.com/diudiu333/,1723714034327589,4239203329445301
8,丟丟妹,feed_subtitle_1723714034327589;422635474406349...,2021-04-18 13:07:17,丟妹：你敢說 丟妹敢做 不要說送東西 你們要叫我們幹甚麼事 我都做得出來... 謝董：三萬次...,https://www.facebook.com/diudiu333/,1723714034327589,4226354744063493
9,丟丟妹,feed_subtitle_1723714034327589;422152334121330...,2021-04-16 21:14:02,給我的攻擊越大 就越強壯 誰叫我是 丟丟❤️ 給你們 傷眼一下🤨 有任何疑難雜症可以撥打02...,https://www.facebook.com/diudiu333/,1723714034327589,4221523341213300


In [37]:
def parse_group_content(resp):
    soup = BeautifulSoup(resp.text, 'lxml')
    df = []
    for ele in soup.findAll('article'):
        try:
            df.append([
                re.findall('"actor_id":([0-9]{1,})' ,str(ele))[0], # actorid
                re.findall('"top_level_post_id":"(.*?)"' ,str(ele))[0], # postid
                ele.find('strong').text, # actorname
                json.loads(re.findall(r'"post_context":({.*?})', str(ele))[0])['publish_time'], # TIME
                json.loads(re.findall(r'"post_context":({.*?})', str(ele))[0])['story_name'], # story_name
                ele.select_one('div.story_body_container > div').text, # content
                ' '.join([i.text for i in ele.findAll('span', {'class':'_28wy'})]) # reactions
            ])
        except:
            pass
    df = pd.DataFrame(data=df, columns = ['ACTORID','POSTID', 'NAME', 'TIME','STORYNAME', 'CONTENT', 'REACTIONS'])
    df['GROUPID'] = re.findall('\?id=([0-9]{1,})"',resp.text)[0]
    df['TIME'] = df['TIME'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
    df['LIKES'] = df['REACTIONS'].apply(lambda x: re.findall('([0-9]{1,}) Like', x)[0] if 'Like' in x else '0')
    df['COMMENTS'] = df['REACTIONS'].apply(lambda x: re.findall('([0-9]{1,}) Comment', x)[0] if 'Comment' in x else '0')
    df['SHARES'] = df['REACTIONS'].apply(lambda x: re.findall('([0-9]{1,}) Share', x)[0] if 'Share' in x else '0')
    df = df.loc[:,['ACTORID', 'NAME', 'GROUPID', 'POSTID', 'TIME', 'STORYNAME', 'CONTENT', 'LIKES', 'COMMENTS', 'SHARES']]
    df['UPDATETIME'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")   
    return df



In [38]:
def get_bac(resp):
    string = urllib.parse.unquote(resp.text)
    bac = re.findall('\?bac=(.*?)={0,1}&amp',string)[0]
    return bac
## Crawl_GroupPosts

def Crawl_GroupPosts(groupurl, until_date='2019-01-01'):
    groupurl = re.sub('www','m', groupurl)
    headers = {
        'referer': 'https://m.facebook.com/',
        'cookie': 'locale=en_US',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
        }
    df = []
    bac = ''
    max_date =  datetime.datetime.now()
    break_times = 0
    # request data and break loop when reach the goal 
    while max_date >= datetime.datetime.strptime(until_date, '%Y-%m-%d'):
        # request params
        params = {
            'bac': bac,
            'multi_permalinks': '',
            'refid': '18'
            }
        resp = requests.get(groupurl, headers=headers, params=params)
        try:
            ndf = parse_group_content(resp)
            df.append(ndf)
            # update request params
            bac = get_bac(resp) 
            # there are some posts will be pinned at top, so we can't take the max date directly
            max_date = ndf['TIME'].sort_values(ascending=False,ignore_index=True)[3] 
            print('TimeStamp: {}.'.format(max_date))
            break_times = 0 # break times to zero
        except:
            break_times += 1
            print('break_times:', break_times)
        time.sleep(2)
        if break_times > 5:
            return print('ERROR: Please send the following URL to the author. \n', resp.url)
    # concat data we collect
    df = pd.concat(df, ignore_index=True)
    print('There are {} posts in the DataFrame.'.format(str(df.shape[0])))
    return df



In [39]:
groupurl = 'https://www.facebook.com/groups/pythontw'
Crawl_GroupPosts(groupurl, until_date='2021-01-01')

TimeStamp: 2021-06-26 20:21:45.
TimeStamp: 2021-06-25 18:45:16.
break_times: 1
break_times: 2
break_times: 3
break_times: 4
break_times: 5
break_times: 6
ERROR: Please send the following URL to the author. 
 https://m.facebook.com/groups/pythontw?bac=MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ%253D%26multi_permalinks%26refid%3D18&multi_permalinks=&refid=18


In [40]:
import requests
import re
import json
from bs4 import BeautifulSoup
import time
import datetime
import pandas as pd
import numpy as np
import urllib

# Fans page ==================================================================

## get page id
def get_pageid(pageurl):
    resp = requests.get(pageurl)
    pageid = re.findall('page_id=(.*?)"',resp.text)[0]
    return pageid

## parse_content
def parse_content(data):
    df = []
    soup = BeautifulSoup(data['domops'][0][3]['__html'], 'lxml')
    # post
    for ele in soup.findAll('div', {'class':'userContentWrapper'}):
        try:
            df.append([
                ele.find('img')['aria-label'], #name
                ele.find('div', {'data-testid':'story-subtitle'})['id'], # ID
                ele.find('abbr')['data-utime'], # time
                ''.join([i.text for i in ele.find('div', {'data-testid':'post_message'}).findAll('p')]), # content
                ele.find('a')['href'].split('?')[0] # link
                    ])
        except:
            pass  
    df = pd.DataFrame(data=df, columns=['NAME', 'ID', 'TIME', 'CONTENT', 'LINK'])
    df['PAGEID'] = df['ID'].apply(lambda x: re.split(r'_|;|-|:',x)[2])
    df['POSTID'] = df['ID'].apply(lambda x: re.split(r'_|;|:-',x)[3])
    df['TIME'] = df['TIME'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
    df = df.drop('ID',axis=1)
    return df

## get_reaction
def get_reaction(data):
    df = []
    # posts
    for ele in data['jsmods']['pre_display_requires']:
        try:
            df.append([
                ele[3][1]['__bbox']['variables']['storyID'], # storyID
                ele[3][1]['__bbox']['result']['data']['feedback']['display_comments_count']['count'],  # display_comments_count
                ele[3][1]['__bbox']['result']['data']['feedback']['comment_count']['total_count'], # total_comments_count
                ele[3][1]['__bbox']['result']['data']['feedback']['reaction_count']['count'], # reaction_count
                ele[3][1]['__bbox']['result']['data']['feedback']['share_count']['count'], # share_count
                ele[3][1]['__bbox']['result']['data']['feedback']['top_reactions']['edges'], # reactions
            ])
        except:
            pass
    
    # vidoes
    for ele in data['jsmods']['require']:
        try:
            
            df.append([
                'S:_I'+ele[3][2]['feedbacktarget']['actorid']+':'+ele[3][2]['feedbacktarget']['targetfbid'], # storyID
                ele[3][2]['feedbacktarget']['commentcount'], # display_comments_count
                ele[3][2]['feedbacktarget']['commentcount'], # total_comments_count
                ele[3][2]['feedbacktarget']['likecount'], # likecount
                ele[3][2]['feedbacktarget']['sharecount'], # sharecount
                [] # reactions
            ])
        except:
            pass
    df = pd.DataFrame(df, columns=['storyID','display_comments_count', 'total_comments_count', 'reaction_count', 'share_count', 'reactions'])
    df['storyID'] = df['storyID'].apply(lambda x: re.sub('S:_I', '',x))
    df['PAGEID'] = df['storyID'].apply(lambda x: re.split(r':',x)[0])
    df['POSTID'] = df['storyID'].apply(lambda x: re.split(r':',x)[1])
    
    # 
    def get_reactions(reactname, reactions):
        for react in reactions:
            if reactname in str(react):
                return react['reaction_count']
        return 0
    df['LIKE'] = df['reactions'].apply(lambda x: get_reactions('LIKE', x))
    df['LOVE'] = df['reactions'].apply(lambda x: get_reactions('LOVE', x))
    df['HAHA'] = df['reactions'].apply(lambda x: get_reactions('HAHA', x))
    df['SUPPORT'] = df['reactions'].apply(lambda x: get_reactions('SUPPORT', x))
    df['WOW'] = df['reactions'].apply(lambda x: get_reactions('WOW', x))
    df['ANGER'] = df['reactions'].apply(lambda x: get_reactions('ANGER', x))
    df['SORRY'] = df['reactions'].apply(lambda x: get_reactions('SORRY', x))
    
    # for vidoe's tpye post
    df['LIKE'] = np.select(condlist = [df['reactions'].apply(lambda x: len(x)==0)], 
                           choicelist=[df['reaction_count']], 
                           default=df['LIKE'])
    return df

# Crawl_PagePosts
def Crawl_PagePosts(pageurl, until_date='2019-01-01'):
    pageid = get_pageid(pageurl) 

    content_df = [] # post
    feedback_df = [] # reactions
    timeline_cursor = ''
    max_date =  datetime.datetime.now()
    break_times = 0
    
    # request date and break loop when reach the goal 
    while max_date >= datetime.datetime.strptime(until_date, '%Y-%m-%d'):
        
        # request params
        url = 'https://www.facebook.com/pages_reaction_units/more/'
        params = {'page_id': pageid,
                  'cursor': str({"timeline_cursor":timeline_cursor,
                                 "timeline_section_cursor":'{}',
                                 "has_next_page":'true'}), 
                  'surface': 'www_pages_home',
                  'unit_count': 20,
                  '__a': '1'}

        try:
            resp = requests.get(url, params=params)
            data = json.loads(re.sub(r'for \(;;\);','',resp.text))
            
            # contesnts：poster's name, poster's ID, post ID, time, content
            ndf = parse_content(data=data)
            content_df.append(ndf)

            # reactions
            ndf1 = get_reaction(data=data)
            feedback_df.append(ndf1)
  
            # update request params
            max_date = ndf['TIME'].max()
            print('TimeStamp: {}.'.format(ndf['TIME'].max()))
            timeline_cursor = re.findall(r'timeline_cursor\\u002522\\u00253A\\u002522(.*?)\\u002522\\u00252C\\u002522timeline_section_cursor',resp.text)[0]
            # break times to zero
            break_times = 0

        except:
            break_times += 1
            print('break_times:', break_times)
        
        time.sleep(2)
        if break_times > 5:
            break
    
    # join content and reactions
    content_df = pd.concat(content_df, ignore_index=True)
    feedback_df = pd.concat(feedback_df, ignore_index=True)
    df = pd.merge(left=content_df, right=feedback_df, how='left', on=['PAGEID', 'POSTID'])
    df = df.loc[:,['NAME', 'TIME', 'CONTENT', 'PAGEID', 'POSTID', 'display_comments_count', 'total_comments_count', 'reaction_count', 'share_count', 'LIKE', 'LOVE', 'HAHA', 'SUPPORT', 'WOW', 'ANGER', 'SORRY']]
    df = df.rename(columns={'display_comments_count':'DISPLAYCOMMENTS', 'total_comments_count':'TOTAL_COMMENTS', 'reaction_count':'REACTIONS','share_count':'SHARES'})
    df['UPDATETIME'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")      
    print('There are {} posts in DataFrame.'.format(str(df.shape[0])))
    return df

# Group page ==================================================================

## parse_group_content
def parse_group_content(resp):
    soup = BeautifulSoup(resp.text, 'lxml')
    df = []
    for ele in soup.findAll('article'):
        try:
            df.append([
                re.findall('"actor_id":([0-9]{1,})' ,str(ele))[0], # actorid
                re.findall('"top_level_post_id":"(.*?)"' ,str(ele))[0], # postid
                ele.find('strong').text, # actorname
                json.loads(re.findall(r'"post_context":({.*?})', str(ele))[0])['publish_time'], # TIME
                json.loads(re.findall(r'"post_context":({.*?})', str(ele))[0])['story_name'], # story_name
                ele.select_one('div.story_body_container > div').text, # content
                ' '.join([i.text for i in ele.findAll('span', {'class':'_28wy'})]) # reactions
            ])
        except:
            pass

    df = pd.DataFrame(data=df, columns = ['ACTORID','POSTID', 'NAME', 'TIME','STORYNAME', 'CONTENT', 'REACTIONS'])
    df['GROUPID'] = re.findall('\?id=([0-9]{1,})"',resp.text)[0]
    df['TIME'] = df['TIME'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
    df['LIKES'] = df['REACTIONS'].apply(lambda x: re.findall('([0-9]{1,}) Like', x)[0] if 'Like' in x else '0')
    df['COMMENTS'] = df['REACTIONS'].apply(lambda x: re.findall('([0-9]{1,}) Comment', x)[0] if 'Comment' in x else '0')
    df['SHARES'] = df['REACTIONS'].apply(lambda x: re.findall('([0-9]{1,}) Share', x)[0] if 'Share' in x else '0')
    df = df.loc[:,['ACTORID', 'NAME', 'GROUPID', 'POSTID', 'TIME', 'STORYNAME', 'CONTENT', 'LIKES', 'COMMENTS', 'SHARES']]
    df['UPDATETIME'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")   
    return df

## get_bac
def get_bac(resp):
    string = urllib.parse.unquote(resp.text)
    bac = re.findall('\?bac=(.*?)={0,1}&amp',string)[0]
    return bac

## Crawl_GroupPosts


In [41]:
def Crawl_GroupPosts(groupurl, until_date='2019-01-01'):
    groupurl = re.sub('www','m', groupurl)
    headers = {
        'referer': 'https://m.facebook.com/',
        'cookie': 'locale=en_US',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
        }
    df = []
    bac = ''
    max_date =  datetime.datetime.now()
    break_times = 0
    # request data and break loop when reach the goal 
    while max_date >= datetime.datetime.strptime(until_date, '%Y-%m-%d'):
        # request params
        params = {
            'bac': bac,
            'multi_permalinks': '',
            'refid': '18'
            }
        resp = requests.get(groupurl, headers=headers, params=params)
        try:
            ndf = parse_group_content(resp)
            df.append(ndf)
            # update request params
            bac = get_bac(resp) 
            # there are some posts will be pinned at top, so we can't take the max date directly
            max_date = ndf['TIME'].sort_values(ascending=False,ignore_index=True)[3] 
            print('TimeStamp: {}.'.format(max_date))
            break_times = 0 # break times to zero
        except:
            break_times += 1
            print('break_times:', break_times)
        time.sleep(2)
        if break_times > 5:
            return resp
            # return print('ERROR: Please send the following URL to the author. \n', resp.url)
    # concat data we collect
    df = pd.concat(df, ignore_index=True)
    print('There are {} posts in the DataFrame.'.format(str(df.shape[0])))
    return df



In [42]:
resp = Crawl_GroupPosts(groupurl, until_date='2021-01-01')

TimeStamp: 2021-06-26 20:21:45.
TimeStamp: 2021-06-25 18:45:16.
break_times: 1
break_times: 2
break_times: 3
break_times: 4
break_times: 5
break_times: 6


In [43]:
parse_group_content(resp)

IndexError: list index out of range

In [44]:
soup = BeautifulSoup(resp.text, 'lxml')

In [45]:
df = []

In [46]:
for ele in soup.findAll('article'):
        try:
            df.append([
                re.findall('"actor_id":([0-9]{1,})' ,str(ele))[0], # actorid
                re.findall('"top_level_post_id":"(.*?)"' ,str(ele))[0], # postid
                ele.find('strong').text, # actorname
                json.loads(re.findall(r'"post_context":({.*?})', str(ele))[0])['publish_time'], # TIME
                json.loads(re.findall(r'"post_context":({.*?})', str(ele))[0])['story_name'], # story_name
                ele.select_one('div.story_body_container > div').text, # content
                ' '.join([i.text for i in ele.findAll('span', {'class':'_28wy'})]) # reactions
            ])
        except:
            pass

In [47]:
df = pd.DataFrame(data=df, columns = ['ACTORID','POSTID', 'NAME', 'TIME','STORYNAME', 'CONTENT', 'REACTIONS'])
df

Unnamed: 0,ACTORID,POSTID,NAME,TIME,STORYNAME,CONTENT,REACTIONS


In [48]:
soup.findAll('article')

[]

In [49]:
resp.text

'<!DOCTYPE html PUBLIC "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" "http://www.wapforum.org/DTD/xhtml-mobile10.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\n  <head>\n    <title>Error Facebook</title>\n    <meta name="viewport"             content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0" />\n    <meta http-equiv="Content-Type"   content="text/html; charset=utf-8" />\n    <meta http-equiv="Cache-Control"  content="no-cache" />\n    <meta name="robots"               content="noindex, nofollow" />\n    <style type="text/css">\n      body {\n        margin: 0;\n        padding: 0;\n        font-family: "Helvetica", sans-serif;\n        font-size: 14px;\n        background-color: #f2f2f2;\n      }\n\n      img {\n        vertical-align: top;\n      }\n\n      a {\n        color: #3b5998;\n        text-decoration: none;\n      }\n\n      .touch a {\n        color: #576b95;\n      }\n\n      hr {\n        display: none;\n      }\n\n

In [50]:
bac

NameError: name 'bac' is not defined

In [51]:
resp.url

'https://m.facebook.com/groups/pythontw?bac=MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ%253D%26multi_permalinks%26refid%3D18&multi_permalinks=&refid=18'

In [52]:
requests.get('https://m.facebook.com/groups/pythontw?bac=MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ%253D%26multi_permalinks%26refid%3D18&multi_permalinks=&refid=18', headers=headers)

NameError: name 'headers' is not defined

In [53]:
    headers = {
        'referer': 'https://m.facebook.com/',
        'cookie': 'locale=en_US',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
        }

In [54]:
requests.get('https://m.facebook.com/groups/pythontw?bac=MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ%253D%26multi_permalinks%26refid%3D18&multi_permalinks=&refid=18', headers=headers)

<Response [500]>

In [55]:
pageurl= 'https://www.facebook.com/diudiu333'

In [56]:
Crawl_PagePosts(pageurl=pageurl, until_date='2021-01-01')

TimeStamp: 2021-06-11 20:27:52.
TimeStamp: 2021-03-02 11:55:57.
TimeStamp: 2020-11-23 19:09:27.
There are 58 posts in DataFrame.


Unnamed: 0,NAME,TIME,CONTENT,PAGEID,POSTID,DISPLAYCOMMENTS,TOTAL_COMMENTS,REACTIONS,SHARES,LIKE,LOVE,HAHA,SUPPORT,WOW,ANGER,SORRY,UPDATETIME
0,丟丟妹,2021-06-11 20:27:52,今天丟丟砸重本 送很大 來根丟丟補貨囉 💎💎💎💎💎💎 任何問題請直接來電：022266003...,1723714034327589,788936008456629,318445.0,318445.0,18466.0,110401.0,18466.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-06-27 14:42:21
1,丟丟妹,2021-06-09 20:29:06,防疫期間 丟丟想說 聽我說 ❤️❤️❤️❤️❤️❤️ 有任何問題請直接來電：0222660...,1723714034327589,211523144148343,176610.0,176610.0,13917.0,47573.0,13917.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-06-27 14:42:21
2,曉鈞兒,2021-05-22 19:20:25,今日晚餐 #蕃茄炒蛋 #蒜炒小白菜 #紅蘿蔔烘蛋 #蜜汁佐肉⋯⋯ #酥烤雞腿排 #鈞兒廚房...,1723714034327589,8116995664611601863,,,,,,,,,,,,2021-06-27 14:42:21
3,丟丟妹,2021-05-12 22:49:48,【暫停自取服務公告】 為配合政府防疫措施，自5/13起，暫停自取服務， 造成您的不便，懇請見...,1723714034327589,4300862423279391,317.0,553.0,20067.0,162.0,19861.0,149.0,6.0,28.0,20.0,1.0,2.0,2021-06-27 14:42:21
4,丟丟妹,2021-05-06 15:34:21,你身上有她💋的香水味... 是我鼻子👃犯的罪... 王董是否能化險為夷呢⁉😱,1723714034327589,4281160585249575,342.0,394.0,17220.0,544.0,13745.0,173.0,3133.0,57.0,44.0,43.0,25.0,2021-06-27 14:42:21
5,丟丟妹,2021-05-05 14:14:17,不會因為他人 抹滅我爸爸他們的辛苦 不會因為我是鄉下討海人的孩子感到自卑 因為我有這樣的爸...,1723714034327589,4277800818918885,1162.0,1932.0,61552.0,151.0,60626.0,640.0,10.0,260.0,10.0,1.0,5.0,2021-06-27 14:42:21
6,丟丟妹,2021-04-28 21:19:23,今天董事長換人當 ， 神秘嘉賓 丟丟讓你猜猜看❤️ 😎😎😎😎😎😎😎😎😎 有任何問題請直接來電...,1723714034327589,276119734236443,47452.0,47452.0,9775.0,11590.0,9775.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-06-27 14:42:21
7,丟丟妹,2021-04-22 18:46:34,☆☆☆Mercedes-Benz A180 銷售所得全數捐贈☆☆☆ 【說到做到！】信守承諾是...,1723714034327589,4239203329445301,194.0,209.0,12659.0,281.0,12262.0,190.0,128.0,27.0,26.0,17.0,9.0,2021-06-27 14:42:21
8,丟丟妹,2021-04-18 13:07:17,丟妹：你敢說 丟妹敢做 不要說送東西 你們要叫我們幹甚麼事 我都做得出來... 謝董：三萬次...,1723714034327589,4226354744063493,97.0,109.0,7620.0,168.0,6732.0,70.0,770.0,21.0,14.0,8.0,5.0,2021-06-27 14:42:21
9,丟丟妹,2021-04-16 21:14:02,給我的攻擊越大 就越強壯 誰叫我是 丟丟❤️ 給你們 傷眼一下🤨 有任何疑難雜症可以撥打02...,1723714034327589,4221523341213300,404.0,629.0,11558.0,32.0,11254.0,131.0,78.0,51.0,42.0,0.0,2.0,2021-06-27 14:42:21


In [57]:
Crawl_PagePosts(pageurl=pageurl, until_date='2020-01-01')

TimeStamp: 2021-06-11 20:27:52.
TimeStamp: 2021-03-02 11:55:57.
TimeStamp: 2020-11-23 19:09:27.
TimeStamp: 2020-06-25 12:01:49.
TimeStamp: 2020-03-20 17:47:08.
TimeStamp: 2019-12-22 21:30:00.
There are 117 posts in DataFrame.


Unnamed: 0,NAME,TIME,CONTENT,PAGEID,POSTID,DISPLAYCOMMENTS,TOTAL_COMMENTS,REACTIONS,SHARES,LIKE,LOVE,HAHA,SUPPORT,WOW,ANGER,SORRY,UPDATETIME
0,丟丟妹,2021-06-11 20:27:52,今天丟丟砸重本 送很大 來根丟丟補貨囉 💎💎💎💎💎💎 任何問題請直接來電：022266003...,1723714034327589,788936008456629,318445.0,318445.0,18466.0,110398.0,18466.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-06-27 14:43:10
1,丟丟妹,2021-06-09 20:29:06,防疫期間 丟丟想說 聽我說 ❤️❤️❤️❤️❤️❤️ 有任何問題請直接來電：0222660...,1723714034327589,211523144148343,176610.0,176610.0,13917.0,47572.0,13917.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-06-27 14:43:10
2,曉鈞兒,2021-05-22 19:20:25,今日晚餐 #蕃茄炒蛋 #蒜炒小白菜 #紅蘿蔔烘蛋 #蜜汁佐肉⋯⋯ #酥烤雞腿排 #鈞兒廚房...,1723714034327589,8116995664611601863,,,,,,,,,,,,2021-06-27 14:43:10
3,丟丟妹,2021-05-12 22:49:48,【暫停自取服務公告】 為配合政府防疫措施，自5/13起，暫停自取服務， 造成您的不便，懇請見...,1723714034327589,4300862423279391,317.0,553.0,20067.0,162.0,19861.0,149.0,6.0,28.0,20.0,1.0,2.0,2021-06-27 14:43:10
4,丟丟妹,2021-05-06 15:34:21,你身上有她💋的香水味... 是我鼻子👃犯的罪... 王董是否能化險為夷呢⁉😱,1723714034327589,4281160585249575,342.0,394.0,17220.0,544.0,13745.0,173.0,3133.0,57.0,44.0,43.0,25.0,2021-06-27 14:43:10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112,丟丟妹,2019-09-10 17:50:52,怒🤬🤬🤬🤬🤬🤬 我和家人的關係 不需要 （外人） 有所評論 我家人永遠是我最愛的人 不需要 ...,1723714034327589,2636178933081090,588.0,595.0,12465.0,24.0,11968.0,176.0,34.0,0.0,264.0,5.0,18.0,2021-06-27 14:43:10
113,丟丟妹,2019-08-20 19:38:22,今天是小丟丟誕生的日子！（撒花 丟妹剛進產房、小編代po 影片講解： 丟妹突然拿起電話打給櫃...,1723714034327589,2597075640324753,1176.0,1201.0,7601.0,27.0,6936.0,188.0,461.0,0.0,16.0,0.0,0.0,2021-06-27 14:43:10
114,丟丟妹,2019-08-11 20:55:59,#下禮拜要生娃娃了好緊張😭 #測試有沒有幫丟丟按追蹤搶先看 ❤️ #幫妹丟丟分享沒人好生氣 ...,1723714034327589,2270557759940948,2464.0,2464.0,3622.0,170.0,3622.0,0.0,0.0,0.0,0.0,0.0,0.0,2021-06-27 14:43:10
115,丟丟妹,2019-07-29 21:09:26,#各位最近忙什麼呢（？）#等我出關吧 🥳 丟丟下個月要生產囉🥺🥺🥺（緊張刺激害怕） 所以最熱...,1723714034327589,2557096954322622,401.0,424.0,9395.0,34.0,9025.0,167.0,29.0,1.0,169.0,3.0,1.0,2021-06-27 14:43:10


In [58]:
def parse_group_content(resp):
    soup = BeautifulSoup(resp.text, 'lxml')
    df = []
    for ele in soup.findAll('article'):
        try:
            df.append([
                re.findall('"actor_id":([0-9]{1,})' ,str(ele))[0], # actorid
                re.findall('"top_level_post_id":"(.*?)"' ,str(ele))[0], # postid
                ele.find('strong').text, # actorname
                json.loads(re.findall(r'"post_context":({.*?})', str(ele))[0])['publish_time'], # TIME
                json.loads(re.findall(r'"post_context":({.*?})', str(ele))[0])['story_name'], # story_name
                ele.select_one('div.story_body_container > div').text, # content
                ' '.join([i.text for i in ele.findAll('span', {'class':'_28wy'})]) # reactions
            ])
        except:
            pass
    df = pd.DataFrame(data=df, columns = ['ACTORID','POSTID', 'NAME', 'TIME','STORYNAME', 'CONTENT', 'REACTIONS'])
    df['GROUPID'] = re.findall('\?id=([0-9]{1,})"',resp.text)[0]
    df['TIME'] = df['TIME'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)))
    df['LIKES'] = df['REACTIONS'].apply(lambda x: re.findall('([0-9]{1,}) Like', x)[0] if 'Like' in x else '0')
    df['COMMENTS'] = df['REACTIONS'].apply(lambda x: re.findall('([0-9]{1,}) Comment', x)[0] if 'Comment' in x else '0')
    df['SHARES'] = df['REACTIONS'].apply(lambda x: re.findall('([0-9]{1,}) Share', x)[0] if 'Share' in x else '0')
    df = df.loc[:,['ACTORID', 'NAME', 'GROUPID', 'POSTID', 'TIME', 'STORYNAME', 'CONTENT', 'LIKES', 'COMMENTS', 'SHARES']]
    df['UPDATETIME'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")   
    return df
## get_bac

def get_bac(resp):
    string = urllib.parse.unquote(resp.text)
    bac = re.findall('\?bac=(.*?)={0,1}&amp',string)[0]
    return bac
## Crawl_GroupPosts

def Crawl_GroupPosts(groupurl, until_date='2019-01-01'):
    groupurl = re.sub('www','m', groupurl)
    headers = {
        'referer': 'https://m.facebook.com/',
        'cookie': 'locale=en_US',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
        }
    df = []
    bac = ''
    max_date =  datetime.datetime.now()
    break_times = 0
    # request data and break loop when reach the goal 
    while max_date >= datetime.datetime.strptime(until_date, '%Y-%m-%d'):
        # request params
        params = {
            'bac': bac,
            'multi_permalinks': '',
            'refid': '18'
            }
        resp = requests.get(groupurl, headers=headers, params=params)
        try:
            ndf = parse_group_content(resp)
            df.append(ndf)
            # update request params
            bac = get_bac(resp) 
            print(bac)
            # there are some posts will be pinned at top, so we can't take the max date directly
            max_date = ndf['TIME'].sort_values(ascending=False,ignore_index=True)[3] 
            print('TimeStamp: {}.'.format(max_date))
            break_times = 0 # break times to zero
        except:
            break_times += 1
            print('break_times:', break_times)
        time.sleep(2)
        if break_times > 5:
            return resp
            # return print('ERROR: Please send the following URL to the author. \n', resp.url)
    # concat data we collect
    df = pd.concat(df, ignore_index=True)
    print('There are {} posts in the DataFrame.'.format(str(df.shape[0])))
    return df



In [59]:
groupurl = 'https://www.facebook.com/groups/pythontw'
Crawl_GroupPosts(groupurl, until_date='2021-01-01')

MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ=
TimeStamp: 2021-06-26 20:21:45.
MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ%3D&multi_permalinks&refid=18
TimeStamp: 2021-06-25 18:45:16.
break_times: 1
break_times: 2
break_times: 3
break_times: 4
break_times: 5
break_times: 6


<Response [500]>

In [60]:
resp.text

'<!DOCTYPE html PUBLIC "-//WAPFORUM//DTD XHTML Mobile 1.0//EN" "http://www.wapforum.org/DTD/xhtml-mobile10.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\n  <head>\n    <title>Error Facebook</title>\n    <meta name="viewport"             content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=0" />\n    <meta http-equiv="Content-Type"   content="text/html; charset=utf-8" />\n    <meta http-equiv="Cache-Control"  content="no-cache" />\n    <meta name="robots"               content="noindex, nofollow" />\n    <style type="text/css">\n      body {\n        margin: 0;\n        padding: 0;\n        font-family: "Helvetica", sans-serif;\n        font-size: 14px;\n        background-color: #f2f2f2;\n      }\n\n      img {\n        vertical-align: top;\n      }\n\n      a {\n        color: #3b5998;\n        text-decoration: none;\n      }\n\n      .touch a {\n        color: #576b95;\n      }\n\n      hr {\n        display: none;\n      }\n\n

In [61]:
MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ

NameError: name 'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ' is not defined

In [62]:
groupurl = re.sub('www','m', groupurl)

In [63]:
groupurl

'https://m.facebook.com/groups/pythontw'

In [64]:
    headers = {

SyntaxError: unexpected EOF while parsing (<ipython-input-64-4d38e1a99044>, line 1)

In [65]:
headers = {
        'referer': 'https://m.facebook.com/',
        'cookie': 'locale=en_US',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
        }

In [66]:
df = []
    bac = ''
    max_date =  datetime.datetime.now()
    break_times = 0

IndentationError: unexpected indent (<ipython-input-66-5f63ed1b374a>, line 2)

In [67]:
    df = []
    bac = ''
    max_date =  datetime.datetime.now()
    break_times = 0

In [68]:
        params = {
            'bac': bac,
            'multi_permalinks': '',
            'refid': '18'
            }

In [69]:
resp = requests.get(groupurl, headers=headers, params=params)

In [70]:
parse_group_content(resp)

Unnamed: 0,ACTORID,NAME,GROUPID,POSTID,TIME,STORYNAME,CONTENT,LIKES,COMMENTS,SHARES,UPDATETIME
0,160712400714277,PyCon Taiwan,197223143437,10161302929498438,2021-05-29 11:17:07,EntGroupQuestionCreationStory,【#PyCast許願池】 大家好，我們是 PyCon Taiwan 的夥伴！ 今年是 PyC...,23,0,1,2021-06-27 15:43:33
1,100034770712412,陳仕原,197223143437,10161378257433438,2021-06-25 15:03:31,EntGroupMallPostCreationStory,請問一定要把綠色字刪掉才能執行嗎 (我是初學者求解),37,26,3,2021-06-27 15:43:33
2,100003199696636,Genie Kuo,197223143437,10161383216748438,2021-06-27 14:12:40,EntGroupMallPostCreationStory,安心防疫 ，在線聽講最好！ 6/30政大社科院數位轉型推動系列講座 本次推出 政大校友...,2,0,0,2021-06-27 15:43:33
3,100015297752773,劉弘翔,197223143437,10161381057933438,2021-06-26 18:52:20,EntGroupMallPostCreationStory,小小問題請大神解惑，感謝 新手發問:請問大家，我想要把文件打開並且數出字數(不要標點符號)要...,4,2,0,2021-06-27 15:43:33
4,100000199211663,Jimmy Lin,197223143437,10161381510958438,2021-06-26 23:34:16,EntGroupMallPostCreationStory,各位先進，大家好 我在使用Django的時候 返回jsonresponse 出現下列這種er...,1,3,0,2021-06-27 15:43:33
5,100013805490753,Yu Chen,197223143437,10161381601948438,2021-06-27 00:28:47,EntGroupMallPostCreationStory,"請問一下 我有一個array[1,2,3,4,5,6,7,8,9] 我要如何reshape成...",14,5,0,2021-06-27 15:43:33
6,120405158,Claire C,197223143437,10161383150663438,2021-06-24 12:00:01,EntComposerPhotoCreationStory,◤ 開發者技術與程式運用懶人包 ◢包含推薦系統 Merlin、智慧影像分析 DeepStre...,0,0,0,2021-06-27 15:43:33
7,100001685034801,Charles Liu,197223143437,10160092520388438,2020-05-14 15:57:41,EntGroupMallPostCreationStory,課程名稱: Python 快速入門精講http://estucourse.com/t...,14,2,8,2021-06-27 15:43:33
8,100000100205150,Shiyeh Yeh,197223143437,10161380996368438,2021-06-26 17:58:57,EntGroupMallPostCreationStory,請問兩個dcit要合併，該怎麼寫比較好? (應該說有條件的合併，並不一定是要新的蓋掉舊的),13,14,4,2021-06-27 15:43:33
9,100029194779243,ခ်မ္းေလး,197223143437,10161383010663438,2021-06-20 18:00:53,EntVideoCreationStory,,0,0,0,2021-06-27 15:43:33


In [71]:
get_bac(resp)

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ='

In [72]:
string = urllib.parse.unquote(resp.text)

In [73]:
string

se,abortLoadingReUpStillVisibleVideos:false,abortedLoadingPixelBoundary:-2000,reloadingPixelBoundary:-1000,videoPollingFrequency:200,disableAutoplayOnHomePgUpPgDownEnd:false,pressPgUpPgDownAutoplayShutoffInterval:500,pressHomeEndAutoplayShutoffInterval:1000,enteredExitedFsLoggingFix:false,watchedPercentage:95,createVideoVisibilityObserver:false,fireBufferingEndEventOnDestroy:false,showStaleOverlayOnVideoNodeStaled:false,showHDIndicator:true,useDebouncedScroll:false,enableInstreamAdViewabilityLogging:true,enableComponentGuards:true,endScreenDuration:5000,staleVideoNudgeAmount:0,copyLinkInContextMenu:false,copyLinkAtCurrentTimeInContextMenu:true,fixVPCCallBeforeLoaded:false,videoVisibilityObserverUseMinimumThreshold:false,delayVideoControlRenderForApiReady:false,unsubscribeImmediateplay:true,removeHiddenVideoTracking:false,fixVideoPauseWhenBackgroundedInFeed:false,fixFeedVideosPlayOffscreen:false,pauseUpNextControllerWNS:true,pauseAudioUnitsOffscreen:true,pauseOffscreenMutedOnUpdateAutop

In [74]:
re.findall('\?bac=(.*?)={0,1}&amp',string)[0]

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ='

In [75]:
re.findall('\?bac=(.*?)={0,1}&amp',string)

['MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ=',
 'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ=']

In [76]:
def get_bac(resp):
    string = urllib.parse.unquote(resp.text)
    bac = re.findall('\?bac=(.*?)={1}&amp',string)[0]
    return bac



In [77]:
re.findall('\?bac=(.*?)={1}&amp',string)

['MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ=',
 'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ=']

In [78]:
re.findall('\?bac=(.*?)=={1}&amp',string)

['MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ',
 'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ']

In [79]:
def get_bac(resp):
    string = urllib.parse.unquote(resp.text)
    bac = re.findall('\?bac=(.*?)[=]{0,2}&amp',string)[0]
    return bac



In [80]:
get_bac(resp)

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ'

In [81]:
Crawl_GroupPosts(groupurl, until_date='2019-01-01')

MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ
TimeStamp: 2021-06-26 20:21:45.
MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ&multi_permalinks&refid=18
TimeStamp: 2021-06-25 18:45:16.
break_times: 1
break_times: 2
break_times: 3
break_times: 4
break_times: 5
break_times: 6


<Response [500]>

In [82]:
'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ&multi_permalinks&refid=18'.split(r'&',-1)[0]

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ'

In [83]:
def get_bac(resp):
    string = urllib.parse.unquote(resp.text)
    bac = re.findall('\?bac=(.*?)[=]{0,2}&amp',string)[0]
    bac = bac.split(r'&',-1)[0]
    return bac



In [84]:
'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ&multi_permalinks&refid=18'.split(r'&&',-1)[0]

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ&multi_permalinks&refid=18'

In [85]:
Crawl_GroupPosts(groupurl, until_date='2019-01-01')

MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ
TimeStamp: 2021-06-26 20:21:45.
MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ
TimeStamp: 2021-06-25 18:45:16.
MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ
TimeStamp: 2021-06-25 18:45:16.
MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ
TimeStamp: 2021-06-25 18:45:16.
MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ
TimeStamp: 2021-06-25 18:45:16.
MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ
TimeStamp: 2021-06-25 18:45:16.


KeyboardInterrupt: 

In [86]:
groupurl = re.sub('www','m', groupurl)

In [87]:
headers = {
        'referer': 'https://m.facebook.com/',
        'cookie': 'locale=en_US',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
        }

In [88]:
    df = []
    bac = ''
    max_date =  datetime.datetime.now()
    break_times = 0

In [89]:
        params = {
            'bac': bac,
            'multi_permalinks': '',
            'refid': '18'
            }

In [90]:
resp = requests.get(groupurl, headers=headers, params=params)

In [91]:
ndf = parse_group_content(resp)

In [92]:
ndf

Unnamed: 0,ACTORID,NAME,GROUPID,POSTID,TIME,STORYNAME,CONTENT,LIKES,COMMENTS,SHARES,UPDATETIME
0,160712400714277,PyCon Taiwan,197223143437,10161302929498438,2021-05-29 11:17:07,EntGroupQuestionCreationStory,【#PyCast許願池】 大家好，我們是 PyCon Taiwan 的夥伴！ 今年是 PyC...,23,0,1,2021-06-27 15:58:38
1,100034770712412,陳仕原,197223143437,10161378257433438,2021-06-25 15:03:31,EntGroupMallPostCreationStory,請問一定要把綠色字刪掉才能執行嗎 (我是初學者求解),37,26,3,2021-06-27 15:58:38
2,100003199696636,Genie Kuo,197223143437,10161383216748438,2021-06-27 14:12:40,EntGroupMallPostCreationStory,安心防疫 ，在線聽講最好！ 6/30政大社科院數位轉型推動系列講座 本次推出 政大校友...,2,0,0,2021-06-27 15:58:38
3,100015297752773,劉弘翔,197223143437,10161381057933438,2021-06-26 18:52:20,EntGroupMallPostCreationStory,小小問題請大神解惑，感謝 新手發問:請問大家，我想要把文件打開並且數出字數(不要標點符號)要...,4,2,0,2021-06-27 15:58:38
4,100000199211663,Jimmy Lin,197223143437,10161381510958438,2021-06-26 23:34:16,EntGroupMallPostCreationStory,各位先進，大家好 我在使用Django的時候 返回jsonresponse 出現下列這種er...,1,3,0,2021-06-27 15:58:38
5,100013805490753,Yu Chen,197223143437,10161381601948438,2021-06-27 00:28:47,EntGroupMallPostCreationStory,"請問一下 我有一個array[1,2,3,4,5,6,7,8,9] 我要如何reshape成...",15,5,0,2021-06-27 15:58:38
6,120405158,Claire C,197223143437,10161383150663438,2021-06-24 12:00:01,EntComposerPhotoCreationStory,◤ 開發者技術與程式運用懶人包 ◢包含推薦系統 Merlin、智慧影像分析 DeepStre...,0,0,0,2021-06-27 15:58:38
7,100001685034801,Charles Liu,197223143437,10160092520388438,2020-05-14 15:57:41,EntGroupMallPostCreationStory,課程名稱: Python 快速入門精講http://estucourse.com/t...,14,2,8,2021-06-27 15:58:38
8,100000100205150,Shiyeh Yeh,197223143437,10161380996368438,2021-06-26 17:58:57,EntGroupMallPostCreationStory,請問兩個dcit要合併，該怎麼寫比較好? (應該說有條件的合併，並不一定是要新的蓋掉舊的),13,14,4,2021-06-27 15:58:38
9,100029194779243,ခ်မ္းေလး,197223143437,10161383010663438,2021-06-20 18:00:53,EntVideoCreationStory,,0,0,0,2021-06-27 15:58:38


In [93]:
df.append(ndf)

In [94]:
get_bac(resp)

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ'

In [95]:
urllib.parse.unquote(resp.text)

destroyonunload:false,hlsButton:false,fourArrowFullScreen:false,autoplayUntilHalfGone:true,slidingWNSv3:false,chromecast:true,enableRtmpBuffer:false,rtmpBufferTime:0,rtmpBufferTimeMax:0,rtmpImprovePlayback:false,rtmpStartTimeFix:true,useBlurredBars:true,liveAudioViewer:true,webVideosBlockAutoplayWhenOffline:false,delayAutoplayUntilAfterLoad:true,autoplayMaxCallsPerWindow:0,autoplayThrottleWindow:0,autoplayThrottleDelay:0,inlineSoundVisible:false,persistentWNSEnabled:false,showWNSClose:true,rhcWNSPauseAds:true,rhcWNS:true,pauseWhenOffscreen:false,bufferingErrorTimeout:30000,liveBufferingErrorTimeout:60000,progressiveBufferingErrorTimeout:60000,allowBufferingErrorForHiddenTab:true,disableFallbackModeForInactiveTab:false,disableAutoplayForInactiveTab:true,disableStallLoggingForError:true,endBufferingOnFallbackPlay:false,delayFormatChangeEvent:false,autoplayBlockBlacklist:false,rhcWNSEnabled:true,rhcWNSDrawerEnabled:false,rhcWNSExpandToTahoe:true,abortLoadingDecisioningLogic:false,embedded

In [96]:
re.findall('\?bac=(.*?)&amp',string)

['MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ==',
 'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ==']

In [97]:
def get_bac(resp):
    string = urllib.parse.unquote(resp.text)
    bac = re.findall('\?bac=(.*?)&amp',string)[0]
    bac = bac.split(r'&',-1)[0]
    return bac



In [98]:
Crawl_GroupPosts(groupurl, until_date='2019-01-01')

MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ==
TimeStamp: 2021-06-26 20:21:45.
MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ%3D%3D
TimeStamp: 2021-06-25 18:45:16.
break_times: 1
break_times: 2
break_times: 3
break_times: 4
break_times: 5
break_times: 6


<Response [500]>

In [99]:
re.findall('\?bac=(.*?)&',string)

['MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ==',
 'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ==',
 '']

In [100]:
def get_bac(resp):
    try:
        bac = re.findall('bac=(.*?)%3D',resp.text)[0]
    except:
        try:
            bac = re.findall('bac=(.*?)&amp',resp.text)[0]
        except:
            bac = re.findall('bac%3D(.*?)%26', resp.text)[0]
    return bac



In [101]:
Crawl_GroupPosts(groupurl, until_date='2019-01-01')

MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ
TimeStamp: 2021-06-26 20:21:45.
MTYyNDU0MDAwODoxMDE2MTM3MDU1Mjk0ODQzODoxMDE2MTM3MDU1Mjk0ODQzOCwwLDE6MjA6S3c9PQ
TimeStamp: 2021-06-25 18:45:16.
MTYyNDM2MDE3NToxMDE2MTM3MDA0ODUxODQzODoxMDE2MTM3MDA0ODUxODQzOCwwLDI6MjA6S3c9PQ
TimeStamp: 2021-06-23 18:36:50.
MTYyNDIwODM2MDoxMDE2MTM2NTQ2MzYyMzQzODoxMDE2MTM2NTQ2MzYyMzQzOCwwLDM6MjA6S3c9PQ
TimeStamp: 2021-06-21 16:39:00.
MTYyNDA4OTA3OToxMDE2MTM2MTc0MDU2ODQzODoxMDE2MTM2MTc0MDU2ODQzOCwwLDQ6MjA6S3c9PQ
TimeStamp: 2021-06-20 21:11:03.
MTYyMzk5MDE5MToxMDE2MTM1ODQwNjMzODQzODoxMDE2MTM1ODQwNjMzODQzOCwwLDU6MjA6S3c9PQ
TimeStamp: 2021-06-19 08:52:22.
MTYyMzgzMzUwNzoxMDE2MTM1MzAxNTkxODQzODoxMDE2MTM1MzAxNTkxODQzOCwwLDY6MjA6S3c9PQ
TimeStamp: 2021-06-17 19:00:13.
MTYyMzczMzY2NDoxMDE2MTMwNjYzNjg1ODQzODoxMDE2MTMwNjYzNjg1ODQzOCwwLDc6MjA6S3c9PQ
TimeStamp: 2021-06-16 02:54:51.
MTYyMzUxOTc3NDoxMDE2MTM0MzI5MzU5ODQzODoxMDE2MTM0MzI5MzU5ODQzOCwwLDg6MjA6S3c9PQ
TimeStamp: 2021-06-14 13:33:41.
M

KeyboardInterrupt: 

In [102]:
def get_bac(resp):
    string = urllib.parse.unquote(resp.text)
    bac = re.findall('\?bac=(.*?)[=]{0,2}&amp',string)[0]
    bac = bac.split(r'&',-1)[0]
    bac = re.sub(r'%3D', '', bac)
    return bac



In [103]:
Crawl_GroupPosts(groupurl, until_date='2021-01-01')

MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ
TimeStamp: 2021-06-26 20:21:45.
MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ
TimeStamp: 2021-06-25 18:45:16.
MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ
TimeStamp: 2021-06-25 18:45:16.
MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ
TimeStamp: 2021-06-25 18:45:16.


KeyboardInterrupt: 

In [104]:
bac = 'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ'

In [105]:
resp = requests.get(groupurl, headers=headers, params=params)

In [106]:
parse_group_content(resp)

Unnamed: 0,ACTORID,NAME,GROUPID,POSTID,TIME,STORYNAME,CONTENT,LIKES,COMMENTS,SHARES,UPDATETIME
0,160712400714277,PyCon Taiwan,197223143437,10161302929498438,2021-05-29 11:17:07,EntGroupQuestionCreationStory,【#PyCast許願池】 大家好，我們是 PyCon Taiwan 的夥伴！ 今年是 PyC...,23,0,1,2021-06-27 16:23:12
1,100034770712412,陳仕原,197223143437,10161378257433438,2021-06-25 15:03:31,EntGroupMallPostCreationStory,請問一定要把綠色字刪掉才能執行嗎 (我是初學者求解),37,26,3,2021-06-27 16:23:12
2,100003199696636,Genie Kuo,197223143437,10161383216748438,2021-06-27 14:12:40,EntGroupMallPostCreationStory,安心防疫 ，在線聽講最好！ 6/30政大社科院數位轉型推動系列講座 本次推出 政大校友...,2,0,0,2021-06-27 16:23:12
3,100015297752773,劉弘翔,197223143437,10161381057933438,2021-06-26 18:52:20,EntGroupMallPostCreationStory,小小問題請大神解惑，感謝 新手發問:請問大家，我想要把文件打開並且數出字數(不要標點符號)要...,4,2,0,2021-06-27 16:23:12
4,100000199211663,Jimmy Lin,197223143437,10161381510958438,2021-06-26 23:34:16,EntGroupMallPostCreationStory,各位先進，大家好 我在使用Django的時候 返回jsonresponse 出現下列這種er...,1,3,0,2021-06-27 16:23:12
5,100013805490753,Yu Chen,197223143437,10161381601948438,2021-06-27 00:28:47,EntGroupMallPostCreationStory,"請問一下 我有一個array[1,2,3,4,5,6,7,8,9] 我要如何reshape成...",15,5,0,2021-06-27 16:23:12
6,120405158,Claire C,197223143437,10161383150663438,2021-06-24 12:00:01,EntComposerPhotoCreationStory,◤ 開發者技術與程式運用懶人包 ◢包含推薦系統 Merlin、智慧影像分析 DeepStre...,0,0,0,2021-06-27 16:23:12
7,100001685034801,Charles Liu,197223143437,10160092520388438,2020-05-14 15:57:41,EntGroupMallPostCreationStory,課程名稱: Python 快速入門精講http://estucourse.com/t...,14,2,8,2021-06-27 16:23:12
8,100000100205150,Shiyeh Yeh,197223143437,10161380996368438,2021-06-26 17:58:57,EntGroupMallPostCreationStory,請問兩個dcit要合併，該怎麼寫比較好? (應該說有條件的合併，並不一定是要新的蓋掉舊的),13,14,4,2021-06-27 16:23:12
9,100029194779243,ခ်မ္းေလး,197223143437,10161383010663438,2021-06-20 18:00:53,EntVideoCreationStory,,0,0,0,2021-06-27 16:23:12


In [107]:
get_bac(resp) 

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ'

In [108]:
def get_bac(resp):
    try:
        bac = re.findall('bac=(.*?)%3D',resp.text)[0]
    except:
        try:
            bac = re.findall('bac=(.*?)&amp',resp.text)[0]
        except:
            bac = re.findall('bac%3D(.*?)%26', resp.text)[0]
    return bac



In [109]:
get_bac(resp) 

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ'

In [110]:
bac = get_bac(resp) 

In [111]:
resp = requests.get(groupurl, headers=headers, params=params)

In [112]:
get_bac(resp) 

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ'

In [113]:
bac = ''
resp = requests.get(groupurl, headers=headers, params=params)
get_bac(resp) 

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ'

In [114]:
resp = requests.get(groupurl, headers=headers, params=params)
get_bac(resp) 

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ'

In [115]:
resp = requests.get(groupurl, headers=headers, params=params)
get_bac(resp) 

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ'

In [116]:
def get_bac(resp):
    try:
        bac = re.findall('bac=(.*?)%3D',resp.text)[0]
    except:
        try:
            bac = re.findall('bac=(.*?)&amp',resp.text)[0]
        except:
            bac = re.findall('bac%3D(.*?)%26', resp.text)[0]
    return bac



In [117]:
bac = ''

In [118]:
resp = requests.get(groupurl, headers=headers, params=params)
get_bac(resp) 

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ'

In [119]:
resp = requests.get(groupurl, headers=headers, params=params)
get_bac(resp) 

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ'

In [120]:
resp = requests.get(groupurl, headers=headers, params=params)
get_bac(resp) 

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ'

In [121]:
resp = requests.get(groupurl, headers=headers, params=params)
get_bac(resp) 

'MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ'

In [122]:
def get_bac(resp):
    try:
        bac = re.findall('bac=(.*?)%3D',resp.text)[0]
    except:
        try:
            bac = re.findall('bac=(.*?)&amp',resp.text)[0]
        except:
            bac = re.findall('bac%3D(.*?)%26', resp.text)[0]
    return bac



In [123]:
Crawl_GroupPosts(groupurl, until_date='2021-05-01')

MTYyNDY4OTA3ODoxMDE2MTM3ODI5NDI3ODQzODoxMDE2MTM3ODI5NDI3ODQzOCwwLDA6MjE6S3c9PQ
TimeStamp: 2021-06-26 20:21:45.
MTYyNDU0MDAwODoxMDE2MTM3MDU1Mjk0ODQzODoxMDE2MTM3MDU1Mjk0ODQzOCwwLDE6MjA6S3c9PQ
TimeStamp: 2021-06-25 18:45:16.
MTYyNDM2MDE3NToxMDE2MTM3MDA0ODUxODQzODoxMDE2MTM3MDA0ODUxODQzOCwwLDI6MjA6S3c9PQ
TimeStamp: 2021-06-23 18:36:50.
MTYyNDIwODM2MDoxMDE2MTM2NTQ2MzYyMzQzODoxMDE2MTM2NTQ2MzYyMzQzOCwwLDM6MjA6S3c9PQ
TimeStamp: 2021-06-21 16:39:00.
MTYyNDA4OTA3OToxMDE2MTM2MTc0MDU2ODQzODoxMDE2MTM2MTc0MDU2ODQzOCwwLDQ6MjA6S3c9PQ
TimeStamp: 2021-06-20 21:11:03.
MTYyMzk5MDE5MToxMDE2MTM1ODQwNjMzODQzODoxMDE2MTM1ODQwNjMzODQzOCwwLDU6MjA6S3c9PQ
TimeStamp: 2021-06-19 08:52:22.
MTYyMzgzMzUwNzoxMDE2MTM1MzAxNTkxODQzODoxMDE2MTM1MzAxNTkxODQzOCwwLDY6MjA6S3c9PQ
TimeStamp: 2021-06-17 19:00:13.
MTYyMzczMzY2NDoxMDE2MTMwNjYzNjg1ODQzODoxMDE2MTMwNjYzNjg1ODQzOCwwLDc6MjA6S3c9PQ
TimeStamp: 2021-06-16 02:54:51.
MTYyMzUxOTc3NDoxMDE2MTM0MzI5MzU5ODQzODoxMDE2MTM0MzI5MzU5ODQzOCwwLDg6MjA6S3c9PQ
TimeStamp: 2021-06-14 13:33:41.
M