## reference
 * [偷偷說爬蟲by Go](https://city.shaform.com/zh/2019/01/11/plurk-crawler/) 
 * [Requests Docs](https://2.python-requests.org/en/master/api)
 * [bs4 Docs](https://www.crummy.com/software/BeautifulSoup/bs4/doc) 
 * [bs4 simple intro](https://blog.gtwang.org/programming/python-beautiful-soup-module-scrape-web-pages-tutorial/2/)
 * [Plurk API 2.0](https://www.plurk.com/API)

In [1]:
# import packages
from __future__ import print_function

from mdutils.mdutils import MdUtils # help to create a markdown file
from bs4 import BeautifulSoup as bs # for html parsing
import requests as rq
import slimit
from slimit.parser import Parser
from slimit.visitors.nodevisitor import ASTVisitor

### request plurk content

In [2]:
# Ask the user for the URL of the plurk page to request.
# Example page used for testing: https://www.plurk.com/p/nf00yf
plurk_url = input("Plurk url: ")

Plurk url: https://www.plurk.com/p/nf00yf


In [3]:
with rq.Session() as sess:
    # Fetch the plurk page with an HTTP GET and report the status code.
    plurk = sess.get(plurk_url)
    print(
        ("Request Success! Status: {}." if plurk.status_code == rq.codes.ok
         else "Request fail. Status: {}.").format(plurk.status_code)
    )

Request Success! Status: 200.


In [4]:
# Display the raw HTML text of the page fetched above.
plurk.text

'<!DOCTYPE html>\n<html lang="en"> <head> <meta charset="utf-8"> <title>噗浪技術部🛠 - 精神好 - Plurk</title> <link rel="shortcut icon" type="image/png" href="//s.plurk.com/936ddc656e104792b651240cdafeb7aa.png"> <link rel="dns-prefetch" href="//avatars.plurk.com"> <link rel="dns-prefetch" href="//emos.plurk.com"> <link rel="dns-prefetch" href="//images.plurk.com"> <link rel="dns-prefetch" href="//imgs.plurk.com"> <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1, minimum-scale=1, user-scalable=0" /> <meta name="mobile-web-app-capable" content="yes" /> <meta name="apple-mobile-web-app-capable" content="yes" /> <meta name="fragment" content="!"> <meta property="og:type" content="article" /> <meta property="og:title" content="精神好" /> <meta property="og:site_name" content="Plurk" /> <meta property="og:url" content="https://www.plurk.com/p/nf00yf" /> <meta property="og:description" content="Plurk by 噗浪技術部🛠 - 30 response(s)" /> <meta property="og:image" content="http

### request response content

Get the `plurk_id` from the returned content (it is the `id` field of the JSON object in the last script).  
Here the id is 1415917527.  
Then use this id to request responses.

In [5]:
# Endpoint that serves the responses of a plurk.
request_url = "https://www.plurk.com/Responses/get"
# The id is typed in by hand; it becomes the form data of the POST request.
plurk_id = input("plurk id: ")
data = dict(plurk_id=plurk_id, from_response_id='0')
print(data)

plurk id: 1415917527
{'plurk_id': '1415917527', 'from_response_id': '0'}


In [6]:
with rq.Session() as sess:
    # HTTP POST to Responses/get; the form data carries the plurk_id that
    # tells the website which plurk's responses are being requested.
    response = sess.post(request_url, data=data)
    print(
        ("Request Success! Status: {}." if response.status_code == rq.codes.ok
         else "Request fail. Status: {}.").format(response.status_code)
    )

Request Success! Status: 200.


In [7]:
# Decode the body of the POST response as JSON and display it.
response.json()

{'has_older': 0,
 'responses_seen': 0,
 'users': {'7160832': {'verified_account': False,
   'default_lang': 'tr_ch',
   'premium': True,
   'uid': 7160832,
   'dateformat': 0,
   'nick_name': 'OoHEECHULoO',
   'has_profile_image': 1,
   'enable_2fa': 1,
   'date_of_birth': 'Sat, 27 Feb 1904 00:01:01 GMT',
   'karma': 113.73,
   'gender': 2,
   'name_color': '63C6D3',
   'display_name': '小宥韓諾非🏳️\u200d🌈住非洲大叢林',
   'timeline_privacy': 0,
   'id': 7160832,
   'avatar': 48},
  '9240459': {'verified_account': False,
   'default_lang': 'tr_ch',
   'premium': True,
   'uid': 9240459,
   'dateformat': 0,
   'nick_name': 'loruru',
   'has_profile_image': 1,
   'enable_2fa': 0,
   'date_of_birth': 'Tue, 01 Mar 1904 00:01:01 GMT',
   'karma': 102.7,
   'gender': 2,
   'name_color': 'BA8FBE',
   'display_name': '如如✩Slash',
   'timeline_privacy': 0,
   'id': 9240459,
   'avatar': 22995939},
  '3876824': {'verified_account': False,
   'default_lang': 'tr_ch',
   'premium': True,
   'uid': 3876824,
  

### deal with plurk content

Use BeautifulSoup4 to extract the last *script* tag.
Then use slimit to parse the JavaScript and transform the JSON into a Python dictionary.

In [8]:
# Parse the HTML of the plurk page. The parser is named explicitly so the
# result does not depend on which optional parsers happen to be installed
# (and so bs4 does not warn about having to guess one).
soup = bs(plurk.text, "html.parser")
# Take the text of the last <script> tag on the page.
script = soup.find_all("script")[-1].string
script

'\nplurk = {"replurked": false, "porn": false, "is_mute": false, "mentioned": 0, "replurkable": true, "id": 1415917527, "favorite_count": 158, "is_unread": 0, "favorers": [21808, 3129053, 3215834, 3221973, 3312311, 3536153, 3540366, 3554026, 3670093, 3671311, 3715310, 3842286, 3894072, 3922173, 4005101, 4374158, 4419806, 4427270, 4771716, 5122743, 5519860, 5669641, 5685364, 5702467, 5745317, 5770592, 5794478, 6027406, 6030624, 6313452, 6348240, 6588550, 6853970, 6985351, 7341872, 7373785, 7417406, 7455630, 7517099, 7745128, 7765829, 7886419, 7923019, 7948777, 7971822, 7985759, 7994992, 8003857, 8051072, 8105576, 8129229, 8146844, 8146868, 8183324, 8293318, 8435723, 8464742, 8471213, 8553604, 8683078, 8701051, 8808644, 8884026, 8945182, 8984376, 8999161, 9010563, 9018214, 9056826, 9063379, 9138585, 9155106, 9155146, 9181355, 9196090, 9208338, 9255572, 9261770, 9299345, 9336945, 9397221, 9408746, 9466532, 9557182, 9829398, 9869392, 9924669, 9937072, 9939454, 10122443, 10236205, 10260574,

**Define a custom visitor to parse the tree**

In [9]:
"""
" A Visitor inheritate slimit.visitors.ASTVisitor.
" To traverse the parse tree and transform to python dictionary
"""
class JSVisitor(ASTVisitor):
    """Visitor that copies object-literal properties of a slimit parse tree into a dict.

    Inherits from slimit's ASTVisitor. Every property handled by
    ``visit_Object`` is written into the single dictionary supplied at
    construction time, keyed by the property name.
    """

    def __init__(self, dic):
        """Keep ``dic`` as the dictionary that visited properties are stored in."""
        self.json = dic

    def visit_Object(self, node):
        """Store each property of the object literal ``node`` in ``self.json``.

        A property whose value cannot be converted by ``GetValue`` is reported
        on stdout and left out of the dictionary.
        """
        for prop in node:
            left, right = prop.left, prop.right
            key = left.value
            # Strip the delimiters only when the key really is a quoted string;
            # slicing blindly would cut characters off an unquoted key.
            if len(key) >= 2 and key[0] == key[-1] and key[0] in "\"'":
                key = key[1:-1]
            try:
                self.json[key] = self.GetValue(right)
            except ValueError as e:
                # Skip the key instead of storing an unbound or stale value
                # left over from a previous property.
                print("ValueError: {}".format(str(e)))
            # Visit all children in turn (presumably this dispatches nested
            # object literals back to visit_Object -- see ASTVisitor.visit).
            self.visit(prop)

    def GetValue(self, node):
        """Convert an AST value node into the matching Python value.

        Returns the string "NewExpr" for ``new`` expressions, a bool for
        booleans, the text between the delimiters for strings (escape
        sequences are not decoded), an int or float for numbers, None for
        null, and a list of converted items for arrays.

        Raises:
            ValueError: for an unsupported node type, an unexpected boolean
                literal, or a number that is neither an int nor a float.
        """
        node_type = type(node)
        if node_type is slimit.ast.NewExpr:
            return "NewExpr"
        if node_type is slimit.ast.Boolean:
            if node.value == 'false':
                return False
            if node.value == 'true':
                return True
            raise ValueError("Unknow value of node: {}".format(node.value))
        if node_type is slimit.ast.String:
            return node.value[1:-1]   # drop the surrounding quotes
        if node_type is slimit.ast.Number:
            # Integers stay integers; anything else (e.g. 113.73) is read as a
            # float instead of failing the whole property.
            try:
                return int(node.value)
            except ValueError:
                return float(node.value)
        if node_type is slimit.ast.Null:
            return None
        if node_type is slimit.ast.Array:
            return [self.GetValue(n) for n in node.items]
        raise ValueError("Unknow node type: {}".format(node_type))

In [10]:
# Dictionary that the visitor fills with the properties found in the script.
plurk_content = {}

parser = Parser()
json_tree = parser.parse(script)    # construct the parse tree from the script text
visitor = JSVisitor(plurk_content)  # the visitor writes into plurk_content
visitor.visit(json_tree)            # traverse the tree



See [plurkAPI#plurk_data](https://www.plurk.com/API#plurk_data) to know the structure of plurk content json  
Useful Properties: plurk_id, favorite_count, owner_id, coins, qualifier, response_count, replurkers_count, anonymous, last_edited, no_comments, posted, lang, content_raw

In [11]:
# Display the dictionary filled in by the visitor.
plurk_content

{'replurked': False,
 'porn': False,
 'is_mute': False,
 'mentioned': 0,
 'replurkable': True,
 'id': 1415917527,
 'favorite_count': 158,
 'is_unread': 0,
 'favorers': [21808,
  3129053,
  3215834,
  3221973,
  3312311,
  3536153,
  3540366,
  3554026,
  3670093,
  3671311,
  3715310,
  3842286,
  3894072,
  3922173,
  4005101,
  4374158,
  4419806,
  4427270,
  4771716,
  5122743,
  5519860,
  5669641,
  5685364,
  5702467,
  5745317,
  5770592,
  5794478,
  6027406,
  6030624,
  6313452,
  6348240,
  6588550,
  6853970,
  6985351,
  7341872,
  7373785,
  7417406,
  7455630,
  7517099,
  7745128,
  7765829,
  7886419,
  7923019,
  7948777,
  7971822,
  7985759,
  7994992,
  8003857,
  8051072,
  8105576,
  8129229,
  8146844,
  8146868,
  8183324,
  8293318,
  8435723,
  8464742,
  8471213,
  8553604,
  8683078,
  8701051,
  8808644,
  8884026,
  8945182,
  8984376,
  8999161,
  9010563,
  9018214,
  9056826,
  9063379,
  9138585,
  9155106,
  9155146,
  9181355,
  9196090,
  9208338,

In [12]:
# Keep the decoded JSON of the Responses/get reply and display it.
response_content = response.json()
response_content

{'has_older': 0,
 'responses_seen': 0,
 'users': {'7160832': {'verified_account': False,
   'default_lang': 'tr_ch',
   'premium': True,
   'uid': 7160832,
   'dateformat': 0,
   'nick_name': 'OoHEECHULoO',
   'has_profile_image': 1,
   'enable_2fa': 1,
   'date_of_birth': 'Sat, 27 Feb 1904 00:01:01 GMT',
   'karma': 113.73,
   'gender': 2,
   'name_color': '63C6D3',
   'display_name': '小宥韓諾非🏳️\u200d🌈住非洲大叢林',
   'timeline_privacy': 0,
   'id': 7160832,
   'avatar': 48},
  '9240459': {'verified_account': False,
   'default_lang': 'tr_ch',
   'premium': True,
   'uid': 9240459,
   'dateformat': 0,
   'nick_name': 'loruru',
   'has_profile_image': 1,
   'enable_2fa': 0,
   'date_of_birth': 'Tue, 01 Mar 1904 00:01:01 GMT',
   'karma': 102.7,
   'gender': 2,
   'name_color': 'BA8FBE',
   'display_name': '如如✩Slash',
   'timeline_privacy': 0,
   'id': 9240459,
   'avatar': 22995939},
  '3876824': {'verified_account': False,
   'default_lang': 'tr_ch',
   'premium': True,
   'uid': 3876824,
  

### 0803 - testing 
#### little goal: input plurk link and get some md

In [13]:
def get_content_by_link(plurk_url):
    """Fetch a plurk page and return the JSON of that plurk's responses.

    The page at ``plurk_url`` is downloaded, the plurk id is read from the
    object literal in its last <script> tag, and that id is posted to
    Responses/get. Everything is requested inside this function, so the
    result depends only on ``plurk_url`` and not on variables left behind by
    other notebook cells.

    Args:
        plurk_url: URL of the plurk page.

    Returns:
        The decoded JSON of the Responses/get reply, or the string
        'Fail QAQ' when either HTTP request does not return status 200.

    Raises:
        ValueError: if no plurk id is found in the page's last script.
    """
    responses_url = "https://www.plurk.com/Responses/get"

    with rq.Session() as sess:
        # request plurk content from plurk.com
        plurk = sess.get(plurk_url)
        if plurk.status_code == rq.codes.ok:
            print("Request Success! Status: {}.".format(plurk.status_code))
        else:
            print("Request fail. Status: {}.".format(plurk.status_code))
            return 'Fail QAQ'

        # Read the HTML with an explicitly named parser and take the text of
        # the last <script> tag.
        soup = bs(plurk.text, "html.parser")
        script = soup.find_all("script")[-1].string

        plurk_content = {}
        parser = Parser()
        json_tree = parser.parse(script)    # construct parse tree
        visitor = JSVisitor(plurk_content)
        visitor.visit(json_tree)            # traverse the tree

        plurk_id = plurk_content.get('id')
        if plurk_id is None:
            raise ValueError("No plurk id found in page: {}".format(plurk_url))

        # Ask for the responses of exactly this plurk.
        data = {'plurk_id': plurk_id, 'from_response_id': '0'}
        replies = sess.post(responses_url, data=data)
        if replies.status_code != rq.codes.ok:
            print("Request fail. Status: {}.".format(replies.status_code))
            return 'Fail QAQ'

    return replies.json()
    

In [15]:
#  plurk_id, favorite_count, owner_id, 
# coins, qualifier, response_count, 
# replurkers_count, anonymous, last_edited, 
# no_comments, posted, lang, content_raw
##################### 
#                   #
#   Main Function   #
#                   #
##################### 
content = get_content_by_link('https://www.plurk.com/p/nf00yf')
responses = content.get('responses')
users = content.get('users')
md_file = MdUtils(file_name='plurk_test',title='plurk_test_v0')

# Build the markdown for each response: the author's display name in bold,
# followed by the response content. The loop variable is deliberately not
# called `response`, so it does not overwrite the requests Response object
# that other cells keep under that name.
for reply in responses:
    name = users.get(str(reply.get('user_id'))).get('display_name')
    md_file.new_paragraph(name, bold_italics_code='b', color='#DDDDDD')
    md_file.new_line(reply.get('content'))

md_file.create_md_file()

Request Success! Status: 200.




<mdutils.fileutils.fileutils.MarkDownFile at 0x12fcc343f98>