## References
 * [偷偷說爬蟲by Go](https://city.shaform.com/zh/2019/01/11/plurk-crawler/) 
 * [Requests Docs](https://2.python-requests.org/en/master/api)
 * [bs4 Docs](https://www.crummy.com/software/BeautifulSoup/bs4/doc) 
 * [bs4 simple intro](https://blog.gtwang.org/programming/python-beautiful-soup-module-scrape-web-pages-tutorial/2/)
 * [Plurk API 2.0](https://www.plurk.com/API)

In [29]:
# import packages
from __future__ import print_function

from mdutils.mdutils import MdUtils # help to create a markdown file
from bs4 import BeautifulSoup as bs # for html parsing
import requests as rq
import slimit
from slimit.parser import Parser
from slimit.visitors.nodevisitor import ASTVisitor

### 0803 - testing 
> Small goal: take a Plurk link as input and generate Markdown from it.

In [30]:
class JSVisitor(ASTVisitor):
    """Visitor that flattens JavaScript object literals into a Python dict.

    Inherits from slimit's ``ASTVisitor``.  While traversing a parse tree,
    every property of every object literal encountered is converted to a
    Python value and stored in the dictionary supplied at construction
    time.  Properties of nested objects end up in the same flat dictionary
    because the traversal continues into each property's children.
    """

    def __init__(self, dic):
        """Store ``dic``, the dictionary that will receive the properties."""
        self.json = dic

    def visit_Object(self, node):
        """Record each property of the object literal ``node`` in ``self.json``.

        A property whose value cannot be converted (``GetValue`` raises
        ``ValueError``) is reported on stdout and skipped, so no stale or
        undefined value is ever stored under its key.
        """
        for prop in node:
            left, right = prop.left, prop.right
            # Only string-literal keys carry surrounding quotes; other key
            # nodes (identifiers, numbers) are used as written.
            if type(left) is slimit.ast.String:
                key = left.value[1:-1]
            else:
                key = left.value
            try:
                value = self.GetValue(right)
            except ValueError as e:
                print("ValueError: {}".format(str(e)))
            else:
                # Store only on success: the failing path has no valid value.
                self.json[key] = value
            # visit all children in turn
            self.visit(prop)

    def GetValue(self, node):
        """Convert an AST value node into the matching Python value.

        Handles ``NewExpr`` (returned as the placeholder string
        ``"NewExpr"``), ``Boolean``, ``String`` (surrounding quotes removed),
        ``Number``, ``Null`` and ``Array`` (converted element by element).

        NOTE: string contents are returned exactly as they appear in the
        source text; escape sequences such as ``\\uXXXX`` are not decoded.

        Raises:
            ValueError: for an unsupported node type, an unexpected boolean
                token, or a numeric literal that is neither a plain integer
                nor a float.
        """
        node_type = type(node)
        if node_type is slimit.ast.NewExpr:
            return "NewExpr"
        if node_type is slimit.ast.Boolean:
            if node.value == 'false':
                return False
            if node.value == 'true':
                return True
            raise ValueError("Unknow value of node: {}".format(node.value))
        if node_type is slimit.ast.String:
            return node.value[1:-1]   # strip the surrounding quotes
        if node_type is slimit.ast.Number:
            try:
                return int(node.value)
            except ValueError:
                # Not a plain integer (e.g. "1.5"): fall back to float.
                return float(node.value)
        if node_type is slimit.ast.Null:
            return None
        if node_type is slimit.ast.Array:
            return [self.GetValue(item) for item in node.items]
        raise ValueError("Unknow node type: {}".format(node_type))

In [31]:
def get_content_by_link(plurk_url):
    """Download a plurk page and its responses.

    The page at ``plurk_url`` is fetched and parsed as HTML.  The text of
    the last ``<script>`` element is parsed as JavaScript and its object
    properties are collected into a dictionary through ``JSVisitor``.  The
    ``id`` found there is then POSTed to ``Responses/get`` to retrieve the
    responses as JSON.  Progress messages are printed along the way.

    Returns:
        A tuple ``(plurk_content, soup, response_content)``: the collected
        properties, the parsed page, and the decoded JSON of the responses
        request.  When the page request does not answer with status OK, the
        string ``'Fail QAQ'`` is returned instead of a tuple.
    """
    # Fetch the plurk page itself.
    with rq.Session() as page_session:
        plurk = page_session.get(plurk_url)
        if plurk.status_code != rq.codes.ok:
            print("Request fail. Status: {}.".format(plurk.status_code))
            return 'Fail QAQ'
        print("Request Success! Status: {}.".format(plurk.status_code))

    # Parse the HTML and pull out the text of the last <script> element.
    soup = bs(plurk.text)
    all_scripts = soup.find_all("script")
    script = all_scripts[-1].string

    # Build the JavaScript parse tree and collect its object properties.
    plurk_content = {}
    json_tree = Parser().parse(script)
    JSVisitor(plurk_content).visit(json_tree)

    # Ask Responses/get for the responses; plurk_id tells the site which
    # plurk is being requested.
    request_url = "https://www.plurk.com/Responses/get"
    data = {'plurk_id': plurk_content.get('id'), 'from_response_id': '0'}
    print(data)

    with rq.Session() as reply_session:
        response = reply_session.post(request_url, data=data)  # HTTP POST
        if response.status_code == rq.codes.ok:
            outcome = "Request Success! Status: {}."
        else:
            outcome = "Request fail. Status: {}."
        print(outcome.format(response.status_code))

    return plurk_content, soup, response.json()
    

In [32]:
def _avatar_url(user_id, user):
    """Return the small avatar URL for the responder described by ``user``."""
    if user.get('has_profile_image') != 1:
        return "https://www.plurk.com/static/default_small.gif"
    avatar = user.get('avatar')
    # The avatar value, when present, is appended to the "-small" file stem.
    suffix = "" if avatar is None else str(avatar)
    return "https://avatars.plurk.com/" + user_id + "-small" + suffix + ".gif"


#####################
#                   #
#   Main Function   #
#                   #
#####################
def get_raw_plurk(link):
    """Collect a plurk and its responses into one plain dictionary.

    Args:
        link: URL of the plurk page, passed to ``get_content_by_link``.

    Returns:
        A dict with three keys: ``"plurk_info"`` (fields read from the
        properties collected from the page's script), ``"plurk"`` (poster
        image, name, time and content read from the page HTML, plus the
        response count) and ``"response"`` (one dict per response with the
        responder's id, avatar URL, display name, name colour, raw content
        and posting time).

    Raises:
        ValueError: if ``get_content_by_link`` reports a failure instead of
            returning its usual 3-tuple.
    """
    fetched = get_content_by_link(link)
    if not isinstance(fetched, tuple):
        # get_content_by_link signals a failed page request with a plain
        # string; fail clearly instead of with an unpacking error.
        raise ValueError(
            "Could not fetch plurk from {!r}: {}".format(link, fetched))
    plurk_info, plurk_html, content = fetched

    head = plurk_html.find_all("div", class_="plurk")
    # Serialise every child node of the text holder back into markup.
    post_content = "".join(
        str(child) for child in head[0].find("div", class_="text_holder"))

    raw_data = {
        "plurk_info": {
            "plurk_id": plurk_info.get('id'),
            "user_id": plurk_info.get('user_id'),
            "favorite_count": plurk_info.get('favorite_count'),
            "response_count": plurk_info.get('response_count'),
            "replurkers_count": plurk_info.get('replurkers_count'),
            "coins": plurk_info.get('coins'),
            "qualifier": plurk_info.get('qualifier'),
            "anonymous": plurk_info.get('anonymous'),
            "st_edited": plurk_info.get('st_edited'),
            "no_comments": plurk_info.get('no_comments'),
            "posted": plurk_info.get('posted'),
            "lang": plurk_info.get('lang'),
            "content": plurk_info.get('content'),
            "content_raw": plurk_info.get('content_raw'),
        },
        "plurk": {
            "poster_img": head[0].find("img").get("src"),
            "poster_name": head[0].find("a", class_="name").text,
            "post_time": head[0].find("time", class_="timeago")['datetime'],
            "post_content": post_content,
            "response_count": content.get('response_count'),
        },
        "response": [],
    }

    users = content.get('users')
    for response in content.get('responses'):
        # The users mapping is looked up with the id as a string.
        user_id = str(response.get('user_id'))
        user = users.get(user_id)

        raw_data['response'].append({
            "user_id": user_id,
            "user_img": _avatar_url(user_id, user),
            "user_name": user.get('display_name'),
            "name_color": user.get('name_color'),
            "content": response.get('content_raw'),
            "posted": response.get('posted'),
        })

    return raw_data

In [33]:
# Ask for a plurk URL and scrape it into the raw_data dictionary.
url = input('Plurk url: ')
raw_data = get_raw_plurk(url)
# Bare expression as the last statement of the cell, so the dictionary is
# shown as the cell's output.
raw_data

Plurk url: https://www.plurk.com/p/negxzt
Request Success! Status: 200.
{'plurk_id': 1415027225, 'from_response_id': '0'}
Request Success! Status: 200.


{'plurk_info': {'plurk_id': 1415027225,
  'user_id': 4898180,
  'favorite_count': 206,
  'response_count': 82,
  'replurkers_count': 161,
  'coins': 0,
  'qualifier': ':',
  'anonymous': False,
  'st_edited': None,
  'no_comments': 0,
  'posted': 'NewExpr',
  'lang': 'tr_ch',
  'content': '\\u6211\\u5011\\u6709\\u65b0\\u529f\\u80fd\\u4e86\\uff0c\\u76ee\\u524d\\u5148\\u958b\\u653e\\u5657\\u5e63\\u4f7f\\u7528\\u8005\\u6e2c\\u8a66<br /><a href=\\"https://www.plurk.com/p/negxv7\\" class=\\"ex_link meta plink\\" rel=\\"nofollow\\"><img src=\\"https://avatars.plurk.com/4203050-medium10.gif\\" height=\\"40px\\">@plurktaiwan - \\u65b0\\u529f\\u80fd\\uff1a\\u8cc7\\u6e90\\u56de\\u6536\\u6876 - \\u958b\\u653e\\u5657\\u5e63\\u4f7f\\u7528\\u8005\\u6e2c\\u8a66\\u4e2d\\u64c1\\u6709\\u5657\\u5e63\\u5c07\\u6703\\u9810\\u8a2d\\u555f\\u7528\\u8cc7\\u6e90\\u56de\\u6536\\u6876...</a>',
  'content_raw': '\\u6211\\u5011\\u6709\\u65b0\\u529f\\u80fd\\u4e86\\uff0c\\u76ee\\u524d\\u5148\\u958b\\u653e\\u5657\\u5e6

In [34]:
#####################
#                   #
#      Markdown     #
#                   #
#####################
# Render the scraped plurk held in raw_data as a Markdown file: the original
# post first, then a horizontal rule, then every response in order.

md_file = MdUtils(file_name='plurk_test', title='plurk_test_v0')

# Original post: avatar, poster name, content and time.
post = raw_data['plurk']
md_file.new_paragraph("![U](" + post['poster_img'] + ")")
md_file.write(post['poster_name'], bold_italics_code='b', color='#DDDDDD')
md_file.new_line(post['post_content'])
md_file.new_line(post['post_time'])
md_file.write('\n___')

md_file.new_line(str(post['response_count'])+'則回應', color='#E8E8E8')
for reply in raw_data['response']:
    md_file.new_paragraph("![U](" + reply['user_img'] + ")")

    # Responders without a custom name colour get the default grey.
    name_color = reply['name_color']
    color = '#DDDDDD' if name_color is None else '#' + name_color
    md_file.write(reply['user_name'], bold_italics_code='b', color=color)

    md_file.new_line(reply['content'])
    md_file.new_line(reply['posted'])

md_file.create_md_file()

<mdutils.fileutils.fileutils.MarkDownFile at 0x1dafdf35e80>