# Langchain Tool Bakeoff

In [None]:
import datetime
import json
import xml.etree.ElementTree as ET
import os
from collections import Counter
from typing import Optional, Type

from bs4 import BeautifulSoup
from langchain.tools import BaseTool
from langchain.callbacks.manager import AsyncCallbackManagerForToolRun, CallbackManagerForToolRun

from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

import crawler

In [17]:
def filenameParser(filename: str) -> tuple[str, str, str]:
    filename = filename[:-4]
    date, time, post_id = filename.split('_')
    return date, time, post_id

def findPostByID(post_id: str) -> str:
    for year_dir in os.listdir("./HatePolitics/"):
        for filename in os.listdir(os.path.join("./HatePolitics/", year_dir)):
            if f"{post_id}" in filename:
                return os.path.join("./HatePolitics/", year_dir, filename)

In [18]:
findPostByID("M.1672914887.A.04F")

'./HatePolitics/2023/20230105_1034_M.1672914887.A.04F.xml'

In [11]:

'''
Header format:
<?xml version='1.0' encoding='utf-8'?>
<TEI.2>
   <teiHeader>
      <metadata name="media"></metadata>
      <metadata name="author"></metadata>
      <metadata name="post_id"></metadata>
      <metadata name="year"></metadata>
      <metadata name="board"></metadata>
      <metadata name="title"></metadata>
   </teiHeader>
   <text>
      <body author=""></body>
      <title author=""></title>
      <comment author="" c_type=""></comment>
      <!-- c_type {pos: 推, neu: 箭頭, neg: 噓 -->
   </text>
</TEI.2>
'''
directory = './HatePolitics/2023'
files = os.listdir(directory)
files.sort()

start_date = 20230101
end_date = 20230131
in_range_post_ids = []
for filename in files:
    date = int(filename[:8])
    if date >= start_date and date <= end_date:
        in_range_post_ids.append(filename)

# print(inrange_files)
print(len(in_range_post_ids))

# parse each xml file and get the author name
author_count = Counter()
min_score = 0
max_score = 0
min_author = ''
max_author = ''
min_title = ''
max_title = ''

for filename in in_range_post_ids:
    tree = ET.parse(directory + '/' + filename)
    root = tree.getroot()
    author = root[0][1].text
    title = root[0][5].text
    author_count[author] += 1

    # loop through all comments
    score = 0
    for comment in root[1][2:]:
        print(comment)
        print(comment.attrib)
        comment_type = comment.attrib['c_type']
        if comment_type == 'pos':
            score += 1
        elif comment_type == 'neg':
            score -= 1
    
    if score < min_score:
        min_score = score
        min_author = author
        min_title = title
    elif score > max_score:
        max_score = score
        max_author = author
        max_title = title

print(author_count.most_common(10))
print(min_score, min_author, min_title)
print(max_score, max_author, max_title)



1020
<Element 'comment' at 0x7f0a8ef8e900>
{'author': 'gowaa', 'c_type': 'neu'}
<Element 'comment' at 0x7f0a8efbc4f0>
{'author': 'ohohohya', 'c_type': 'neg'}
<Element 'comment' at 0x7f0a8efbc9f0>
{'author': 'ohohohya', 'c_type': 'neu'}
<Element 'comment' at 0x7f0a8efbcea0>
{'author': 'ohohohya', 'c_type': 'neg'}
<Element 'comment' at 0x7f0a8efaa720>
{'author': 'TheoEpstein', 'c_type': 'neu'}
<Element 'comment' at 0x7f0a8efaac70>
{'author': 'zeuswell', 'c_type': 'neu'}
<Element 'comment' at 0x7f0a8e6ac400>
{'author': 'zeuswell', 'c_type': 'neu'}
<Element 'comment' at 0x7f0a8e6ac950>
{'author': 'kuninaka', 'c_type': 'pos'}
<Element 'comment' at 0x7f0a8e6accc0>
{'author': 'kuninaka', 'c_type': 'neu'}
<Element 'comment' at 0x7f0a8e692db0>
{'author': 'kuninaka', 'c_type': 'neu'}
<Element 'comment' at 0x7f0a8e692950>
{'author': 'kuninaka', 'c_type': 'neu'}
<Element 'comment' at 0x7f0a8e692450>
{'author': 'TheoEpstein', 'c_type': 'neu'}
<Element 'comment' at 0x7f0a8e692090>
{'author': 'TheoEp

In [10]:
filename = in_range_post_ids[0]
tree = ET.parse(directory + '/' + filename)
root = tree.getroot()
print(root[1][2].attrib)

{'author': 'gowaa', 'c_type': 'neu'}


XML file format:

```xml
<?xml version='1.0' encoding='utf-8'?>
<TEI.2>
   <teiHeader>
      <metadata name="media"></metadata>
      <metadata name="author"></metadata>
      <metadata name="post_id"></metadata>
      <metadata name="year"></metadata>
      <metadata name="board"></metadata>
      <metadata name="title"></metadata>
   </teiHeader>
   <text>
      <body author="">
         <s>
            <w type=""></w> <!-- type=詞性標記 -->
         </s>
      </body>
      <title author=""></title>
      <!-- c_type {pos: 推, neu: 箭頭, neg: 噓} -->
      <comment author="" c_type=""></comment>
   </text>
</TEI.2>
```

[Reference](https://python.langchain.com/en/latest/modules/agents/tools/custom_tools.html)

In [None]:
# Template for a tool
class TemplateTool(BaseTool):
    name = "custom_search"
    description = "useful for when you need to answer questions about current events"

    def _run(self,
             query: str,
             run_manager: Optional[CallbackManagerForToolRun] = None) -> str:
        """Use the tool."""
        return "query"
    
    async def _arun(self, query: str, run_manager: Optional[AsyncCallbackManagerForToolRun] = None) -> str:
        """Use the tool asynchronously."""
        raise NotImplementedError("custom_search does not support async")
    

In [None]:
class GetPostsByDate(BaseTool):
    """
    Get ptt posts in the database, by date
    """
    name = "get_posts_by_date"
    description = """
    Input: string of date, in format YYYYMMDD, (e.g. 20200101)
    Output: list of post ids, serialized in json
    """
    def _run(self,
             query: str,
             run_manager: Optional[CallbackManagerForToolRun] = None) -> str:
        year, month, day = query[:4], query[4:6], query[6:]
        assert len(year) == 4 and len(month) == 2 and len(day) == 2
        directory = "./HatePolitics/2023"
        in_range_post_ids = []
        files = os.listdir(directory)
        files.sort()
        for filename in files:
            date, time, post_id = filenameParser(filename)
            if date == year + month + day:
                in_range_post_ids.append(post_id)
        return json.dumps(in_range_post_ids)

    
    async def _arun(self,
                    query: str,
                    run_manager: Optional[CallbackManagerForToolRun] = None) -> str:
        raise NotImplementedError("Doesn't support async")

class GetArrowCount(BaseTool):
    """
    Get upvote count of a post
    """
    name = "get_upvote_count"
    description = """
    Input: post_id returned by get_posts_by_date (e.g. M.1672914887.A.04F)
    Output: upvote count
    """
    def _run(self,
             query: str,
             run_manager: Optional[CallbackManagerForToolRun] = None) -> str:
        directory = "./HatePolitics/2023"
        # filename = directory + '/' + query + '.xml'
        filename = findPostByID(query)
        tree = ET.parse(filename)
        root = tree.getroot()
        arrow_count = 0
        for comment in root[1][2:]:
            comment_type = comment.attrib['c_type']
            if comment_type == 'neu':
                arrow_count += 1
        return str(arrow_count)
    
    async def _arun(self,
                    query: str,
                    run_manager: Optional[CallbackManagerForToolRun] = None) -> str:
        raise NotImplementedError("Doesn't support async")
    
class GetDownvoteCount(BaseTool):
    """
    Get upvote count of a post
    """
    name = "get_upvote_count"
    description = """
    Input: post_id returned by get_posts_by_date (e.g. M.1672914887.A.04F)
    Output: upvote count
    """
    def _run(self,
             query: str,
             run_manager: Optional[CallbackManagerForToolRun] = None) -> str:
        directory = "./HatePolitics/2023"
        # filename = directory + '/' + query + '.xml'
        filename = findPostByID(query)
        tree = ET.parse(filename)
        root = tree.getroot()
        downvote_count = 0
        for comment in root[1][2:]:
            comment_type = comment.attrib['c_type']
            if comment_type == 'neg':
                downvote_count += 1
        return str(downvote_count)
    
    async def _arun(self,
                    query: str,
                    run_manager: Optional[CallbackManagerForToolRun] = None) -> str:
        raise NotImplementedError("Doesn't support async")
    
class GetUpvoteCount(BaseTool):
    """
    Get upvote count of a post
    """
    name = "get_upvote_count"
    description = """
    Input: post_id returned by get_posts_by_date (e.g. M.1672914887.A.04F)
    Output: upvote count
    """
    def _run(self,
             query: str,
             run_manager: Optional[CallbackManagerForToolRun] = None) -> str:
        directory = "./HatePolitics/2023"
        # filename = directory + '/' + query + '.xml'
        filename = findPostByID(query)
        tree = ET.parse(filename)
        root = tree.getroot()
        upvote_count = 0
        for comment in root[1][2:]:
            comment_type = comment.attrib['c_type']
            if comment_type == 'pos':
                upvote_count += 1
        return str(upvote_count)
    
    async def _arun(self,
                    query: str,
                    run_manager: Optional[CallbackManagerForToolRun] = None) -> str:
        raise NotImplementedError("Doesn't support async")

In [None]:

class GetPostsTitlesByCrawler(BaseTool):
    """
    Get latest news posts titles from crawler
    Support website: 
        (default): https://news.pts.org.tw/category/1
        https://news.ttv.com.tw/category/%E6%94%BF%E6%B2%BB
    """
    name = "get_posts_titles_by_crawler"
    description = "獲得近期政治新聞標題"
    def _run(
        self,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        posts = crawler.politic_news_crawler('pts', cnt=100)
        titles = [post['title'] for post in posts]
        return json.dumps(titles)
    
    async def _arun(
        self,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        raise NotImplementedError("Doesn't support async")

class GetPostsSummaryByCrawler(BaseTool):
    """
    Get latest news posts summary content from crawler
    Summarize by package sumy using LsaSummarizer
    Support website: 
        (default): https://news.pts.org.tw/category/1
        https://news.ttv.com.tw/category/%E6%94%BF%E6%B2%BB
    """
    name = "get_posts_titles_by_crawler"
    description = "獲得近期政治新聞內文概述"
    LANGUAGE = "chinese"
    tokenizer = Tokenizer(LANGUAGE)

    def summarize(self, contents):
        parser = PlaintextParser.from_string(
            contents,
            self.tokenizer,
        )
        summaries = summarizer
        return summaries
    
    def _run(
        self,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        posts = crawler.politic_news_crawler('pts', cnt=100)
        contents = [post['content'] for post in posts]
        summaries = summaries(contents)
        return json.dumps(summaries)
    
    async def _arun(
        self,
        query: str,
        run_manager: Optional[CallbackManagerForToolRun] = None
    ) -> str:
        raise NotImplementedError("Doesn't support async")