In [1]:
import json
import os
import platform
import random
import sqlite3
import time
from datetime import datetime

import pytz
import requests
import schedule
from dotenv import load_dotenv
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Pull the settings stored in the .env file into the process environment
load_dotenv()

# Read the three endpoints used below; each is None when its variable is unset
weibo_url, message_webhook_url, status_webhook_url = (
    os.getenv(name)
    for name in ('WEIBO_URL', 'MESSAGE_WEBHOOK_URL', 'STATUS_WEBHOOK_URL')
)


class WeiboScrapper:
    """Poll a Weibo JSON feed through headless Chrome and relay new posts to Discord.

    Ids of posts that were already handled are kept in the local SQLite file
    ``weibo.db``. New posts go to ``message_webhook_url`` and periodic
    "still running" messages go to ``status_webhook_url`` (both module-level
    values read from the environment).
    """

    def __init__(self):
        """Start the browser, open the id database and load the status texts."""
        # Headless Chrome session used to fetch the feed.
        self.driver = self.new_driver()
        # create a sqlite database to store id
        # change to mongodb later
        self.db = sqlite3.connect('weibo.db')
        self.cursor = self.db.cursor()
        self.cursor.execute('''CREATE TABLE IF NOT EXISTS weibo (id INTEGER PRIMARY KEY)''')
        self.db.commit()
        # JSON text is UTF-8; do not depend on the platform default encoding.
        with open('kawaii_content.json', 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.kawaii_emojis = data['kawaii_emojis']
        self.kawaii_texts = data['kawaii_texts']
        self.kawaii_titles = data['kawaii_titles']

    def new_driver(self):
        """Create and return a headless Chrome WebDriver.

        The chromedriver path is whatever ``ChromeDriverManager().install()``
        returns.
        """
        options = Options()
        options.add_argument('--headless')
        return webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=options,
        )

    def close(self):
        """Quit the browser and close the SQLite connection."""
        try:
            self.driver.quit()
        finally:
            # Close the database even if quitting the browser failed.
            self.db.close()

    def start(self):
        """Run the scraper forever.

        Performs one scan and one status report immediately, then repeats the
        scan every 10 minutes and the status report every hour. This call
        does not return.
        """
        # Run the scan immediately and then every 10 minutes
        self.scan()
        schedule.every(10).minutes.do(self.scan)

        # Run send_status immediately and then every hour
        self.send_status()
        schedule.every(1).hour.do(self.send_status)

        while True:
            schedule.run_pending()
            time.sleep(1)

    def get_weibo_content_once(self):
        """Fetch the feed once and return its list of posts.

        Returns:
            The value stored under ``['data']['list']`` of the JSON document
            shown by the page, or ``None`` when the page could not be loaded
            or the payload does not have that shape.
        """
        # Replace the browser when its driver service is no longer reachable.
        if not self.driver.service.is_connectable():
            self.driver.quit()
            self.driver = self.new_driver()

        try:
            self.driver.get(os.getenv('WEIBO_URL'))
            # Wait for the dynamic content to load
            time.sleep(10)
            self.driver.implicitly_wait(20)
            # The JSON text is read from the page's <pre> element.
            pre_tag = self.driver.find_element(By.TAG_NAME, 'pre')
            json_text = pre_tag.text
        except Exception as e:
            print(e)
            return None

        # A non-JSON page or a document without data/list is reported as a
        # failed fetch (so the caller retries) instead of raising.
        try:
            return json.loads(json_text)['data']['list']
        except (ValueError, KeyError, TypeError) as e:
            print(f'unexpected feed payload: {e!r}')
            return None

    def check_id(self, item):
        """Report whether ``item`` is new, recording its id if so.

        Args:
            item: post record; only ``item['id']`` is read.

        Returns:
            ``True`` when the id was not in the database (it is inserted and
            committed before returning), ``False`` when it was already there.
        """
        weibo_item_id = item['id']
        self.cursor.execute('''SELECT * FROM weibo WHERE id=?''', (weibo_item_id,))
        if self.cursor.fetchone() is not None:
            return False
        # write the id to the database
        self.cursor.execute('''INSERT INTO weibo (id) VALUES (?)''', (weibo_item_id,))
        self.db.commit()
        return True

    def get_weibo_content_loop(self, max_retries=10, retry_delay=60):
        """Fetch the feed, retrying while the result is empty or ``None``.

        Args:
            max_retries: number of additional attempts after the first one.
            retry_delay: seconds to wait between attempts.

        Returns:
            The first non-empty result of ``get_weibo_content_once``, or
            ``None`` when every attempt failed.
        """
        print(f'getting weibo content... @ {datetime.now()}')
        attempt = 0
        while True:
            content = self.get_weibo_content_once()
            if content:
                return content
            attempt += 1
            if attempt > max_retries:
                # Give up right away; waiting again would only delay the caller.
                print('failed')
                return None
            print('retrying...')
            print(attempt)
            time.sleep(retry_delay)

    def scan(self):
        """Fetch the feed and forward every post that has not been seen before.

        NOTE(review): ``check_id`` records the id before the webhook call, so
        a post whose delivery fails is not retried on later scans.
        """
        content = self.get_weibo_content_loop()
        if not content:
            print('failed to get content')
            return None
        for item in content:
            if self.check_id(item):
                self.parse_item(item)
                # Pause between consecutive webhook posts.
                time.sleep(5)

    def parse_item(self, item):
        """Send one post to the message webhook as a Discord embed.

        Args:
            item: post record; ``item['text_raw']`` becomes the embed
                description and ``item['created_at']`` (for example
                ``'Mon Jul 03 17:00:03 +0800 2023'``) its timestamp.

        Returns:
            The HTTP status code of the webhook response, or ``None`` when the
            request itself failed.
        """
        text_raw = item['text_raw']
        created_at = item['created_at']
        dt = datetime.strptime(created_at, '%a %b %d %H:%M:%S %z %Y')
        # Convert to UTC
        utc_dt = dt.astimezone(pytz.UTC)
        # Format as required by Discord
        discord_timestamp = utc_dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        message = {
            "embeds": [{
                "title": "塔菲の新微博喵~",
                "url": "https://weibo.com/7618923072?refer_flag=1001030103_",
                "description": text_raw,
                "color": 16738740,  # 0xFF69B4
                "timestamp": discord_timestamp
            },
            ]
        }
        return self._post_embed(message_webhook_url, message)

    def send_status(self):
        """Send a "script is running" embed to the status webhook.

        The title, emoji and text are picked at random from the lists loaded
        in ``__init__``; the description also carries the current time and
        the host name / machine type.

        Returns:
            The HTTP status code of the webhook response, or ``None`` when the
            request itself failed.
        """
        embed_color = 16738740  # 0xFF69B4
        emoji = random.choice(self.kawaii_emojis)
        text = random.choice(self.kawaii_texts)
        title = random.choice(self.kawaii_titles)
        # platform.uname() is available on every OS, unlike os.uname().
        host = platform.uname()
        machine_info = f"{host.node} {host.machine}"
        # TODO: use chatgpt to generate random text
        # 'Etc/GMT-9' is UTC+9: the Etc/ zone names use the inverted POSIX sign.
        utc_plus_9 = pytz.timezone('Etc/GMT-9')
        # Get current time up to seconds in GMT+9
        time_now = datetime.now(utc_plus_9).strftime('%Y-%m-%d %H:%M:%S %Z')

        message = {
            "embeds": [
                {
                    "title": title,
                    "description": f"{emoji} {text} @ {time_now} -- {machine_info}",
                    "color": embed_color
                }
            ]
        }
        return self._post_embed(status_webhook_url, message)

    def _post_embed(self, webhook_url, payload, timeout=30):
        """POST ``payload`` as JSON to ``webhook_url``.

        Returns the HTTP status code, or ``None`` when the request raised
        (timeout, connection error, invalid URL). Errors are printed rather
        than raised.
        """
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.post(webhook_url, json=payload, timeout=timeout)
        except requests.RequestException as e:
            print(e)
            return None
        return response.status_code

In [2]:

    

scraper = WeiboScrapper()
content = scraper.get_weibo_content_loop()
# get_weibo_content_loop returns None once its retries are exhausted; stop
# here with a clear message instead of failing later when `content` is indexed.
assert content, 'could not fetch any weibo content'



[WDM] - Downloading: 100%|██████████| 6.30M/6.30M [00:00<00:00, 24.2MB/s]


getting weibo content... @ 2023-07-06 23:18:37.990563


In [3]:
# save content to a file
# ensure_ascii=False writes non-ASCII characters verbatim, so the file
# encoding is pinned to UTF-8 instead of the platform default.
with open('weibo_content.json', 'w', encoding='utf-8') as f:
    json.dump(content, f, indent=4, ensure_ascii=False)

In [4]:
# List the field names available on the first post record.
content[0].keys()

dict_keys(['visible', 'created_at', 'id', 'idstr', 'mid', 'mblogid', 'user', 'can_edit', 'text_raw', 'text', 'textLength', 'annotations', 'source', 'favorited', 'rid', 'pic_ids', 'pic_focus_point', 'geo', 'pic_num', 'pic_infos', 'is_paid', 'mblog_vip_type', 'number_display_strategy', 'title_source', 'reposts_count', 'comments_count', 'attitudes_count', 'attitudes_status', 'continue_tag', 'isLongText', 'mlevel', 'content_auth', 'is_show_bulletin', 'comment_manage_info', 'screen_name_suffix_new', 'share_repost_type', 'url_struct', 'isTop', 'mblogtype', 'showFeedRepost', 'showFeedComment', 'pictureViewerSign', 'showPictureViewer', 'rcList', 'region_name', 'customIcons'])

In [8]:
# Show the raw `created_at` string of the first post.
content[0]['created_at']

'Mon Jul 03 17:00:03 +0800 2023'

In [19]:
# Display the whole first post record (an empty-string key is not among the
# record's fields, so subscripting with '' raised KeyError).
content[0]

{'visible': {'type': 0, 'list_id': 0},
 'created_at': 'Tue Jun 20 12:01:01 +0800 2023',
 'id': 4914676545359937,
 'idstr': '4914676545359937',
 'mid': '4914676545359937',
 'mblogid': 'N69NQmumB',
 'user': {'id': 7618923072,
  'idstr': '7618923072',
  'pc_new': 7,
  'screen_name': '永雏塔菲',
  'profile_image_url': 'https://tvax4.sinaimg.cn/crop.0.0.1080.1080.50/008jCdW0ly8hfdxzryxz7j30u00u0gpc.jpg?KID=imgbed,tva&Expires=1688320522&ssig=aZeHk7TMYQ',
  'profile_url': '/u/7618923072',
  'verified': True,
  'verified_type': 0,
  'domain': 'acetaffy',
  'weihao': '',
  'verified_type_ext': 1,
  'avatar_large': 'https://tvax4.sinaimg.cn/crop.0.0.1080.1080.180/008jCdW0ly8hfdxzryxz7j30u00u0gpc.jpg?KID=imgbed,tva&Expires=1688320522&ssig=42vYRwga4j',
  'avatar_hd': 'https://tvax4.sinaimg.cn/crop.0.0.1080.1080.1024/008jCdW0ly8hfdxzryxz7j30u00u0gpc.jpg?KID=imgbed,tva&Expires=1688320522&ssig=0rsc1lAgUN',
  'follow_me': False,
  'following': False,
  'mbrank': 5,
  'mbtype': 12,
  'v_plus': 0,
  'planet

In [10]:
from datetime import datetime, timezone

# Sample `created_at` value in the format returned by the feed
time_string = 'Mon Jul 03 17:00:03 +0800 2023'

# Parse string to datetime object, accounting for the timezone
dt = datetime.strptime(time_string, '%a %b %d %H:%M:%S %z %Y')

# Convert to UTC and format as required by Discord.
# The standard library's timezone.utc is enough here; no third-party tz
# package is needed for a fixed UTC conversion.
discord_timestamp = dt.astimezone(timezone.utc).isoformat("T", "seconds")

print(discord_timestamp)

2023-07-03T09:00:03+00:00


In [11]:
import pytz
from datetime import datetime

# Sample `created_at` value to experiment with
time_string = 'Mon Jul 03 17:00:03 +0800 2023'

# %z picks up the +0800 offset, so the parsed datetime is timezone-aware
dt = datetime.strptime(
    time_string,
    '%a %b %d %H:%M:%S %z %Y',
)

# Same instant, expressed in UTC
utc_dt = dt.astimezone(pytz.utc)

# Render with microseconds and a literal trailing "Z"
discord_timestamp = utc_dt.strftime('%Y-%m-%dT%H:%M:%S.%fZ')

print(discord_timestamp)


2023-07-03T09:00:03.000000Z
