In [1]:
import sys
sys.path.append('../')

In [2]:
from urllib.parse import urljoin
from config import Config
import requests
from bs4 import BeautifulSoup
# import pandas as pd
import time
import json


class Crawler:
    """Scrape document listings from web pages described by locator configs.

    A source is described by an ``html_locator`` (the chain of BeautifulSoup
    lookups that leads to the rows of a listing) and an ``info_locator`` (how
    to extract each field from one row).  This class fetches exactly one page
    per source; it does not follow pagination.
    """

    def __init__(self) -> None:
        pass

    def fetch_data(self, url, encoding):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address of the page to download.
            encoding: Encoding to force on the response.  A falsy value keeps
                the encoding detected by ``requests``.

        Returns:
            The decoded response body.  The HTTP status is not checked, so
            error pages are returned as text as well.
        """
        time.sleep(Config.SLEEP_TIME)  # throttle requests to avoid being blocked
        headers = {"User-Agent": Config.USER_AGENT, 'Cookie': Config.COOKIE}
        # Without a timeout one stalled server would hang the whole crawl.
        # ``Config.REQUEST_TIMEOUT`` is optional; 30 seconds is the fallback.
        timeout = getattr(Config, 'REQUEST_TIMEOUT', 30)
        response = requests.get(url, headers=headers, timeout=timeout)
        if encoding:
            response.encoding = encoding
        return response.text

    def fetch_file(self, url, html_locator, encoding):
        """Download ``url`` and narrow the parsed page with ``html_locator``.

        Args:
            url: Address of the listing page.
            html_locator: Sequence of dicts, applied in order, each with the
                keys ``method`` (``'find'`` or ``'find_all'``), ``args`` (the
                first positional argument of that call) and, optionally,
                ``kwargs`` (extra keyword filters).  Steps with any other
                ``method`` are ignored.
            encoding: Passed through to :meth:`fetch_data`.

        Returns:
            The result of the last step (the list of matching tags when that
            step is ``find_all``), or ``None`` as soon as a step matches
            nothing.
        """
        bs = BeautifulSoup(self.fetch_data(url, encoding), 'html.parser')

        for operation in html_locator:
            method = operation['method']
            args = operation['args']
            kwargs = operation.get('kwargs') or {}

            if method == 'find':
                bs = bs.find(args, **kwargs)
            elif method == 'find_all':
                bs = bs.find_all(args, **kwargs)

            # A failed ``find`` yields None; stop instead of crashing on the
            # next step with "'NoneType' object has no attribute 'find'".
            if bs is None:
                return None

        return bs

    def get_info(self, bs_result, url, method, args, kargs, args2):
        """Extract one field from a single listing row.

        Args:
            bs_result: The row element to search in.
            url: Page address, used as the base for resolving ``href`` values.
            method: ``'find'`` to look up a child element; any other value
                (including ``None``) means the field is not available.
            args: First positional argument for ``find`` (e.g. a tag name).
            kargs: Optional dict of keyword filters for ``find``.
            args2: What to read from the located element: ``'text'`` for its
                stripped text, ``'href'`` for its link resolved against
                ``url``, anything else for that attribute's raw value.

        Returns:
            The extracted value, or ``None`` when ``method`` is not
            ``'find'``, the element is not found, or the attribute is absent.
        """
        if method != 'find':
            return None

        if kargs is None:
            node = bs_result.find(args)
        else:
            node = bs_result.find(args, **kargs)

        # One malformed row must not abort the whole crawl.
        if node is None:
            return None

        if args2 == 'text':
            return node.text.strip()

        value = node.get(args2)
        if value is None:
            return None
        if args2 == 'href':
            return urljoin(url, value)
        return value

    def fetch_file_list(self, url, encoding, html_locator, info_locator , source_name, file_type = None, file_type_2 = None):
        """Fetch one listing page and return its rows as a list of dicts.

        Args:
            url: Address of the listing page.
            encoding: Passed through to :meth:`fetch_data`.
            html_locator: Lookup chain that yields the rows (see
                :meth:`fetch_file`).
            info_locator: Sequence of dicts with the keys ``info`` (name of
                the output field), ``method``, ``args`` and ``args2`` (see
                :meth:`get_info`).  ``args`` may be a two-item list
                ``[name, keyword_filters]``.
            source_name: Stored under ``'source'`` in every record.
            file_type: Stored under ``'file_type'`` in every record.
            file_type_2: Stored under ``'file_type_2'`` in every record.

        Returns:
            One dict per row.  An empty list when the rows cannot be located.
        """
        bs_result = self.fetch_file(url, html_locator, encoding)
        if bs_result is None:
            return []

        result = []

        for row in bs_result:
            entry = {}

            for operation in info_locator:
                args = operation['args']
                kargs = None

                if isinstance(args, list):
                    args, kargs = args[0], args[1]

                entry[operation['info']] = self.get_info(
                    row, url, operation['method'], args, kargs, operation['args2']
                )

            # A time split over exactly two lines is stored second-line first,
            # joined with '-'.  ``get`` tolerates locators without a 'time'
            # field.
            raw_time = entry.get('time')
            if raw_time is not None and raw_time.count('\n') == 1:
                parts = raw_time.split('\n')
                entry['time'] = f'{parts[1]}-{parts[0]}'

            entry['source'] = source_name
            entry['file_type'] = file_type
            entry['file_type_2'] = file_type_2
            result.append(entry)

        return result

    def generate_file_list(self):
        """Crawl every source listed in ``data.json`` and merge the records.

        Each entry of ``data.json`` must provide ``url``, ``encoding``,
        ``html_locator``, ``info_locator``, ``title``, ``dtype`` and
        ``dtype2``.

        Returns:
            A single list with the records of all sources, in file order.
        """
        file_list = []
        # JSON is UTF-8 by specification; do not depend on the platform
        # default encoding.
        with open('data.json', 'r', encoding='utf-8') as json_file:
            data_list = json.load(json_file)

        for data in data_list:
            file_list += self.fetch_file_list(
                data['url'],
                data['encoding'],
                data['html_locator'],
                data['info_locator'],
                data['title'],
                data['dtype'],
                data['dtype2'],
            )

        return file_list
    
# print(Crawler().generate_file_list())

In [113]:
import pandas as pd

class Crawler_test:
    """Scratch crawler hard-wired to a single list-page layout.

    The rows are the ``<tr class="light">`` elements of the table whose
    ``frag`` attribute is ``窗口1``.  The raw rows are printed for inspection
    before they are converted to records.
    """

    def __init__(self) -> None:
        pass

    def fetch_data(self, url):
        """Download ``url`` and return its body decoded as UTF-8 text."""
        time.sleep(Config.SLEEP_TIME)  # pause between requests to avoid being blocked
        request_headers = {"User-Agent": Config.USER_AGENT, 'Cookie': Config.COOKIE}
        reply = requests.get(url, headers=request_headers)
        reply.encoding = 'utf-8'
        return reply.text

    def fetch_file(self, url):
        """Return the list of row tags found on the page at ``url``."""
        soup = BeautifulSoup(self.fetch_data(url), 'html.parser')
        table = soup.find('table', frag = '窗口1')
        return table.find_all('tr', class_ = 'light')

    def fetch_file_list(self, url, source_name, file_type = None, file_type_2 = None):
        """Turn every row of the page at ``url`` into a record dict.

        Each record holds ``title``, ``url``, ``time``, ``source``,
        ``file_type`` and ``file_type_2``.  ``time`` is the stripped text of
        the whole row; when that text spans exactly two lines the lines are
        swapped and joined with ``-``.
        """
        rows = self.fetch_file(url)

        # Debug aid: dump the raw rows before converting them.
        print(rows)

        records = []
        for row in rows:
            record = {}
            record['title'] = row.find('td', width='80%').text.strip()
            record['url'] = urljoin(url, row.find('a')['href'])

            stamp = row.text.strip()
            if stamp.count('\n') == 1:
                first_line, second_line = stamp.split('\n')
                stamp = f'{second_line}-{first_line}'
            record['time'] = stamp

            record['source'] = source_name
            record['file_type'] = file_type
            record['file_type_2'] = file_type_2
            records.append(record)

        return records
    
# Manual check (needs network access): crawl one list page with the hard-wired
# Crawler_test locators, labelling the records with source '超算中心' and file
# type '常见使用问题', and print the resulting records.
print(Crawler_test().fetch_file_list('https://scc.ustc.edu.cn/409/list1.psp', '超算中心', '常见使用问题'))

[<tr class="light" valign="top">
<td align="right" valign="top" width="15"><img oldid="613" oldsrc="/_upload/tpl/00/1a/26/template26/res/ydot.gif" related="1" src="/_upload/tpl/00/1a/26/template26/res/ydot.gif"/> </td>
<td width="80%"><a href="https://scc.ustc.edu.cn/zlsc/user_doc/html" target="_blank" title="用户使用手册[html版]">用户使用手册[html版]</a></td>
<td align="right">2021-03-07</td></tr>, <tr class="light" valign="top">
<td align="right" valign="top" width="15"><img oldid="613" oldsrc="/_upload/tpl/00/1a/26/template26/res/ydot.gif" related="1" src="/_upload/tpl/00/1a/26/template26/res/ydot.gif"/> </td>
<td width="80%"><a href="https://scc.ustc.edu.cn/zlsc/faq/" target="" title="常见使用问题">常见使用问题</a></td>
<td align="right">2018-10-01</td></tr>, <tr class="light" valign="top">
<td align="right" valign="top" width="15"><img oldid="613" oldsrc="/_upload/tpl/00/1a/26/template26/res/ydot.gif" related="1" src="/_upload/tpl/00/1a/26/template26/res/ydot.gif"/> </td>
<td width="80%"><a href="/2018/092

In [130]:
import pandas as pd
from tqdm import tqdm

# 翻页功能部分函数, 在教务处网站上已经基本实现翻页功能

class Crawler_2:
    """Config-driven crawler for document listings, with pagination support.

    A source is described by an ``html_locator`` (the chain of BeautifulSoup
    lookups that leads to the rows of a listing) and an ``info_locator`` (how
    to extract each field from one row).  Sources flagged with ``flip`` are
    crawled page by page through a ``{page_num}`` placeholder in their URL.
    """

    def __init__(self) -> None:
        pass

    def fetch_data(self, url, encoding):
        """Fetch ``url`` and return the response body as text.

        Args:
            url: Address of the page to download.
            encoding: Encoding to force on the response.  A falsy value keeps
                the encoding detected by ``requests``.

        Returns:
            The decoded response body.  The HTTP status is deliberately not
            checked: pagination stops when a page yields no rows, so error
            pages must come back as ordinary text instead of raising.
        """
        time.sleep(Config.SLEEP_TIME)  # throttle requests to avoid being blocked
        headers = {"User-Agent": Config.USER_AGENT, 'Cookie': Config.COOKIE}
        # Without a timeout one stalled server would hang the whole crawl.
        # ``Config.REQUEST_TIMEOUT`` is optional; 30 seconds is the fallback.
        timeout = getattr(Config, 'REQUEST_TIMEOUT', 30)
        response = requests.get(url, headers=headers, timeout=timeout)
        if encoding:
            response.encoding = encoding
        return response.text

    def fetch_file(self, url, html_locator, encoding):
        """Download ``url`` and narrow the parsed page with ``html_locator``.

        Args:
            url: Address of the listing page.
            html_locator: Sequence of dicts, applied in order, each with the
                keys ``method`` (``'find'`` or ``'find_all'``), ``args`` (the
                first positional argument of that call) and, optionally,
                ``kwargs`` (extra keyword filters).  Steps with any other
                ``method`` are ignored.
            encoding: Passed through to :meth:`fetch_data`.

        Returns:
            The result of the last step (the list of matching tags when that
            step is ``find_all``), or ``None`` as soon as a step matches
            nothing.
        """
        bs = BeautifulSoup(self.fetch_data(url, encoding), 'html.parser')

        for operation in html_locator:
            method = operation['method']
            args = operation['args']
            kwargs = operation.get('kwargs') or {}

            if method == 'find':
                bs = bs.find(args, **kwargs)
            elif method == 'find_all':
                bs = bs.find_all(args, **kwargs)

            if bs is None:
                return None

        return bs

    def get_info(self, bs_result, url, method, args, kargs, args2):
        """Extract one field from a single listing row.

        Args:
            bs_result: The row element.
            url: Page address, used as the base for resolving ``href`` values.
            method: ``'find'`` to read from a child element located with
                ``args``/``kargs``; ``None`` to read from the row itself.
                Any other value yields ``None``.
            args: First positional argument for ``find`` (e.g. a tag name).
            kargs: Optional dict of keyword filters for ``find``.
            args2: What to read: ``'text'`` for the stripped text, ``'href'``
                for the link resolved against ``url``.  With
                ``method == 'find'`` any other value is read as a raw
                attribute; with ``method is None`` any other value yields
                ``None``.

        Returns:
            The extracted value, or ``None`` when the element or attribute is
            missing or the combination of ``method``/``args2`` is unsupported.
        """
        if method == 'find':
            if kargs is None:
                node = bs_result.find(args)
            else:
                node = bs_result.find(args, **kargs)
        elif method is None:
            node = bs_result
        else:
            return None

        # One malformed row must not abort the whole crawl.
        if node is None:
            return None

        if args2 == 'text':
            return node.text.strip()

        if args2 == 'href':
            href = node.get(args2)
            return None if href is None else urljoin(url, href)

        if method is None:
            # Reading from the row itself only supports 'text' and 'href'.
            return None

        return node.get(args2)

    @staticmethod
    def _normalize_entry(entry):
        """Clean up the ``time`` and ``title`` fields of ``entry`` in place."""
        raw_time = entry.get('time')
        title = entry.get('title')

        # Missing or empty times are left untouched.
        if raw_time:
            if raw_time.count('\n') == 1:
                # Two-line time: store the second line first, joined by '-'.
                parts = raw_time.split('\n')
                raw_time = f'{parts[1]}-{parts[0]}'
            elif '发布时间' in raw_time:
                # Keep the 10 characters that follow a 5-character prefix.
                raw_time = raw_time[5:15]
            elif raw_time[0] == '[':
                # Drop the first and last character, e.g. '[2021-03-07]'.
                raw_time = raw_time[1:-1]
            elif raw_time.count('年') == 1 and raw_time.count('月') == 1 and raw_time.count('日') == 1:
                raw_time = raw_time.replace('年', '-').replace('月', '-').replace('日', '')
            entry['time'] = raw_time

        # If the time still contains the title (a quirk of the supercomputing
        # centre site), remove the title and the one character left behind
        # the date.  An empty title is skipped: '' is contained in every
        # string and would otherwise chop the last digit off a valid date.
        if title and raw_time and title in raw_time:
            entry['time'] = raw_time.replace(title, '').strip()[:-1]

        # If the title ends in a bracketed 'xxxx-xx-xx' date (School of
        # Software site quirk), move the date, without brackets, to the time
        # field and cut the 13 trailing characters from the title.
        # NOTE(review): only the number of '-' is checked, so a title whose
        # tail happens to contain two hyphens is rewritten as well.
        if title and title[-11:-1].count('-') == 2:
            entry['time'] = title[-11:-1]
            entry['title'] = title[:-13]

    def fetch_file_list(self, url, encoding, html_locator, info_locator , source_name, file_type = None, file_type_2 = None):
        """Fetch one listing page and return its rows as a list of dicts.

        Args:
            url: Address of the listing page.
            encoding: Passed through to :meth:`fetch_data`.
            html_locator: Lookup chain that yields the rows (see
                :meth:`fetch_file`).
            info_locator: Sequence of dicts with the keys ``info`` (name of
                the output field), ``method``, ``args`` and ``args2`` (see
                :meth:`get_info`).  ``args`` may be a two-item list
                ``[name, keyword_filters]``.
            source_name: Stored under ``'source'`` in every record.
            file_type: Stored under ``'file_type'`` in every record.
            file_type_2: Stored under ``'file_type_2'`` in every record.

        Returns:
            One dict per row, or ``None`` when the rows cannot be located on
            the page.
        """
        # Unusual pagination of the Institute of Advanced Technology site:
        # its first page is index.html rather than index_1.html.
        if url == 'https://iat.ustc.edu.cn/iat/x161/index_1.html':
            url = 'https://iat.ustc.edu.cn/iat/x161/index.html'

        bs_result = self.fetch_file(url, html_locator, encoding)

        if bs_result is None:
            return None

        result = []

        for row in bs_result:
            entry = {}

            for operation in info_locator:
                args = operation['args']
                kargs = None

                if isinstance(args, list):
                    args, kargs = args[0], args[1]

                entry[operation['info']] = self.get_info(
                    row, url, operation['method'], args, kargs, operation['args2']
                )

            self._normalize_entry(entry)

            entry['source'] = source_name
            entry['file_type'] = file_type
            entry['file_type_2'] = file_type_2
            result.append(entry)

        return result

    def generate_file_list(self):
        """Crawl every source listed in ``data.json`` and merge the records.

        Each entry of ``data.json`` must provide ``url``, ``encoding``,
        ``html_locator``, ``info_locator``, ``title``, ``dtype``, ``dtype2``
        and ``flip``.  When ``flip`` is truthy, ``{page_num}`` in the URL is
        replaced by 1, 2, ... 99 until a page yields no records.

        Returns:
            A single list with the records of all sources.
        """
        file_list = []
        # JSON is UTF-8 by specification; do not depend on the platform
        # default encoding.
        with open('data.json', 'r', encoding='utf-8') as json_file:
            data_list = json.load(json_file)

        for data in tqdm(data_list, desc='Processing', unit='item'):
            url = data['url']
            encoding = data['encoding']
            html_locator = data['html_locator']
            info_locator = data['info_locator']
            source_name = data['title']
            file_type = data['dtype']
            file_type_2 = data['dtype2']
            flip = data['flip']

            if not flip:
                page = self.fetch_file_list(url, encoding, html_locator, info_locator, source_name, file_type, file_type_2)
                # fetch_file_list returns None for an unlocatable page;
                # ``file_list += None`` would raise TypeError.
                if page:
                    file_list += page
                continue

            for page_num in range(1, 100):
                flip_url = url.replace('{page_num}', str(page_num))
                flip_result = self.fetch_file_list(flip_url, encoding, html_locator, info_locator, source_name, file_type, file_type_2)
                if not flip_result:
                    break
                file_list += flip_result

        return file_list

    def generate_file_csv(self):
        """Run the full crawl, write ``file_list.csv`` and return the DataFrame."""
        file_list = self.generate_file_list()
        df = pd.DataFrame(file_list)
        df.to_csv('file_list.csv', index=False)
        return df

# Run the full crawl over every source in data.json (needs network access) and
# write the merged records to file_list.csv; the returned DataFrame is the
# value of this expression.
Crawler_2().generate_file_csv()

Processing: 100%|██████████| 91/91 [09:14<00:00,  6.09s/item]


Unnamed: 0,title,url,time,source,file_type,file_type_2
0,大数据学院学生离校请销假管理规定,http://sds.ustc.edu.cn/2021/1111/c15443a532371...,2022-11-28,大数据学院,,
1,公务用车平台设置使用手册,http://sds.ustc.edu.cn/2022/0628/c15443a561056...,2022-06-28,大数据学院,,
2,主题团日活动项目审批表,http://sds.ustc.edu.cn/2022/0519/c15443a555203...,2022-05-19,大数据学院,,
3,主题党日活动项目审批表,http://sds.ustc.edu.cn/2022/0519/c15443a555202...,2022-05-19,大数据学院,,
4,大数据学院网站信息发布审核制度,http://sds.ustc.edu.cn/2021/1122/c15443a535477...,2021-11-22,大数据学院,,
...,...,...,...,...,...,...
1959,新支部成立“三上三下”（各党总支所属党支部为例）,https://ses.ustc.edu.cn/2022/1009/c28343a57461...,2022-10-09,工程科学学院,,
1960,支部换届“三上三下”（样例）,https://ses.ustc.edu.cn/2022/1009/c28343a57460...,2022-10-09,工程科学学院,,
1961,出国（境）党员保留、恢复组织关系审批表,https://ses.ustc.edu.cn/2022/1009/c28343a57460...,2022-10-09,工程科学学院,,
1962,支部日常工作常用材料模板,https://ses.ustc.edu.cn/2022/1009/c28343a57460...,2022-10-09,工程科学学院,,
