In [3]:
import html
import os
import time
import warnings
from datetime import datetime
from urllib.parse import urljoin

import ciso8601
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Global settings
warnings.filterwarnings(action="ignore")  # Silence every warning category.
pd.set_option("display.max_rows", 10)  # Show at most 10 rows of a DataFrame.
np.set_printoptions(suppress=True)  # Print small floats without scientific notation.

In [None]:
# Scrape every headline, date and article URL from the "天职要闻" news listing.

items = []  # Raw text of each list entry (title followed by its date).
links = []  # Article URL of the entry at the same position in `items`.

# Pages to crawl: the landing page first, then archive pages 153 down to 1.
urls = ["https://www.tjtc.edu.cn/index/tzyw.htm"]
urls.extend(
    f"https://www.tjtc.edu.cn/index/tzyw/{i}.htm" for i in range(153, 0, -1))

# The request headers never change, so build them once outside the loop.
headers = {
    'user-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

for j, url in enumerate(urls, start=1):
    try:
        # NOTE(review): verify=False disables TLS certificate verification.
        # It is kept to preserve the existing behaviour; re-enable it if the
        # site's certificate chain validates.
        # The timeout stops a stalled connection from hanging the whole crawl.
        page = requests.get(url, headers=headers, verify=False, timeout=30)
        page.raise_for_status()
    except requests.RequestException as exc:
        # Report and skip the page instead of parsing an error document.
        print(f'Skipped {url}: {exc}')
    else:
        page.encoding = "utf-8"

        # Entries are looked up by the element ids line_u9_0 .. line_u9_14.
        soup = BeautifulSoup(page.text, 'html.parser')
        for i in range(15):
            entry = soup.find('li', id=f"line_u9_{i}")
            # Skip missing entries and entries without a link so that
            # `items` and `links` always stay the same length.
            if entry is None or entry.a is None or not entry.a.get('href'):
                continue
            items.append(entry.text.strip())
            # Resolve the href against the page URL rather than slicing a
            # fixed number of trailing characters, which only works while
            # every article path has exactly the same length.
            links.append(urljoin(url, entry.a['href']))
    if j % 10 == 0:
        print(f'Completed {j} pages.')
print(f'Completed all {j} pages.')

Completed 10 pages.
Completed 20 pages.
Completed 30 pages.
Completed 40 pages.


In [None]:
# Sanity check: show how many entries were collected in `items`.
len(items)

In [9]:
# Split every scraped entry into a title and a date, build the news table,
# sort it newest-first and save it as a CSV file.
titles = []
dates = []
timestamps = []

for item in items:
    # The last 10 characters of an entry are taken as its date.
    date = item[-10:]
    # Cut the date off by position. Replacing the date string would also
    # delete any identical text inside the title itself, and would leave the
    # whitespace that separated title and date behind.
    title = item[:-10].rstrip()
    dates.append(date)
    titles.append(title)

    # Convert to an integer Unix timestamp (interpreted in the local time
    # zone by time.mktime) so rows can be ordered chronologically.
    timestamp = ciso8601.parse_datetime(date)
    timestamps.append(int(time.mktime(timestamp.timetuple())))

# Collect date, title and URL in one table, indexed by timestamp.
news = pd.DataFrame({'日期': dates, '标题': titles, '网址': links},
                    index=timestamps)

# sort_index returns a new frame, so the result has to be assigned; calling
# it without assignment leaves the table unsorted. A stable sort keeps
# same-day entries in the order they were scraped.
news = news.sort_index(ascending=False, kind="mergesort")
news = news.reset_index(drop=True)  # Replace the timestamps with 0..n-1.
news.index += 1  # Number the rows starting from 1.

# Save the table as CSV. utf-8-sig writes a byte-order mark at the start.
path = r"D:\天津职业大学新闻一览.csv"
news.to_csv(path, encoding="utf-8-sig")

In [9]:
def make_clickable(val):
    """Render a URL as an HTML anchor that opens in a new browser tab.

    The value is HTML-escaped before being placed in the ``href`` attribute
    and in the link text. The URLs come from scraped pages, so a value
    containing ``"``, ``<``, ``>`` or ``&`` must not be able to break out of
    the attribute or inject markup.

    Args:
        val: The URL to link to; it is converted with ``str`` first.

    Returns:
        An ``<a target="_blank" ...>`` element as a string, using the
        escaped URL both as the link target and as the visible text.
    """
    safe = html.escape(str(val), quote=True)
    # target="_blank" makes the browser open the link in a new tab.
    return f'<a target="_blank" href="{safe}">{safe}</a>'


search = "校长"

# Build a boolean mask marking the titles that contain `search`
# (case-insensitive), select those rows with .loc, and render the URL column
# as clickable links.
title_matches = news['标题'].str.contains(search, case=False)
news.loc[title_matches].style.format({'网址': make_clickable})

Unnamed: 0,日期,标题,网址
4,2022-07-27,郑清春校长带队前往北京易华录信息技术股份有限公司考察交流,https://www.tjtc.edu.cn/info/1020/4382.htm
5,2022-07-21,校党委副书记、校长郑清春带队赴蓟州区共商区校合作,https://www.tjtc.edu.cn/info/1020/4381.htm
9,2022-07-13,2022职业院校卓越校长培训班顺利开班,https://www.tjtc.edu.cn/info/1020/4377.htm
11,2022-07-07,校党委副书记、校长郑清春带领汽车工程学院领导班子到中国长安汽车集团天津销售有限公司开展访企拓...,https://www.tjtc.edu.cn/info/1020/4363.htm
14,2022-07-04,校党委副书记、校长郑清春带领汽车工程学院领导班子到林肯（中国）天津德润汽车销售服务有限公司开...,https://www.tjtc.edu.cn/info/1020/4355.htm
15,2022-07-02,郑清春校长为马克思主义学院国培项目培训班作专题讲座,https://www.tjtc.edu.cn/info/1020/4357.htm
22,2022-06-29,副校长杨荣敏深入天津滨海环球印务有限公司开展“访企拓岗促就业”工作,https://www.tjtc.edu.cn/info/1020/4350.htm
39,2022-06-08,校党委副书记、校长郑清春带领汽车学院到中汽数据有限公司开展访企拓岗调研活动,https://www.tjtc.edu.cn/info/1020/4326.htm
71,2022-05-09,李树岭副校长带队到天津渤化化工发展有限公司调研——开展“访企拓岗”促就业专项行动,https://www.tjtc.edu.cn/info/1020/4263.htm
108,2022-03-27,市委决定郑清春任天津职业大学党委副书记、校长,https://www.tjtc.edu.cn/info/1020/4207.htm


In [6]:
# Download the 2022 budget: fetch the page, then save every image on it that
# carries the attribute vsbhref="vurl".
url = "https://xxgk.tjtc.edu.cn/info/1017/1438.htm"
headers = {
    'user-agent':
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
# NOTE(review): verify=False disables TLS certificate verification. It is
# kept to preserve the existing behaviour.
page = requests.get(url, headers=headers, verify=False, timeout=30)
page.raise_for_status()  # Fail early instead of parsing an error page.
page.encoding = "utf-8"
soup = BeautifulSoup(page.text, 'html.parser')
sources = soup.find_all('img', attrs={'vsbhref': "vurl",})

# Create the output folder first; open() does not create missing directories.
out_dir = "D:\\TJVI2022_BUDGET"
os.makedirs(out_dir, exist_ok=True)

for i, img in enumerate(sources):
    # Resolve the src against the page URL so both site-relative and
    # absolute image addresses work.
    image_url = urljoin(url, img['src'].strip())
    # stream=True lets iter_content read the body in chunks instead of
    # loading it all into memory first.
    response = requests.get(image_url, headers=headers, verify=False,
                            stream=True, timeout=30)
    # Do not save an HTTP error page under a .png name.
    response.raise_for_status()
    path = f"{out_dir}\\{i+1}.png"  # Files are numbered from 1.
    with open(path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)