/
rail.ts
137 lines (130 loc) · 5.49 KB
/
rail.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import { Route } from '@/types';
import cache from '@/utils/cache';
import { load } from 'cheerio';
import got from '@/utils/got';
import { parseDate } from '@/utils/parse-date';
import timezone from '@/utils/timezone';
/**
 * Route metadata for 世界轨道交通资讯网 (rail.ally.net.cn).
 *
 * Both path parameters are optional. The handler fetches the site's home page when
 * `category` is omitted, `/html/:category/` when only `category` is given, and
 * `/html/:category/:topic/` when both are given.
 */
export const route: Route = {
// `:category?` and `:topic?` are optional path segments read by the handler via `ctx.req.param()`.
path: '/rail/:category?/:topic?',
categories: ['new-media'],
// NOTE(review): the example carries an `/ally` prefix that is not part of `path`; presumably the namespace mount point — confirm.
example: '/ally/rail/hyzix/chengguijiaotong/',
// category: taken from the site URL; omit it to fetch the home page.
// topic: taken from the site URL; not every page has this segment.
parameters: { category: '分类,可在 URL 中找到;略去则抓取首页', topic: '话题,可在 URL 中找到;并非所有页面均有此字段' },
// All feature flags are off for this route.
features: {
requireConfig: false,
requirePuppeteer: false,
antiCrawler: false,
supportBT: false,
supportPodcast: false,
supportScihub: false,
},
// Site URL patterns for this route: the site root and `/html/<category>/<topic>` pages.
// NOTE(review): no `target` is declared here; presumably resolved elsewhere — confirm against the `Route` type.
radar: [
{
source: ['rail.ally.net.cn/', 'rail.ally.net.cn/html/:category?/:topic?'],
},
],
name: '世界轨道交通资讯网',
maintainers: ['Rongronggg9'],
handler,
url: 'rail.ally.net.cn/',
// Tip text: the first 20 entries are fetched by default; `?limit=` changes that.
description: `:::tip
默认抓取前 20 条,可通过 \`?limit=\` 改变。
:::`,
};
/** Number of entries returned when `?limit=` is absent or not a positive integer. */
const DEFAULT_LIMIT = 20;

/** Article URLs look like `/html/2022/InviteTen_0407/4686.html`: group 1 is the year, group 2 is MMDD. */
const ARTICLE_URL_PATTERN = /\/html\/(\d{4})\/\w+_(\d{4})\/\d+\.html/;

/** A `<div>` whose serialised content is only whitespace and/or `&nbsp;` entities is a visual blank line. */
const BLANK_HTML_PATTERN = /^(?:&nbsp;|\s)*$/;

/** One or more `<br>` tags (with surrounding whitespace) at the very end of the description. */
const TRAILING_BREAKS_PATTERN = /(?:\s*<br ?\/?>)+\s*$/;

/**
 * Extracts the article body from a loaded detail page and normalises its markup.
 *
 * @param $ - cheerio instance loaded with the article page.
 * @returns The article HTML, or an empty string when no body container is found.
 */
function extractDescription($) {
    const content = $('div.content_all');
    let description = '';

    if (content.length) {
        content
            .last() // some pages have a "summary" block first; the body is the last match
            .contents()
            .each((_, child) => {
                const $child = $(child);
                if (child.name === 'div') {
                    const innerHtml = ($child.html() ?? '').trim();
                    if (BLANK_HTML_PATTERN.test(innerHtml)) {
                        // Blank divs act as paragraph spacing; never start the description with a break.
                        description += description ? '<br>' : '';
                    } else {
                        description += innerHtml;
                    }
                } else {
                    // bare text node or something else
                    description += $child.toString().trim();
                }
            });
    } else {
        // Alternative layout, e.g. http://rail.ally.net.cn/html/2022/InviteTen_0407/4686.html
        // `.html()` returns null when nothing matches, so fall back to an empty string.
        description = $('div.content div').first().html() ?? '';
    }

    return description.replace(TRAILING_BREAKS_PATTERN, '');
}

/**
 * Builds the feed for rail.ally.net.cn.
 *
 * Fetches the home page, a category page or a topic page (depending on which route
 * parameters are present), collects article links from it, then fetches every article
 * (through the cache) to obtain its title, body and source.
 *
 * @param ctx - Request context; `ctx.req.param()` supplies `category` / `topic` and
 *              `ctx.req.query('limit')` optionally caps the number of entries.
 * @returns The feed object: `title`, `link`, `item` and `description`.
 */
async function handler(ctx) {
    // Site map for reference: http://rail.ally.net.cn/sitemap.html
    const { category, topic } = ctx.req.param();
    const rootUrl = 'http://rail.ally.net.cn';
    const pageUrl = category ? (topic ? `${rootUrl}/html/${category}/${topic}/` : `${rootUrl}/html/${category}/`) : rootUrl;

    const response = await got.get(pageUrl);
    const $ = load(response.data);

    // Feed title comes from the breadcrumb; empty crumbs are skipped so no dangling " - " is produced.
    const breadcrumb = $('.container .regsiter a') // the class name is misspelled on the site itself
        .get()
        .slice(1) // drop "首页"
        .map((anchor) => $(anchor).text())
        .filter(Boolean)
        .join(' - ');
    const title = breadcrumb || (category && topic ? `${category} - ${topic}` : category) || '首页';

    let links = [
        // list page: http://rail.ally.net.cn/html/lujuzixun/
        $('.left .hynewsO h2 a').get(),
        // multi-sub-topic page: http://rail.ally.net.cn/html/hyzix/
        $('.left .list_content_c').find('.new_hy_focus_con_tit a, .new_hy_list_name a').get(),
        // multi-sub-topic page 2: http://rail.ally.net.cn/html/foster/
        $('.left').find('.nnewslistpic a, .nnewslistinfo dd a').get(),
        // data list page: http://rail.ally.net.cn/html/tongjigongbao/
        $('.left .list_con .datacountTit a').get(),
        // home page: http://rail.ally.net.cn
        $('.container_left').find('dd a, h1 a, ul.slideshow li a').get(),
    ].flat();
    if (links.length === 0) {
        // try aggressively sniffing links, e.g. http://rail.ally.net.cn/html/InviteTen/
        links = $('.left a, .container_left a').get();
    }

    // The query value is a string (or undefined); only a positive integer overrides the default.
    const requestedLimit = Number.parseInt(ctx.req.query('limit') ?? '', 10);
    const limit = requestedLimit > 0 ? requestedLimit : DEFAULT_LIMIT;

    // Keyed by absolute link so the same article matched by several selectors is listed once.
    const itemsByLink = new Map();
    for (const element of links) {
        const $link = $(element);
        const href = $link.attr('href');
        const urlMatch = href && href.match(ARTICLE_URL_PATTERN);
        if (!urlMatch) {
            continue;
        }

        const link = href.startsWith('/') ? `${rootUrl}${href}` : href;
        const text = $link.text();
        const known = itemsByLink.get(link);
        if (known) {
            // Thumbnail anchors carry no text; adopt the title from a later text anchor.
            if (!known.title.trim() && text.trim()) {
                known.title = text;
            }
            continue;
        }

        itemsByLink.set(link, {
            title: text,
            link,
            // The publication date is only available as YYYY + MMDD in the URL; shifted by 8 via `timezone`.
            pubDate: timezone(parseDate(`${urlMatch[1]}${urlMatch[2]}`), 8),
        });
    }

    const candidates = [...itemsByLink.values()]
        // Newest first; Number() makes the comparison explicit.
        .sort((a, b) => Number(b.pubDate) - Number(a.pubDate))
        .slice(0, limit);

    const items = await Promise.all(
        candidates.map((item) =>
            cache.tryGet(item.link, async () => {
                const detailResponse = await got(item.link);
                const $detail = load(detailResponse.data);
                const info = $detail('.content > em span');

                return {
                    title: $detail('.content > h2').text() || item.title,
                    description: extractDescription($detail),
                    // The URL-derived date is used; the on-page timestamp (`info.eq(0)`) is not.
                    pubDate: item.pubDate,
                    author: info
                        .eq(1)
                        .text()
                        .replace(/^来源:/, ''),
                    link: item.link,
                };
            })
        )
    );

    return {
        title: `世界轨道交通资讯网 - ${title}`,
        link: pageUrl,
        item: items,
        description: $('head > meta[name="description"]').attr('content'),
    };
}