|
1 | 1 | import { load } from 'cheerio'; |
2 | 2 |
|
3 | 3 | import type { DataItem, Route } from '@/types'; |
4 | | -import cache from '@/utils/cache'; |
5 | 4 | import ofetch from '@/utils/ofetch'; |
6 | 5 | import { parseDate } from '@/utils/parse-date'; |
7 | 6 |
|
8 | | -const baseUrl = 'https://www.mckinsey.com'; |
9 | | -const link = `${baseUrl}/featured-insights`; |
10 | | - |
11 | | -const headers = { |
12 | | - 'accept-language': 'en-US,en;q=0.9', |
13 | | - 'user-agent': 'Mozilla/5.0 (compatible; RSSHub; +https://github.com/DIYgod/RSSHub)', |
14 | | -}; |
15 | | - |
16 | | -const datePattern = /(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},\s+\d{4}/; |
17 | | - |
18 | | -function normalizeText(value?: string | null): string { |
19 | | - return value?.replace(/\s+/g, ' ').trim() ?? ''; |
20 | | -} |
21 | | - |
22 | | -function isInsightUrl(url: URL): boolean { |
23 | | - if (url.hostname !== 'www.mckinsey.com') { |
24 | | - return false; |
25 | | - } |
26 | | - |
27 | | - const pathname = url.pathname.toLowerCase(); |
28 | | - |
29 | | - if (pathname === '/featured-insights' || pathname === '/our-insights' || pathname === '/insights') { |
30 | | - return false; |
31 | | - } |
32 | | - |
33 | | - return [ |
34 | | - '/featured-insights/', |
35 | | - '/our-insights/', |
36 | | - '/industries/', |
37 | | - '/capabilities/', |
38 | | - '/quarterly/', |
39 | | - '/mgi/', |
40 | | - '/institute-for-economic-mobility/', |
41 | | - ].some((prefix) => pathname.startsWith(prefix) && !pathname.endsWith('/our-insights')); |
42 | | -} |
43 | | - |
44 | | -function getContainerText($, element): string { |
45 | | - const containers = $(element) |
46 | | - .parents('article, li, div, section') |
47 | | - .toArray() |
48 | | - .map((container) => normalizeText($(container).text())) |
49 | | - .filter((text) => text.length >= 40 && text.length <= 1600); |
50 | | - |
51 | | - return containers[0] ?? normalizeText($(element).parent().text()); |
52 | | -} |
| 7 | +const link = 'https://www.mckinsey.com/insights/rss'; |
| 8 | +const siteUrl = 'https://www.mckinsey.com/insights'; |
53 | 9 |
|
54 | 10 | const handler: Route['handler'] = async () => { |
55 | | - const data = await ofetch(link, { headers }); |
56 | | - const $ = load(data); |
57 | | - const seen = new Set<string>(); |
58 | | - |
59 | | - const item = $('a[href]') |
| 11 | + const data = await ofetch(link, { |
| 12 | + headers: { |
| 13 | + accept: 'application/rss+xml, application/xml, text/xml;q=0.9, */*;q=0.8', |
| 14 | + 'accept-language': 'en-US,en;q=0.9', |
| 15 | + 'user-agent': 'Mozilla/5.0 (compatible; RSSHub; +https://github.com/DIYgod/RSSHub)', |
| 16 | + }, |
| 17 | + retry: 1, |
| 18 | + timeout: 15000, |
| 19 | + }); |
| 20 | + |
| 21 | + const $ = load(data, { xmlMode: true }); |
| 22 | + |
| 23 | + const item = $('item') |
60 | 24 | .toArray() |
61 | | - .map((element) => { |
62 | | - const $element = $(element); |
63 | | - const rawHref = $element.attr('href'); |
64 | | - const title = normalizeText($element.attr('title') || $element.text()); |
65 | | - |
66 | | - if (!rawHref || title.length < 12 || /^more\b/i.test(title) || /^download\b/i.test(title)) { |
67 | | - return; |
68 | | - } |
69 | | - |
70 | | - const itemUrl = new URL(rawHref, baseUrl); |
71 | | - itemUrl.hash = ''; |
72 | | - |
73 | | - if (!isInsightUrl(itemUrl) || seen.has(itemUrl.href)) { |
74 | | - return; |
75 | | - } |
76 | | - |
77 | | - seen.add(itemUrl.href); |
78 | | - |
79 | | - const containerText = getContainerText($, element); |
80 | | - const date = containerText.match(datePattern)?.[0]; |
81 | | - const description = normalizeText(containerText.replace(title, '').replace(date ?? '', '')).slice(0, 800); |
| 25 | + .map((entry) => { |
| 26 | + const $entry = $(entry); |
| 27 | + const categories = $entry |
| 28 | + .find('category') |
| 29 | + .toArray() |
| 30 | + .map((category) => $(category).text().trim()) |
| 31 | + .filter(Boolean); |
82 | 32 |
|
83 | 33 | return { |
84 | | - title, |
85 | | - link: itemUrl.href, |
86 | | - description: description || title, |
87 | | - pubDate: date ? parseDate(date) : undefined, |
| 34 | + title: $entry.find('title').first().text().trim(), |
| 35 | + link: $entry.find('link').first().text().trim(), |
| 36 | + description: $entry.find('description').first().text().trim(), |
| 37 | + pubDate: parseDate($entry.find('pubDate').first().text().trim()), |
| 38 | + guid: $entry.find('guid').first().text().trim(), |
| 39 | + category: categories, |
88 | 40 | }; |
89 | 41 | }) |
90 | | - .filter(Boolean) |
91 | | - .slice(0, 30) as DataItem[]; |
92 | | - |
93 | | - const cachedItems = await Promise.all( |
94 | | - item.map((entry) => |
95 | | - cache.tryGet(`mckinsey:insights:${entry.link}`, async () => entry) |
96 | | - ) |
97 | | - ); |
| 42 | + .filter((entry) => entry.title && entry.link) |
| 43 | + .slice(0, 50) as DataItem[]; |
98 | 44 |
|
99 | 45 | return { |
100 | | - title: 'McKinsey Insights', |
101 | | - link, |
| 46 | + title: $('channel > title').first().text().trim() || 'McKinsey Insights', |
| 47 | + link: $('channel > link').first().text().trim() || siteUrl, |
| 48 | + description: $('channel > description').first().text().trim(), |
102 | 49 | language: 'en-US', |
103 | | - item: cachedItems, |
| 50 | + item, |
104 | 51 | }; |
105 | 52 | }; |
106 | 53 |
|
|
0 commit comments