/
common.ts
117 lines (96 loc) · 3.86 KB
/
common.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import { JSDOM } from 'jsdom';
import { byteLength, countBy, walkDOM } from 'web-utility';
import { CSSSelectorPrecision, getCSSSelector, sameParentOf } from '../utility';
import { Agenda, AgendaCrawler } from './core';
export const TimePattern = /\d{1,2}\s*[::]\s*\d{2}/;
const HeadingSelector = `h1, h2, h3, h4, h5, h6, strong, b`;
export abstract class CommonAgendaCrawler extends AgendaCrawler {
document?: Document;
override async *getList(URI: string) {
const {
window: { document }
} = await JSDOM.fromURL(URI);
this.document = document;
const timeBoxes = Array.from(
walkDOM<Text>(document.body, 3),
({ nodeValue, parentElement }) =>
TimePattern.test(nodeValue) && {
selector: getCSSSelector(
parentElement,
document.body,
CSSSelectorPrecision.Medium
)
}
).filter(Boolean);
const timeBoxCount = countBy(timeBoxes, ({ selector }) => selector);
const [[agendaTimeSelector]] = Object.entries(timeBoxCount).sort(
([, a], [, b]) => b - a
);
const [first, second] = document.querySelectorAll(agendaTimeSelector);
const agendaBox = sameParentOf(first, second) as Element;
const agendaBoxSelector = getCSSSelector(
agendaBox,
document.body,
CSSSelectorPrecision.High
);
for (let i = 0; i < agendaBox.childElementCount; i++)
if (agendaBox.tagName.toLowerCase() === 'tbody')
yield* this.getItems(
agendaBox.children[i] as HTMLTableRowElement
);
else
yield await this.getItem(
`${agendaBoxSelector} > :nth-child(${i + 1})`
);
this.document = undefined;
}
override async getItem(selector: string): Promise<Agenda> {
const agendaItem = this.document?.querySelector(selector);
if (!agendaItem) return {};
let time = '';
const [head, body] = Array.from(walkDOM<Text>(agendaItem, 3)).reduce(
(group, { parentElement, nodeValue }) => {
const isHeading =
parentElement.matches(HeadingSelector) ||
!!parentElement.closest(HeadingSelector);
if (TimePattern.test(nodeValue)) time = nodeValue.trim();
else group[isHeading ? 0 : 1].push(nodeValue.trim());
return group;
},
[[], []] as string[][]
);
const [startTime, endTime] = time.split(/[^\d::]+/),
[name, title] = head.sort((a, b) => byteLength(a) - byteLength(b)),
[position, summary] = body.sort(
(a, b) => byteLength(a) - byteLength(b)
),
avatar =
agendaItem.querySelector<HTMLImageElement>('img[src]')?.src;
return {
mentor: { name, position, avatar },
title,
summary,
startTime,
endTime
};
}
protected getItems({ children }: HTMLTableRowElement): Agenda[] {
const [time, ...agendas] = [...children];
const [startTime, endTime] = time.textContent.trim().split(/[^\d::]+/);
return agendas.map(agendaItem => {
const [name, position, title, summary] = agendaItem.textContent
.trim()
.split('\n')
.sort((a, b) => byteLength(a) - byteLength(b)),
avatar =
agendaItem.querySelector<HTMLImageElement>('img[src]')?.src;
return {
mentor: { name, position, avatar },
title,
summary,
startTime,
endTime
};
});
}
}