Skip to content

Commit

Permalink
[add] Table & Heading detection of Non-semantic Agenda pages
Browse files Browse the repository at this point in the history
  • Loading branch information
TechQuery committed May 4, 2023
1 parent c4fe589 commit 3e6057c
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 17 deletions.
8 changes: 6 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@fcc-cdc/it-events",
"version": "1.3.0-alpha.0",
"version": "1.3.0",
"license": "MIT",
"author": "shiy2008@gmail.com",
"description": "IT Events Crawler of China",
Expand Down Expand Up @@ -67,7 +67,11 @@
"*.{ts,tsx}": "eslint --fix"
},
"jest": {
"preset": "ts-jest"
"preset": "ts-jest",
"transform": {
"^.+\\.(js|jsx|ts|tsx)$": "ts-jest"
},
"transformIgnorePatterns": []
},
"scripts": {
"prepare": "husky install",
Expand Down
6 changes: 6 additions & 0 deletions source/Agenda/HuoDongXing.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,9 @@ export class HuoDongXingAgenda extends CommonAgendaCrawler {

static schema = new URLPattern(`${this.baseURI}/:event(\\d+)`);
}

/**
 * Agenda crawler for Huodongxing event pages that live under the `/go` path.
 *
 * NOTE(review): the "Old" prefix suggests this is a legacy URL scheme of the
 * site — confirm against real links.
 */
export class OldHuoDongXingAgenda extends CommonAgendaCrawler {
    /** Common URL prefix of every page this crawler accepts. */
    static baseURI = 'https://www.huodongxing.com/go';

    /** Matches `<baseURI>/<event>`, where `<event>` is one or more word characters. */
    static schema = new URLPattern(this.baseURI + '/:event(\\w+)');
}
72 changes: 61 additions & 11 deletions source/Agenda/common.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import { JSDOM } from 'jsdom';
import { walkDOM, countBy } from 'web-utility';
import { byteLength, countBy, walkDOM } from 'web-utility';

import { CSSSelectorPrecision, getCSSSelector, sameParentOf } from '../utility';
import { Agenda, AgendaCrawler } from './core';
import { getCSSSelector, sameParentOf } from '../utility';

/**
 * Matches a clock time: one or two digits, a colon (optionally surrounded by
 * white space), then two digits — e.g. `9:30` or `14 : 05`.
 *
 * NOTE(review): the colon appears twice inside the character class, which is
 * redundant as written; presumably one of them was a full-width colon that got
 * lost in transcoding — confirm against the repository.
 */
export const TimePattern = /\d{1,2}\s*[::]\s*\d{2}/;

/**
 * Elements whose text is classified as "heading" text when an agenda item is
 * parsed: real heading tags plus the bold tags `strong` and `b`.
 */
const HeadingSelector = `h1, h2, h3, h4, h5, h6, strong, b`;

export abstract class CommonAgendaCrawler extends AgendaCrawler {
document?: Document;

Expand All @@ -20,7 +22,11 @@ export abstract class CommonAgendaCrawler extends AgendaCrawler {
walkDOM<Text>(document.body, 3),
({ nodeValue, parentElement }) =>
TimePattern.test(nodeValue) && {
selector: getCSSSelector(parentElement, document.body)
selector: getCSSSelector(
parentElement,
document.body,
CSSSelectorPrecision.Medium
)
}
).filter(Boolean);

Expand All @@ -32,12 +38,21 @@ export abstract class CommonAgendaCrawler extends AgendaCrawler {
const [first, second] = document.querySelectorAll(agendaTimeSelector);

const agendaBox = sameParentOf(first, second) as Element;
const agendaBoxSelector = getCSSSelector(agendaBox);
const agendaBoxSelector = getCSSSelector(
agendaBox,
document.body,
CSSSelectorPrecision.High
);

for (let i = 0; i < agendaBox.childElementCount; i++)
yield await this.getItem(
`${agendaBoxSelector} > :nth-child(${i + 1})`
);
if (agendaBox.tagName.toLowerCase() === 'tbody')
yield* this.getItems(
agendaBox.children[i] as HTMLTableRowElement
);
else
yield await this.getItem(
`${agendaBoxSelector} > :nth-child(${i + 1})`
);
this.document = undefined;
}

Expand All @@ -48,20 +63,55 @@ export abstract class CommonAgendaCrawler extends AgendaCrawler {

let time = '';

const [title, name, position, summary] = Array.from(
walkDOM<Text>(agendaItem, 3),
({ nodeValue }) => nodeValue.trim()
).filter(text => !TimePattern.test(text) || !(time = text));
const [head, body] = Array.from(walkDOM<Text>(agendaItem, 3)).reduce(
(group, { parentElement, nodeValue }) => {
const isHeading =
parentElement.matches(HeadingSelector) ||
!!parentElement.closest(HeadingSelector);

if (TimePattern.test(nodeValue)) time = nodeValue.trim();
else group[isHeading ? 0 : 1].push(nodeValue.trim());

return group;
},
[[], []] as string[][]
);
const [startTime, endTime] = time.split(/[^\d::]+/),
[name, title] = head.sort((a, b) => byteLength(a) - byteLength(b)),
[position, summary] = body.sort(
(a, b) => byteLength(a) - byteLength(b)
),
avatar =
agendaItem.querySelector<HTMLImageElement>('img[src]')?.src;

return {
mentor: { name, position, avatar },
title,
summary,
startTime,
endTime
};
}

/**
 * Converts one table row into agenda items.
 *
 * The first cell of the row is read as the time range shared by the whole
 * row; every following cell becomes one agenda item. The text lines of a
 * cell are ordered by byte length and assigned, shortest first, to
 * `name`, `position`, `title` and `summary` (a heuristic: shorter text is
 * assumed to be a name, longer text a summary).
 *
 * @param row - table row whose `children` are the time cell followed by
 *              the agenda cells
 * @returns one agenda per non-time cell; an empty array for a row that has
 *          no cells at all
 */
protected getItems({ children }: HTMLTableRowElement): Agenda[] {
    const [time, ...agendas] = [...children];

    // A row without cells carries neither a time nor any agenda content
    if (!time) return [];

    // NOTE(review): the duplicated ":" in the class below looks like a
    // full-width colon lost in transcoding — confirm against the repository
    const [startTime, endTime] = (time.textContent ?? '')
        .trim()
        .split(/[^\d::]+/);

    return agendas.map(agendaItem => {
        // Trim every line and drop the blank ones first: otherwise the
        // indentation & empty lines of pretty-printed HTML sort to the
        // front (smallest byte length) and end up as `name`/`position`
        const [name, position, title, summary] = (
                agendaItem.textContent ?? ''
            )
                .split('\n')
                .map(line => line.trim())
                .filter(Boolean)
                .sort((a, b) => byteLength(a) - byteLength(b)),
            avatar =
                agendaItem.querySelector<HTMLImageElement>('img[src]')?.src;

        return {
            mentor: { name, position, avatar },
            title,
            summary,
            startTime,
            endTime
        };
    });
}
}
2 changes: 1 addition & 1 deletion source/Agenda/core.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ export type Duration = Partial<Record<'startTime' | 'endTime', string>>;

export type Forum = Duration & Pick<Mentor, 'name' | 'summary'>;

export interface Agenda extends Duration {
export interface Agenda extends Duration, Pick<Forum, 'summary'> {
title?: string;
mentor?: Mentor;
forum?: Forum;
Expand Down
16 changes: 13 additions & 3 deletions source/utility.ts
Original file line number Diff line number Diff line change
Expand Up @@ -75,9 +75,16 @@ export async function saveFile(
return path;
}

/**
 * Controls how much positional detail `getCSSSelector()` puts into the
 * selector it builds for elements that have no class name.
 */
export enum CSSSelectorPrecision {
    /** Class-less elements are identified by their tag name only. */
    Low = 0,
    /**
     * Only the class-less element handled while no selector part has been
     * collected yet gets an `:nth-child()` suffix (presumably the target
     * element itself — confirm in `getCSSSelector()`).
     */
    Medium = 1,
    /** Every class-less element gets an `:nth-child()` suffix. */
    High = 2
}

export function getCSSSelector(
toElement: Element,
fromElement = toElement.getRootNode()
fromElement = toElement.getRootNode(),
precision = CSSSelectorPrecision.Low
) {
const selectors: string[] = [];

Expand All @@ -88,9 +95,12 @@ export function getCSSSelector(
tagName.toLowerCase() +
(className.trim()
? '.' + className.split(/\s+/).filter(Boolean).join('.')
: `:nth-child(${
: precision === CSSSelectorPrecision.High ||
(precision === CSSSelectorPrecision.Medium && !selectors[0])
? `:nth-child(${
[...parentNode.children].indexOf(toElement) + 1
})`);
})`
: '');
selectors.unshift(selector);

toElement = parentNode as Element;
Expand Down

0 comments on commit 3e6057c

Please sign in to comment.