11import type { CheerioAPI } from 'cheerio' ;
22import { load } from 'cheerio' ;
3+ import type { Browser , Page } from 'rebrowser-puppeteer' ;
34
45import type { DataItem } from '@/types' ;
56import cache from '@/utils/cache' ;
6- import ofetch from '@/utils/ofetch ' ;
7+ import logger from '@/utils/logger ' ;
78import { parseRelativeDate } from '@/utils/parse-date' ;
89
910export const BASE_URL = 'https://www.dailypush.dev' ;
@@ -19,6 +20,38 @@ export interface ArticleItem {
1920 dailyPushUrl ?: string ;
2021}
2122
/** Resource types that are allowed through; requests of any other type are aborted. */
const allowedRequestTypes = new Set(['document']);

/**
 * Enable request interception on `page` and register a `request` handler that
 * continues requests whose resource type is in `allowedRequestTypes` and aborts
 * all others.
 *
 * @param page - The page to configure.
 */
async function preparePage(page: Page) {
    await page.setRequestInterception(true);

    page.on('request', (request) => {
        const isAllowed = allowedRequestTypes.has(request.resourceType());

        // Guard clause: anything outside the allow-list is dropped immediately.
        if (!isAllowed) {
            request.abort();
            return;
        }

        request.continue();
    });
}
36+
/**
 * Load `url` in a new tab of `browser` and return the page's serialized HTML.
 *
 * The tab is configured through `preparePage` before navigating. Navigation waits
 * for `domcontentloaded`; when `waitForSelector` is given, the function also waits
 * for that selector before reading the content. The tab is closed on every exit
 * path once it has been opened. The browser itself is not closed here.
 *
 * Errors raised while preparing, navigating, waiting, or reading content propagate
 * to the caller after the tab has been closed.
 *
 * @param browser - Browser used to open the tab.
 * @param url - Address to navigate to.
 * @param waitForSelector - Optional CSS selector to wait for after navigation.
 * @returns The HTML produced by `page.content()`.
 */
export async function fetchPageHtml(browser: Browser, url: string, waitForSelector?: string): Promise<string> {
    const page = await browser.newPage();

    try {
        // Prepare the tab inside the try so that a failure during setup still
        // reaches the finally block and the tab does not leak.
        await preparePage(page);

        logger.http(`Requesting ${url} `);
        await page.goto(url, { waitUntil: 'domcontentloaded' });

        if (waitForSelector) {
            await page.waitForSelector(waitForSelector);
        }

        return await page.content();
    } finally {
        await page.close();
    }
}
54+
2255/**
2356 * Try to parse text as a date. Returns the Date if parsing succeeds and is valid, undefined otherwise.
2457 */
@@ -40,14 +73,14 @@ function extractAuthor(article: ReturnType<CheerioAPI>): DataItem['author'] {
4073 return undefined ;
4174 }
4275
43- // Get all content spans (exclude separator spans with '•' )
76+ // Get all content spans (exclude separator spans with "•" )
4477 const allSpans = container . find ( 'span' ) ;
4578 const contentSpans : string [ ] = [ ] ;
4679
4780 for ( let i = 0 ; i < allSpans . length ; i ++ ) {
4881 const $span = allSpans . eq ( i ) ;
4982 const text = $span . text ( ) . trim ( ) ;
50- // Skip separator spans (contain only '•' or have separator classes)
83+ // Skip separator spans (contain only "•" or have separator classes)
5184 if ( text !== '•' && ! $span . hasClass ( 'text-slate-300' ) && ! $span . hasClass ( 'dark:text-slate-600' ) ) {
5285 contentSpans . push ( text ) ;
5386 }
@@ -127,14 +160,14 @@ function extractPubDate(article: ReturnType<CheerioAPI>): Date | undefined {
127160 return undefined ;
128161 }
129162
130- // Get all content spans (exclude separator spans with '•' )
163+ // Get all content spans (exclude separator spans with "•" )
131164 const allSpans = container . find ( 'span' ) ;
132165 const contentSpans : string [ ] = [ ] ;
133166
134167 for ( let i = 0 ; i < allSpans . length ; i ++ ) {
135168 const $span = allSpans . eq ( i ) ;
136169 const text = $span . text ( ) . trim ( ) ;
137- // Skip separator spans (contain only '•' or have separator classes)
170+ // Skip separator spans (contain only "•" or have separator classes)
138171 if ( text !== '•' && ! $span . hasClass ( 'text-slate-300' ) && ! $span . hasClass ( 'dark:text-slate-600' ) ) {
139172 contentSpans . push ( text ) ;
140173 }
@@ -225,23 +258,20 @@ export function parseArticles($: CheerioAPI, baseUrl: string): ArticleItem[] {
225258}
226259
227260/**
228- * Enhance items with full summaries from dailypush article pages
261+ * Enhance items with full summaries from dailypush article pages.
262+ * Uses the provided browser; opens a new tab per URL (document requests only). Caller must close the browser.
229263 */
230- export async function enhanceItemsWithSummaries ( items : ArticleItem [ ] ) : Promise < DataItem [ ] > {
264+ export async function enhanceItemsWithSummaries ( browser : Browser , items : ArticleItem [ ] ) : Promise < DataItem [ ] > {
231265 const itemsWithUrl = items . filter ( ( item ) => item . dailyPushUrl !== undefined ) ;
232266 const itemsWithoutUrl : DataItem [ ] = items . filter ( ( item ) => item . dailyPushUrl === undefined ) ;
233267
234- const enhancedItems : DataItem [ ] = await Promise . all (
268+ const enhancedItems = await Promise . all (
235269 itemsWithUrl . map ( ( item ) =>
236270 cache . tryGet ( item . dailyPushUrl ! , async ( ) => {
237- // If we have a dailypush article URL, fetch it for the longer summary
238271 try {
239- const articleResponse = await ofetch ( item . dailyPushUrl ! ) ;
240- const $ = load ( articleResponse ) ;
241-
242- // Find the longer summary/description on the article page
272+ const html = await fetchPageHtml ( browser , item . dailyPushUrl ! , 'p.font-ibm-plex-sans.leading-relaxed' ) ;
273+ const $ = load ( html ) ;
243274 const summary = $ ( 'p.font-ibm-plex-sans.leading-relaxed' ) . first ( ) ;
244-
245275 if ( summary . length > 0 && summary . text ( ) . trim ( ) ) {
246276 item . description = summary . text ( ) . trim ( ) ;
247277 }
@@ -254,6 +284,5 @@ export async function enhanceItemsWithSummaries(items: ArticleItem[]): Promise<D
254284 )
255285 ) ;
256286
257- // Include items without dailyPushUrl as-is
258287 return [ ...enhancedItems , ...itemsWithoutUrl ] ;
259288}
0 commit comments