Skip to content

Commit 400fb1f

Browse files
authored
fix(route/picnob): use puppeteer-real-browser to pass cf check (#20478)
1 parent a07565d commit 400fb1f

File tree

6 files changed

+346
-55
lines changed

6 files changed

+346
-55
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -148,7 +148,7 @@ RUN \
148148
; \
149149
else \
150150
apt-get install -yq --no-install-recommends \
151-
chromium \
151+
chromium xvfb \
152152
&& \
153153
echo "CHROMIUM_EXECUTABLE_PATH=$(which chromium)" | tee /app/.env ; \
154154
fi; \

docker-compose.yml

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,21 +6,33 @@ services:
66
image: diygod/rsshub # or ghcr.io/diygod/rsshub
77
restart: always
88
ports:
9-
- "1200:1200"
9+
- '1200:1200'
1010
environment:
1111
NODE_ENV: production
1212
CACHE_TYPE: redis
13-
REDIS_URL: "redis://redis:6379/"
14-
PUPPETEER_WS_ENDPOINT: "ws://browserless:3000" # marked
13+
REDIS_URL: 'redis://redis:6379/'
14+
PUPPETEER_WS_ENDPOINT: 'ws://browserless:3000' # marked
15+
PUPPETEER_REAL_BROWSER_SERVICE: 'http://real-browser:3000' # marked
1516
healthcheck:
16-
test: ["CMD", "curl", "-f", "http://localhost:1200/healthz"]
17+
test: ['CMD', 'curl', '-f', 'http://localhost:1200/healthz']
1718
interval: 30s
1819
timeout: 10s
1920
retries: 3
2021
depends_on:
2122
- redis
2223
- browserless # marked
2324

25+
real-browser:
26+
image: ghcr.io/hyoban/puppeteer-real-browser-hono
27+
restart: always
28+
ports:
29+
- '3001:3000'
30+
healthcheck:
31+
test: ['CMD', 'curl', '-f', 'http://localhost:3000']
32+
interval: 30s
33+
timeout: 10s
34+
retries: 3
35+
2436
browserless: # marked
2537
image: browserless/chrome # marked
2638
restart: always # marked
@@ -29,7 +41,7 @@ services:
2941
hard: 0 # marked
3042
soft: 0 # marked
3143
healthcheck: # marked
32-
test: ["CMD", "curl", "-f", "http://localhost:3000/pressure"] # marked
44+
test: ['CMD', 'curl', '-f', 'http://localhost:3000/pressure'] # marked
3345
interval: 30s # marked
3446
timeout: 10s # marked
3547
retries: 3 # marked
@@ -40,7 +52,7 @@ services:
4052
volumes:
4153
- redis-data:/data
4254
healthcheck:
43-
test: ["CMD", "redis-cli", "ping"]
55+
test: ['CMD', 'redis-cli', 'ping']
4456
interval: 30s
4557
timeout: 10s
4658
retries: 5

lib/config.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ export type Config = {
99
enableCluster?: string;
1010
isPackage: boolean;
1111
nodeName?: string;
12+
puppeteerRealBrowserService?: string;
1213
puppeteerWSEndpoint?: string;
1314
chromiumExecutablePath?: string;
1415
// network
@@ -480,6 +481,7 @@ const calculateValue = () => {
480481
enableCluster: toBoolean(envs.ENABLE_CLUSTER, false),
481482
isPackage: !!envs.IS_PACKAGE,
482483
nodeName: envs.NODE_NAME,
484+
puppeteerRealBrowserService: envs.PUPPETEER_REAL_BROWSER_SERVICE,
483485
puppeteerWSEndpoint: envs.PUPPETEER_WS_ENDPOINT,
484486
chromiumExecutablePath: envs.CHROMIUM_EXECUTABLE_PATH,
485487
// network

lib/routes/picnob/user.ts

Lines changed: 120 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,50 @@
1+
import { config } from '@/config';
12
import { Route, ViewType } from '@/types';
2-
3-
import ofetch from '@/utils/ofetch';
4-
import { load } from 'cheerio';
5-
import { parseRelativeDate } from '@/utils/parse-date';
6-
import { puppeteerGet } from './utils';
7-
import sanitizeHtml from 'sanitize-html';
83
import cache from '@/utils/cache';
4+
import { parseRelativeDate } from '@/utils/parse-date';
5+
import { load } from 'cheerio';
6+
import { connect, Options } from 'puppeteer-real-browser';
7+
8+
/**
 * Options passed to puppeteer-real-browser's `connect()` whenever this route
 * has to start a local browser instead of calling a remote real-browser service.
 */
const realBrowserOption: Options = {
    // Headed session with a maximized window and no fixed viewport.
    headless: false,
    args: ['--start-maximized'],
    connectOption: {
        defaultViewport: null,
    },
    turnstile: true,
    // disableXvfb: true,
    // ignoreAllFlags:true,
    customConfig: {
        // Browser binary comes from the CHROMIUM_EXECUTABLE_PATH setting.
        chromePath: config.chromiumExecutablePath,
    },
    plugins: [],
};
22+
23+
/**
 * Load `url` in a real browser and return the page markup once `selector` is present.
 *
 * When `config.puppeteerRealBrowserService` is set, the work is delegated to that
 * HTTP service and the first entry of the `data` array in its JSON response is
 * returned (an empty string when the response carries no `data`).
 * Otherwise a local browser is started through puppeteer-real-browser, the page is
 * polled about once per second for up to 50 seconds until `selector` matches, and
 * the full page content is returned even if the selector never appeared.
 *
 * @param url - Page to open.
 * @param selector - CSS selector whose presence signals that the real page has rendered.
 * @returns The HTML obtained for the page, or `''` when the remote service returns no data.
 * @throws Whatever `fetch`, `connect`, `page.goto` or `page.content` reject with.
 */
async function getPageWithPuppeteer(url: string, selector: string): Promise<string> {
    if (config.puppeteerRealBrowserService) {
        const res = await fetch(`${config.puppeteerRealBrowserService}?url=${encodeURIComponent(url)}&selector=${encodeURIComponent(selector)}`);
        const json = await res.json();
        // `data` may be missing (e.g. an error payload); fall back to '' instead of throwing,
        // matching how the per-post fetch in the handler reads the same response shape.
        return (json.data?.at(0) || '') as string;
    }

    const { page, browser } = await connect(realBrowserOption);
    try {
        await page.goto(url, { timeout: 50000 });
        let verify: boolean | null = null;
        const startDate = Date.now();
        while (!verify && Date.now() - startDate < 50000) {
            // Evaluation errors are treated as "not there yet" and simply retried.
            // eslint-disable-next-line no-await-in-loop, no-restricted-syntax
            verify = await page.evaluate((sel) => (document.querySelector(sel) ? true : null), selector).catch(() => null);
            // eslint-disable-next-line no-await-in-loop
            await new Promise((r) => setTimeout(r, 1000));
        }
        return await page.content();
    } finally {
        // Always release the browser, including when navigation or content retrieval fails.
        await browser.close();
    }
}
44+
45+
/**
 * Fetch a profile page, waiting for the `.post_box` elements that the handler
 * later reads posts from.
 *
 * @param profileUrl - Absolute URL of the profile (or tagged) listing page.
 * @returns The markup produced by `getPageWithPuppeteer` for that page.
 */
function getProfilePage(profileUrl: string): Promise<string> {
    const postSelector = '.post_box';
    return getPageWithPuppeteer(profileUrl, postSelector);
}
948

1049
export const route: Route = {
1150
path: '/user/:id/:type?',
@@ -34,76 +73,109 @@ export const route: Route = {
3473
},
3574
],
3675
name: 'User Profile - Picnob',
37-
maintainers: ['TonyRL', 'micheal-death', 'AiraNadih', 'DIYgod'],
76+
maintainers: ['TonyRL', 'micheal-death', 'AiraNadih', 'DIYgod', 'hyoban'],
3877
handler,
3978
view: ViewType.Pictures,
4079
};
4180

4281
async function handler(ctx) {
82+
if (!config.puppeteerRealBrowserService && !config.chromiumExecutablePath) {
83+
throw new Error('PUPPETEER_REAL_BROWSER_SERVICE or CHROMIUM_EXECUTABLE_PATH is required to use this route.');
84+
}
85+
4386
// NOTE: 'picnob' is still available, but all requests to 'picnob' will be redirected to 'pixnoy' eventually
4487
const baseUrl = 'https://www.pixnoy.com';
4588
const id = ctx.req.param('id');
4689
const type = ctx.req.param('type') ?? 'profile';
4790
const profileUrl = `${baseUrl}/profile/${id}/${type === 'tagged' ? 'tagged/' : ''}`;
4891

49-
// TODO: can't bypass cloudflare 403 error without puppeteer
50-
let html;
51-
let usePuppeteer = false;
52-
try {
53-
const data = await ofetch(profileUrl);
54-
html = data;
55-
} catch {
56-
html = await puppeteerGet(profileUrl);
57-
usePuppeteer = true;
58-
}
92+
const html = await getProfilePage(profileUrl);
93+
5994
const $ = load(html);
6095

6196
const list = $('.post_box')
6297
.toArray()
6398
.map((item) => {
6499
const $item = $(item);
65-
const sum = $item.find('.sum').text();
66100
const coverLink = $item.find('.cover_link').attr('href');
67101
const shortcode = coverLink?.split('/')?.[2];
102+
const image = $item.find('.cover .cover_link img');
103+
const title = image.attr('alt') || '';
68104

69105
return {
70-
title: sanitizeHtml(sum.split('\n')[0], { allowedTags: [], allowedAttributes: {} }),
71-
description: `<img src="${$item.find('.preview_w img').attr('data-src')}" /><br />${sum.replaceAll('\n', '<br>')}`,
106+
title,
107+
description: `<img src="${image.attr('data-src')}" /><br />${title}`,
72108
link: `${baseUrl}${coverLink}`,
73109
guid: shortcode,
74110
pubDate: parseRelativeDate($item.find('.time .txt').text()),
75111
};
76112
});
77113

78-
const newDescription = await Promise.all(
79-
list.map((item) =>
80-
cache.tryGet(`picnob:user:${id}:${item.guid}`, async () => {
81-
try {
82-
const html = usePuppeteer
83-
? await puppeteerGet(item.link)
84-
: await ofetch(item.link, {
85-
headers: {
86-
'user-agent': 'PostmanRuntime/7.44.0',
87-
},
88-
});
89-
const $ = load(html);
90-
if ($('.video_img').length > 0) {
91-
return `<video src="${$('.video_img a').attr('href')}" poster="${$('.video_img img').attr('data-src')}"></video><br />${$('.sum_full').text()}`;
92-
} else {
93-
let description = '';
94-
for (const slide of $('.swiper-slide').toArray()) {
95-
const $slide = $(slide);
96-
description += `<img src="${$slide.find('.pic img').attr('data-src')}" /><br />`;
114+
// Fetch all post details concurrently
115+
// First, get HTML for all posts
116+
let htmlList: string[];
117+
if (config.puppeteerRealBrowserService) {
118+
// Use puppeteer service for multiple URLs
119+
htmlList = (await Promise.all(
120+
list.map((item) =>
121+
cache.tryGet(`picnob:user:${id}:${item.guid}:html`, async () => {
122+
const selector = '.video_img, .swiper-slide';
123+
const res = await fetch(`${config.puppeteerRealBrowserService}?url=${encodeURIComponent(item.link)}&selector=${encodeURIComponent(selector)}`);
124+
const json = await res.json();
125+
return (json.data?.at(0) || '') as string;
126+
})
127+
)
128+
)) as string[];
129+
} else {
130+
// Use local puppeteer browser
131+
const { browser } = await connect(realBrowserOption);
132+
try {
133+
htmlList = (await Promise.all(
134+
list.map((item) =>
135+
cache.tryGet(`picnob:user:${id}:${item.guid}:html`, async () => {
136+
const page = await browser.newPage();
137+
try {
138+
await page.goto(item.link, { timeout: 50000 });
139+
let verify: boolean | null = null;
140+
const startDate = Date.now();
141+
while (!verify && Date.now() - startDate < 50000) {
142+
// eslint-disable-next-line no-await-in-loop, no-restricted-syntax
143+
verify = await page.evaluate(() => (document.querySelector('.video_img') || document.querySelector('.swiper-slide') ? true : null)).catch(() => null);
144+
// eslint-disable-next-line no-await-in-loop
145+
await new Promise((r) => setTimeout(r, 1000));
146+
}
147+
return await page.content();
148+
} catch {
149+
return '';
150+
} finally {
151+
await page.close();
97152
}
98-
description += $('.sum_full').text();
99-
return description;
100-
}
101-
} catch {
102-
return '';
103-
}
104-
})
105-
)
106-
);
153+
})
154+
)
155+
)) as string[];
156+
} finally {
157+
await browser.close();
158+
}
159+
}
160+
161+
// Process HTML to generate descriptions
162+
const newDescription = htmlList.map((html) => {
163+
if (!html) {
164+
return '';
165+
}
166+
const $ = load(html);
167+
if ($('.video_img').length > 0) {
168+
return `<video src="${$('.video_img a').attr('href')}" poster="${$('.video_img img').attr('data-src')}"></video><br />${$('.sum_full').text()}`;
169+
} else {
170+
let description = '';
171+
for (const slide of $('.swiper-slide').toArray()) {
172+
const $slide = $(slide);
173+
description += `<img src="${$slide.find('.pic img').attr('data-src')}" /><br />`;
174+
}
175+
description += $('.sum_full').text();
176+
return description;
177+
}
178+
});
107179

108180
return {
109181
title: `${$('h1.fullname').text()} (@${id}) ${type === 'tagged' ? 'tagged' : 'public'} posts - Picnob`,

package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@
113113
"p-map": "7.0.4",
114114
"pac-proxy-agent": "7.2.0",
115115
"proxy-chain": "2.5.9",
116+
"puppeteer-real-browser": "1.4.4",
116117
"query-string": "9.3.1",
117118
"rate-limiter-flexible": "8.2.0",
118119
"re2js": "1.2.0",
@@ -213,6 +214,7 @@
213214
"puppeteer",
214215
"rebrowser-puppeteer",
215216
"rolldown",
217+
"sleep",
216218
"utf-8-validate",
217219
"vue-demi"
218220
],

0 commit comments

Comments
 (0)