Skip to content

Commit

Permalink
feat(processing): extract data from html,
Browse files Browse the repository at this point in the history
create screenshots
  • Loading branch information
pixelmord committed Mar 31, 2020
1 parent b7c26b2 commit ff94f0f
Show file tree
Hide file tree
Showing 6 changed files with 238 additions and 13 deletions.
2 changes: 2 additions & 0 deletions packages/poolbase-app/src/functions/handlers/addURLHandler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ type URLDocumentData = {
uid: string;
status: string | number;
created: FieldValue;
processed: {};
};

export const addURLHandler = functions.region('europe-west1').https.onCall(
Expand All @@ -31,6 +32,7 @@ export const addURLHandler = functions.region('europe-west1').https.onCall(
created: admin.firestore.FieldValue.serverTimestamp(),
uid: context.auth?.uid,
status: 'new',
processed: {},
};
// Push the new url into Cloud Firestore using the Firebase Admin SDK.
return await firestore.collection('pages').add(urlData);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ export const pageCreateHandler = functions
const page = snapshot.data();
try {
if (typeof page !== 'undefined' && typeof page.url !== 'undefined') {
const data = await scrapeHTML(page.url);
const data = await scrapeHTML(page.url, context.params.pageId);
firestore
.collection('pages')
.doc(context.params.pageId)
Expand Down
4 changes: 3 additions & 1 deletion packages/poolbase-app/src/functions/initFirebase.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ import * as admin from 'firebase-admin';

const app = admin.apps.length === 0 ? admin.initializeApp() : admin.app();
const firestore = admin.firestore(app);
const storage = admin.storage();
const bucket = storage.bucket();

export { app, firestore };
export { app, firestore, storage, bucket };
export default admin;
233 changes: 222 additions & 11 deletions packages/poolbase-app/src/functions/processing/scrapeHTML.ts
Original file line number Diff line number Diff line change
@@ -1,50 +1,261 @@
import puppeteer from 'puppeteer';
import puppeteer, { ConsoleMessage } from 'puppeteer';
import { bucket } from '../initFirebase';

/**
 * Result of scraping a single page, as returned by `scrapeHTML`.
 *
 * Every extracted field starts out as `null` and is only overwritten when the
 * corresponding piece of information was found in the rendered page.
 */
type ScrapeData = {
// HTTP status of the navigation response, or the string 'failed' when navigation threw or returned no response.
status: number | string | null;
// Content of <meta name="keywords">, split on commas with each entry trimmed.
metaKeywords: string[] | null;
// Content of <meta name="description">.
metaDescription: string | null;
// Trimmed text of the document's <title> element.
metaTitle: string | null;
// Content of <meta name="author">.
metaAuthor: string | null;
// Content of <meta name="publisher">.
metaPublisher: string | null;
// innerText of the element chosen as the page's main content.
mainText: string | null;
// Defaults to `<protocol>//<host>/favicon.ico`; replaced by the href of a `head > link[rel*="icon"]` when one exists.
metaIconUrl: string | null;
// Taken from og:image, then twitter:image, then the best-scoring <img> inside the main content element.
mainImageUrl: string | null;
// Always initialised to `true` by scrapeHTML.
// NOTE(review): the dotted key presumably targets the nested `processed.html` field of the page document — confirm against the caller.
'processed.html': boolean;
};
export const scrapeHTML = async (url: string): Promise<ScrapeData> => {

export const scrapeHTML = async (url: string, pageId: string): Promise<ScrapeData> => {
/**
 * Stores one screenshot of a page in the storage bucket.
 *
 * @param imageBuffer - Screenshot data to persist.
 * @param pageId - Id of the page the screenshot belongs to; used as the file name.
 * @param size - Variant folder the file is written to (defaults to 'preview').
 * @returns A fixed confirmation message once the upload has completed.
 * @throws Error when the screenshot data or the page id is missing/empty.
 */
const saveScreenShot = async (imageBuffer: string | Buffer, pageId: string, size = 'preview'): Promise<string> => {
  // A falsy value covers both "nothing passed" and the empty string.
  if (!imageBuffer) {
    throw new Error('No screenshot data provided');
  }
  if (!pageId) {
    throw new Error('No pageId provided');
  }

  // NOTE(review): the object name starts with '/', confirm that readers of these files use the same leading slash.
  const objectPath = `/screenshots/${size}/${pageId}.png`;
  await bucket.file(objectPath).save(imageBuffer);

  return 'saved screenshot!';
};

const urlObject = new URL(url);
let data: ScrapeData = {
metaKeywords: null,
metaDescription: null,
metaTitle: null,
metaAuthor: null,
metaPublisher: null,
mainText: null,
status: null,
mainImageUrl: null,
metaIconUrl: `${urlObject.protocol}//${urlObject.host}/favicon.ico`,
'processed.html': true,
};
const browser = await puppeteer.launch({ headless: true, args: ['--no-sandbox', '--disable-setuid-sandbox'] });

const page = await browser.newPage();
await page.setViewport({ width: 1366, height: 768 });
/**
 * Relays console output of the scraped page to the local debug log.
 * Plain `console.log` messages from the page are dropped; every other
 * message type is printed with its upper-cased type as prefix.
 */
function logMessage(msg: ConsoleMessage): void {
  const level = msg.type();
  if (level !== 'log') {
    // eslint-disable-next-line @typescript-eslint/unbound-method
    console.debug(`PAGE ${level.toUpperCase()} : ${msg.text()}`);
  }
}
page.on('console', logMessage);
try {
const response = await page.goto(url);
const response = await page.goto(url, { waitUntil: 'networkidle2' });
data['status'] = response?.status() || 'failed';
} catch (e) {
console.log(e);
} catch (err) {
console.error(err);
await browser.close();
data['status'] = 'failed';
return data;
}

try {
await page.waitForSelector('head > title');
const extractedFromHEAD = await page.evaluate(() => {
console.debug('extractedFromHEAD');
return {
...(!!document.querySelector('title') && { metaTitle: document?.querySelector('title')?.innerText.trim() }),
...(!!document.querySelector('meta[name=keywords]') && {
metaKeywords: document?.querySelector('meta[name=author]')?.getAttribute('content'),
...(!!document.querySelector('meta[name="keywords"]') && {
metaKeywords: document
?.querySelector('meta[name="keywords"]')
?.getAttribute('content')
?.split(',')
.map(keyword => keyword.trim()),
}),
...(!!document.querySelector('meta[name="author"]') && {
metaAuthor: document?.querySelector('meta[name="author"]')?.getAttribute('content'),
}),
...(!!document.querySelector('meta[name=author]') && {
metaAuthor: document?.querySelector('meta[name=author]')?.getAttribute('content'),
...(!!document.querySelector('meta[name="publisher"]') && {
metaPublisher: document?.querySelector('meta[name="publisher"]')?.getAttribute('content'),
}),
...(!!document.querySelector('meta[name="description"]') && {
metaDescription: document?.querySelector('meta[name="description"]')?.getAttribute('content'),
}),
...(!!document.querySelector('head > link[rel*="icon"]') && {
metaIconUrl: (document?.querySelector('head > link[rel*="icon"]') as HTMLLinkElement)?.href,
}),
};
});
data = {
...data,
...extractedFromHEAD,
};
} catch (e) {
console.log(e);
} catch (err) {
console.error(err);
}
try {
await page.waitForSelector('body');

await page.waitFor(2000); // give page 2 seconds to load/render (SPA)
const extractedFromBody = await page.evaluate(() => {
/**
 * Picks the element that most likely holds the page's main content.
 *
 * A `<main>` element is accepted unconditionally. Otherwise `#main`, `.main`
 * and `#content` are tried in that order, and a candidate is accepted only
 * when its text is at least 30% as long as the body text. Falls back to
 * `<body>` (which may be null).
 */
const findMainContentElement = (): HTMLElement | null => {
  const body = document.querySelector('body');
  console.debug('findMainContentElement' + ' ' + body?.innerText.substring(0, 120));
  const bodyText = body?.innerText;

  const semanticMain = document.querySelector('main');
  if (semanticMain) {
    return semanticMain;
  }

  // Conventional wrappers only count when they carry a substantial share of the page text.
  for (const selector of ['#main', '.main', '#content']) {
    const candidate = document.querySelector<HTMLElement>(selector);
    if (candidate && bodyText && candidate.innerText.length >= bodyText.length * 0.3) {
      return candidate;
    }
  }
  return body;
};
const main = findMainContentElement();
/**
 * Returns the visible text of the main content element, or null when there
 * is no such element or its text is empty. Logs the first 100 characters.
 */
const getMainText = (): string | null => {
  const text = main?.innerText;
  console.debug('getMainText' + ' ' + text?.substring(0, 100));
  return text ? text : null;
};
const mainText = getMainText();
/**
 * Chooses the most prominent `<img>` inside the main content element.
 *
 * Each image gets a heuristic score from its `src` URL (bonus for hints such
 * as "large"/"media", penalty for icons, logos, trackers and ads). Images with
 * a negative score or a surface of at most 100x100 px are discarded; of the
 * rest the one with the largest surface wins, with the score as tie-breaker.
 *
 * Side effect: lazy-loaded images get their `src` attribute replaced by
 * `data-src` / `data-lazy-src` (the latter taking precedence).
 *
 * @returns The resolved `src` of the winning image, or null when there is no
 *          main element or no image qualifies.
 */
const getMainImageUrlFromMainElement = (): string | null => {
  if (!main) {
    return null;
  }
  const imageElements = Array.from(main.getElementsByTagName('img'));
  console.debug('getMainImageUrlFromMainElement' + ' ' + imageElements.length);

  // Built once per call instead of once per image.
  const rules = [
    { pattern: /(large|big)/, score: 1 },
    { pattern: /static/, score: 1 },
    { pattern: /upload/, score: 1 },
    { pattern: /media/, score: 1 },
    // Dots are escaped so that only the literal host names match.
    { pattern: /gravatar\.com/, score: -1 },
    { pattern: /feeds\.feedburner\.com/, score: -1 },
    { pattern: /icon/i, score: -1 },
    { pattern: /logo/i, score: -1 },
    { pattern: /spinner/i, score: -1 },
    { pattern: /loading/i, score: -1 },
    { pattern: /badge/, score: -1 },
    { pattern: /1x1/, score: -1 },
    { pattern: /pixel/, score: -1 },
    // "ads" must start a token; a bare /ads/ also hit "downloads", "uploads", "heads", ...
    { pattern: /(?:^|[^a-z])ads/i, score: -1 },
    { pattern: /doubleclick/i, score: -1 },
  ];

  // Sums the score of every rule whose pattern occurs in the image URL.
  const _score = (image: HTMLImageElement): number => {
    const src = image.getAttribute('src');
    if (!src) {
      return -10000;
    }
    return rules.reduce((total, rule) => (rule.pattern.test(src) ? total + rule.score : total), 0);
  };

  const candidates: { img: HTMLImageElement; surface: number; score: number }[] = [];
  for (const image of imageElements) {
    // Look for lazy loaded images: data-lazy-src takes precedence over data-src.
    const lazySrc = image.getAttribute('data-lazy-src') || image.getAttribute('data-src');
    if (lazySrc) {
      image.setAttribute('src', lazySrc);
    }

    // Compute surface; fall back to the width/height attributes, then to 1.
    // NOTE(review): right after swapping `src` the natural size presumably still describes the placeholder — confirm.
    const width = +(image.naturalWidth || image.getAttribute('width') || 1);
    const height = +(image.naturalHeight || image.getAttribute('height') || 1);
    const surface = width * height;
    const score = _score(image);

    // Filter by size and minimum score.
    if (score >= 0 && surface > 100 * 100) {
      candidates.push({ img: image, surface, score });
    }
  }

  if (candidates.length === 0) {
    return null;
  }
  // Largest surface first; the score only breaks ties between equally sized images.
  candidates.sort((a, b) => b.surface - a.surface || b.score - a.score);
  return candidates[0].img.src;
};
/**
 * Determines the page's representative image URL.
 *
 * Lookup order: `og:image` meta tag, `twitter:image` meta tag, then the best
 * image found inside the main content element. The first meta tag that
 * exists decides the result: if its content is missing or blank, null is
 * returned without trying the later sources.
 */
const getMainImageUrl = (): string | null => {
  for (const selector of ['meta[property="og:image"]', 'meta[name="twitter:image"]']) {
    const metaTag = document.querySelector(selector);
    if (metaTag) {
      const declaredUrl = metaTag.getAttribute('content')?.trim();
      console.debug('getMainImageUrl' + ' ' + declaredUrl);
      return declaredUrl || null;
    }
  }

  if (!main) {
    return null;
  }
  const contentImageUrl = getMainImageUrlFromMainElement();
  if (!contentImageUrl) {
    return null;
  }
  console.debug('getMainImageUrl' + ' ' + contentImageUrl);
  return contentImageUrl;
};

const mainImageUrl = getMainImageUrl();
return {
...(mainText?.length && { mainText }),
...(mainImageUrl && { mainImageUrl }),
};
});
data = {
...data,
...extractedFromBody,
};
} catch (err) {
console.error(err);
}
try {
let imageBuffer: string | Buffer = await page.screenshot();
await saveScreenShot(imageBuffer, pageId);
imageBuffer = await page.screenshot({ fullPage: true });
await saveScreenShot(imageBuffer, pageId, 'full');
} catch (err) {
console.error(err);
}
await browser.close();
return data;
};
1 change: 1 addition & 0 deletions packages/poolbase-app/src/functions/tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"compilerOptions": {
"lib": ["es6", "dom", "es2018"],
"module": "commonjs",
"moduleResolution": "node",
"strict": true,
"outDir": "../../../../dist/app/functions",
"baseUrl": "./",
Expand Down
9 changes: 9 additions & 0 deletions storage.rules
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Cloud Storage security rules for the whole bucket.
rules_version = '2';
service firebase.storage {
match /b/{bucket}/o {
// A single catch-all rule: it applies to every object path in the bucket.
match /{allPaths=**} {
// Reads carry no condition, so every object is readable without signing in.
allow read;
// Any signed-in user may write to any path, including objects created by other users.
// NOTE(review): consider narrowing this to the paths clients actually need to write — confirm which clients upload directly.
allow write: if request.auth != null;
}
}
}

0 comments on commit ff94f0f

Please sign in to comment.