feat: parallelization #352

Open · RohitR311 wants to merge 22 commits into base: develop
Changes from 16 commits
Commits (22)
2599712
feat: handle parallel pagination for scrape list
RohitR311 Jan 14, 2025
0c655fc
feat: add support to handle parallel pagination
RohitR311 Jan 15, 2025
641e182
feat: add parallel scraping logic for click next pagination
RohitR311 Jan 15, 2025
352447b
feat: add parallel scraping support for scroll pagination types
RohitR311 Jan 15, 2025
4ffbcba
feat: rm parallelism for paginations except click next
RohitR311 Jan 15, 2025
4c0eb30
feat: estimate number of items per page
RohitR311 Jan 16, 2025
bd25950
chore: include all files from src dir
RohitR311 Jan 17, 2025
8360d3d
feat: add worker pool to support parallelism for click next
RohitR311 Jan 17, 2025
38539d5
feat: add worker pool for parallelization
RohitR311 Jan 17, 2025
e0b52c1
feat: add worker types
RohitR311 Jan 17, 2025
b2e8332
feat: rm worker pool logic
RohitR311 Jan 17, 2025
7d0339a
feat: rm worker pool logic
RohitR311 Jan 17, 2025
c4c77e6
Merge branch 'parallelization' of https://github.com/RohitR311/maxun …
RohitR311 Jan 17, 2025
5c6f478
feat: add kafka config
RohitR311 Jan 18, 2025
5becc84
feat: add kafka manager to create topics
RohitR311 Jan 19, 2025
5e8a6d1
feat: add scraper to scrape data and store in kafka
RohitR311 Jan 19, 2025
73c2cc3
feat: add kafka util to consume task data and produce messages
RohitR311 Jan 20, 2025
6984401
feat: add parallel scraping support using kafka
RohitR311 Jan 20, 2025
7105749
feat: add initial kafka setup script
RohitR311 Jan 20, 2025
a931a13
feat: add start consumer kafka script
RohitR311 Jan 20, 2025
5411484
feat: add limit in task config
RohitR311 Jan 20, 2025
7650794
chore: add kafka services
RohitR311 Jan 20, 2025
10 changes: 10 additions & 0 deletions maxun-core/src/config/kafka.ts
@@ -0,0 +1,10 @@
export const kafkaConfig = {
clientId: 'maxun-scraper',
brokers: ['localhost:29092'],
topics: {
Comment on lines +1 to +4

⚠️ Potential issue

Use environment variables for Kafka configuration.

The broker address and client ID are hardcoded, which limits deployment flexibility and poses security risks. Consider using environment variables for configuration.

Apply this diff to make the configuration more flexible:

+import { config } from 'dotenv';
+
+config();
+
 export const kafkaConfig = {
-    clientId: 'maxun-scraper',
-    brokers: ['localhost:29092'],
+    clientId: process.env.KAFKA_CLIENT_ID || 'maxun-scraper',
+    brokers: (process.env.KAFKA_BROKERS || 'localhost:29092').split(','),

Also, consider adding SSL/authentication configuration for production environments:

    // Add these properties to kafkaConfig
    ssl: process.env.KAFKA_SSL === 'true',
    sasl: process.env.KAFKA_SASL === 'true' ? {
      mechanism: process.env.KAFKA_SASL_MECHANISM,
      username: process.env.KAFKA_USERNAME,
      password: process.env.KAFKA_PASSWORD,
    } : undefined,

SCRAPING_TASKS: 'scraping-tasks',
SCRAPING_RESULTS: 'scraping-results',
SCRAPING_DLQ: 'scraping-dlq'
},
consumerGroup: 'scraping-group'
};
Comment on lines +9 to +10

🛠️ Refactor suggestion

Add environment separation for consumer groups.

The consumer group should be environment-aware to prevent cross-environment message processing.

Apply this diff:

-    consumerGroup: 'scraping-group'
+    consumerGroup: `${process.env.ENV || 'dev'}-scraping-group`
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
-    consumerGroup: 'scraping-group'
+    consumerGroup: `${process.env.ENV || 'dev'}-scraping-group`
 };

Comment on lines +1 to +10

💡 Codebase verification

Remove unused Kafka configuration

The Kafka configuration appears to be unused in the codebase. There are no imports of this configuration, no Kafka client instantiations, and no other Kafka-related code present. Consider removing this unused configuration to maintain a clean codebase.

🔗 Analysis chain

Verify Kafka configuration in the codebase.

Let's ensure the configuration is properly utilized across the codebase.

🏁 Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Description: Check for proper usage of Kafka configuration and potential security issues

# Check for hardcoded Kafka configurations
echo "Checking for hardcoded Kafka configurations..."
rg -g '!*.{json,md}' -i 'localhost:29092'

# Check for proper import usage
echo "Checking kafka config imports..."
rg "from ['|\"].*config/kafka" -A 3

# Check for potential security-sensitive Kafka operations
echo "Checking for direct Kafka operations..."
ast-grep --pattern 'new Kafka($$$)'

Length of output: 451


Script:

#!/bin/bash
# Search for Kafka-related code and configurations

# Check for usage of kafkaConfig
echo "Checking for kafkaConfig usage..."
rg "kafkaConfig"

# Check for usage of topic names
echo "Checking for topic name usage..."
rg "scraping-tasks|scraping-results|scraping-dlq"

# Check for any Kafka-related imports or code
echo "Checking for Kafka-related code..."
rg -i "kafka" -g "!*.{json,md,txt}" -A 3

Length of output: 844
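
For reference, a minimal sketch of what `maxun-core/src/config/kafka.ts` could look like once the environment-variable, SSL/SASL, and consumer-group suggestions above are applied. The `KAFKA_*` and `ENV` variable names are assumptions carried over from the review suggestions, not something this PR defines:

import { config } from 'dotenv';

config();

// Sketch only: the KAFKA_* and ENV variable names are assumptions, not defined by this PR.
export const kafkaConfig = {
  clientId: process.env.KAFKA_CLIENT_ID || 'maxun-scraper',
  brokers: (process.env.KAFKA_BROKERS || 'localhost:29092').split(','),
  ssl: process.env.KAFKA_SSL === 'true',
  sasl: process.env.KAFKA_SASL === 'true'
    ? {
        mechanism: process.env.KAFKA_SASL_MECHANISM || 'plain',
        username: process.env.KAFKA_USERNAME || '',
        password: process.env.KAFKA_PASSWORD || '',
      }
    : undefined,
  topics: {
    SCRAPING_TASKS: 'scraping-tasks',
    SCRAPING_RESULTS: 'scraping-results',
    SCRAPING_DLQ: 'scraping-dlq',
  },
  consumerGroup: `${process.env.ENV || 'dev'}-scraping-group`,
};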

130 changes: 129 additions & 1 deletion maxun-core/src/interpret.ts
@@ -16,6 +16,8 @@ import Concurrency from './utils/concurrency';
import Preprocessor from './preprocessor';
import log, { Level } from './utils/logger';

import os from 'os';

/**
* Extending the Window interface for custom scraping functions.
*/
@@ -451,7 +453,7 @@ export default class Interpreter extends EventEmitter {
const scrapeResults: Record<string, any>[] = await page.evaluate((cfg) => window.scrapeList(cfg), config);
await this.options.serializableCallback(scrapeResults);
} else {
const scrapeResults: Record<string, any>[] = await this.handlePagination(page, config);
const scrapeResults: Record<string, any>[] = await this.handleParallelPagination(page, config);
await this.options.serializableCallback(scrapeResults);
}
},
@@ -540,6 +542,131 @@
}
}

private async handleParallelPagination(page: Page, config: any): Promise<any[]> {
if (config.limit > 10000 && config.pagination.type === 'clickNext') {
console.time('parallel-scraping');

const numWorkers = Math.max(1, Math.min(os.cpus().length - 1, 4));
const batchSize = Math.ceil(config.limit / numWorkers);
const pageUrls: string[] = [];

let workers: any = null;
let availableSelectors = config.pagination.selector.split(',');
let visitedUrls: string[] = [];

const { itemsPerPage, estimatedPages } = await page.evaluate(
({ listSelector, limit }) => {
const items = document.querySelectorAll(listSelector).length;
return {
itemsPerPage: items,
estimatedPages: Math.ceil(limit / items)
};
},
{ listSelector: config.listSelector, limit: config.limit }
);

console.log(`Items per page: ${itemsPerPage}`);
console.log(`Estimated pages needed: ${estimatedPages}`);

try {
while (true) {
pageUrls.push(page.url())

if (pageUrls.length >= estimatedPages) {
console.log('Reached estimated number of pages. Stopping pagination.');
break;
}

let checkButton = null;
let workingSelector = null;

for (let i = 0; i < availableSelectors.length; i++) {
const selector = availableSelectors[i];
try {
// Wait for selector with a short timeout
checkButton = await page.waitForSelector(selector, { state: 'attached' });
if (checkButton) {
workingSelector = selector;
break;
}
} catch (error) {
console.log(`Selector failed: ${selector}`);
}
}

if(!workingSelector) {
break;
}

const nextButton = await page.$(workingSelector);
if (!nextButton) {
break;
}

const selectorIndex = availableSelectors.indexOf(workingSelector!);
availableSelectors = availableSelectors.slice(selectorIndex);

const previousUrl = page.url();
visitedUrls.push(previousUrl);

try {
// Try both click methods simultaneously
await Promise.race([
Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
nextButton.click()
]),
Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
nextButton.dispatchEvent('click')
])
]);
} catch (error) {
// Verify if navigation actually succeeded
const currentUrl = page.url();
if (currentUrl === previousUrl) {
console.log("Previous URL same as current URL. Navigation failed.");
}
}

const currentUrl = page.url();
if (visitedUrls.includes(currentUrl)) {
console.log(`Detected navigation to a previously visited URL: ${currentUrl}`);

// Extract the current page number from the URL
const match = currentUrl.match(/\d+/);
if (match) {
const currentNumber = match[0];
// Use visitedUrls.length + 1 as the next page number
const nextNumber = visitedUrls.length + 1;

// Create new URL by replacing the current number with the next number
const nextUrl = currentUrl.replace(currentNumber, nextNumber.toString());

console.log(`Navigating to constructed URL: ${nextUrl}`);

// Navigate to the next page
await Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle' }),
page.goto(nextUrl)
]);
}
}

await page.waitForTimeout(1000);
}
Comment on lines +596 to +682

🛠️ Refactor suggestion

Modularize the pagination navigation logic.

The navigation logic is complex and could benefit from being split into smaller, focused functions for better maintainability and testing.

Consider extracting these functionalities:

  1. URL collection logic
  2. Navigation handling
  3. URL construction for pagination

Example refactor for URL collection:

private async collectPageUrls(page: Page, config: any): Promise<string[]> {
  const pageUrls: string[] = [];
  const visitedUrls: Set<string> = new Set();
  
  while (true) {
    const currentUrl = page.url();
    if (visitedUrls.has(currentUrl)) {
      break;
    }
    
    pageUrls.push(currentUrl);
    visitedUrls.add(currentUrl);
    
    if (!await this.navigateToNextPage(page)) {
      break;
    }
  }
  
  return pageUrls;
}
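
The example above calls a `navigateToNextPage` helper that is not defined in the PR. One possible shape for it, reusing the click-and-wait pattern already present in `handleParallelPagination`, could be the following sketch (illustrative only; here the working selector is passed in by the caller rather than resolved internally):

import { Page } from 'playwright';

// Sketch only: "selector" is assumed to be the working "next" selector already
// resolved by the caller. Returns true if the page URL changed.
async function navigateToNextPage(page: Page, selector: string): Promise<boolean> {
  const nextButton = await page.$(selector);
  if (!nextButton) {
    return false;
  }

  const previousUrl = page.url();
  try {
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
      nextButton.click(),
    ]);
  } catch {
    // Navigation may still have happened via client-side routing; fall through to the URL check.
  }

  return page.url() !== previousUrl;
}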

} catch (error) {
console.error('Error collecting page URLs:', error);
}

console.log(`Collected ${pageUrls.length} unique page URLs`);


}

return this.handlePagination(page, config);
}

private async handlePagination(page: Page, config: { listSelector: string, fields: any, limit?: number, pagination: any }) {
let allResults: Record<string, any>[] = [];
let previousHeight = 0;
@@ -556,6 +683,7 @@
await page.waitForTimeout(2000);

const currentHeight = await page.evaluate(() => document.body.scrollHeight);
console.log(`Current scroll height: ${currentHeight}`);
if (currentHeight === previousHeight) {
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
59 changes: 59 additions & 0 deletions maxun-core/src/types/worker.ts
@@ -0,0 +1,59 @@
export interface WorkerConfig {
workerIndex: number;
startIndex: number;
endIndex: number;
batchSize: number;
pageUrls: string[];
listSelector: string;
fields: any;
pagination: {
type: string;
selector: string;
};
}

export interface SharedState {
totalScraped: number;
results: any[];
}
Comment on lines +15 to +18

💡 Codebase verification

Thread safety and type safety concerns are valid issues that need addressing

The codebase shows:

  • Multiple workers concurrently access and modify SharedState without synchronization
  • results: any[] is used consistently, lacking type safety
  • No thread-safe mechanisms are implemented for shared state access

Suggested improvements:

  1. Implement proper synchronization using worker_threads features like SharedArrayBuffer or message-based state updates
  2. Add generic type parameter as suggested in the original review
  3. Consider using an intermediary class to manage thread-safe state access
🔗 Analysis chain

Consider thread safety and type safety for SharedState.

The shared state between workers needs careful consideration:

  1. The results array should have a more specific type than any[]
  2. Thread safety should be ensured when multiple workers access this state

Consider using a thread-safe data structure or implementing proper synchronization mechanisms. Also, define a generic type parameter:

-export interface SharedState {
+export interface SharedState<T = unknown> {
     totalScraped: number;
-    results: any[];
+    results: T[];
 }
🏁 Scripts executed

The following scripts were executed for the analysis:

Error: No shell scripts found


Script:

#!/bin/bash
# Find all files importing or using SharedState
rg "SharedState" -l

# Look for actual usage patterns of SharedState
rg "SharedState" -A 5 -B 5

# Check for any thread synchronization patterns
rg -i "sync|mutex|lock|atomic" maxun-core/src/

# Look for type definitions related to worker results
ast-grep --pattern 'type $_ = $_' maxun-core/src/types/

Length of output: 12076
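
One way to realize the suggested intermediary class is to keep all mutation in the parent thread and have workers report results via `worker_threads` messages. A sketch under that assumption (the class name and message shape are illustrative, not part of the PR):

import { Worker } from 'worker_threads';

// Illustrative only: workers post { items } messages and never touch shared memory,
// so only the parent thread mutates the accumulated state and no locking is needed.
class SharedStateAccumulator<T = unknown> {
  private state = { totalScraped: 0, results: [] as T[] };

  attach(worker: Worker): void {
    worker.on('message', (msg: { items: T[] }) => {
      this.state.results.push(...msg.items);
      this.state.totalScraped += msg.items.length;
    });
  }

  snapshot(): { totalScraped: number; results: T[] } {
    return { totalScraped: this.state.totalScraped, results: [...this.state.results] };
  }
}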


export interface WorkerProgressData {
percentage: number;
currentUrl: string;
scrapedItems: number;
timeElapsed: number;
estimatedTimeRemaining: number;
failures: number;
performance: PerformanceMetrics;
}

export interface PerformanceMetrics {
startTime: number;
endTime: number;
duration: number;
pagesProcessed: number;
itemsScraped: number;
failedPages: number;
averageTimePerPage: number;
memoryUsage: {
heapUsed: number;
heapTotal: number;
external: number;
rss: number;
};
cpuUsage: {
user: number;
system: number;
};
}

export interface GlobalMetrics {
totalPagesProcessed: number;
totalItemsScraped: number;
totalFailures: number;
workersActive: number;
averageSpeed: number;
timeElapsed: number;
memoryUsage: NodeJS.MemoryUsage;
cpuUsage: NodeJS.CpuUsage;
}
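
For orientation, a sketch of how `WorkerConfig` objects might be assembled from the batching math used in `handleParallelPagination`. The `buildWorkerConfigs` helper is illustrative and not part of the PR:

import { WorkerConfig } from '../types/worker';

// Illustrative only: split the collected page URLs into per-worker batches using the
// same Math.ceil batching as handleParallelPagination.
function buildWorkerConfigs(pageUrls: string[], config: any, numWorkers: number): WorkerConfig[] {
  const batchSize = Math.ceil(pageUrls.length / numWorkers);
  return Array.from({ length: numWorkers }, (_, workerIndex) => ({
    workerIndex,
    startIndex: workerIndex * batchSize,
    endIndex: Math.min((workerIndex + 1) * batchSize, pageUrls.length),
    batchSize,
    pageUrls: pageUrls.slice(workerIndex * batchSize, (workerIndex + 1) * batchSize),
    listSelector: config.listSelector,
    fields: config.fields,
    pagination: config.pagination,
  }));
}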
66 changes: 66 additions & 0 deletions maxun-core/src/utils/kafka-manager.ts
@@ -0,0 +1,66 @@
import { Kafka, Consumer, Producer } from 'kafkajs';
import { kafkaConfig } from '../config/kafka';
import { EventEmitter } from 'events';

export class KafkaManager extends EventEmitter {
private kafka: Kafka;
private producer: Producer;
private consumer: Consumer;
private metricsInterval: NodeJS.Timeout | null = null;

constructor() {
super();
this.kafka = new Kafka({
clientId: kafkaConfig.clientId,
brokers: kafkaConfig.brokers
});

this.producer = this.kafka.producer();
this.consumer = this.kafka.consumer({
groupId: kafkaConfig.consumerGroup,
sessionTimeout: 30000
});
}

async initialize() {
await this.producer.connect();
await this.consumer.connect();
await this.createTopics();
this.startMetricsReporting();
}

private async createTopics() {
const admin = this.kafka.admin();
await admin.createTopics({
topics: [
{ topic: kafkaConfig.topics.SCRAPING_TASKS, numPartitions: 10 },
{ topic: kafkaConfig.topics.SCRAPING_RESULTS, numPartitions: 10 },
{ topic: kafkaConfig.topics.SCRAPING_DLQ, numPartitions: 1 }
]
});
await admin.disconnect();
}

private startMetricsReporting() {
this.metricsInterval = setInterval(async () => {
const admin = this.kafka.admin();
const metrics = await admin.fetchTopicMetadata({
topics: [
kafkaConfig.topics.SCRAPING_TASKS,
kafkaConfig.topics.SCRAPING_RESULTS
]
});

this.emit('metrics', metrics);
await admin.disconnect();
}, 5000);
Comment on lines +45 to +56

🛠️ Refactor suggestion

Add error handling in startMetricsReporting

The asynchronous function inside setInterval may throw exceptions (e.g., network errors during fetchTopicMetadata). Unhandled exceptions can cause the interval to stop executing. Wrap the interval's logic in a try/catch block to ensure continuous metrics reporting.

Apply this diff to implement error handling:

     this.metricsInterval = setInterval(async () => {
+      try {
         const admin = this.kafka.admin();
         const metrics = await admin.fetchTopicMetadata({
           topics: [
             kafkaConfig.topics.SCRAPING_TASKS,
             kafkaConfig.topics.SCRAPING_RESULTS
           ]
         });
         this.emit('metrics', metrics);
+      } catch (error) {
+        // Handle errors, e.g., log the error
+        console.error('Error fetching Kafka metrics:', error);
+      }
       }, 5000);
     }

Committable suggestion skipped: line range outside the PR's diff.

}

async cleanup() {
if (this.metricsInterval) {
clearInterval(this.metricsInterval);
}
await this.producer.disconnect();
await this.consumer.disconnect();
}
}
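
A hypothetical usage sketch for `KafkaManager` (not part of the PR): initialize it, listen for the `metrics` events it emits every 5 seconds, and clean up on shutdown:

import { KafkaManager } from './utils/kafka-manager';

// Hypothetical wiring: connect producer/consumer, log emitted metrics, then disconnect.
async function run() {
  const manager = new KafkaManager();
  manager.on('metrics', (metadata) => {
    console.log('Kafka topic metadata:', JSON.stringify(metadata));
  });

  await manager.initialize();
  // ... produce scraping tasks / consume results here ...
  await manager.cleanup();
}

run().catch(console.error);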