
feat: parallelization #352


Open

wants to merge 22 commits into base: develop

Changes from 20 commits (22 commits total)
2599712
feat: handle parallel pagination for scrape list
RohitR311 Jan 14, 2025
0c655fc
feat: add support to handle parallel pagination
RohitR311 Jan 15, 2025
641e182
feat: add parallel scraping logic for click next pagination
RohitR311 Jan 15, 2025
352447b
feat: add parallel scraping support for scroll pagination types
RohitR311 Jan 15, 2025
4ffbcba
feat: rm parallelism for paginations except click next
RohitR311 Jan 15, 2025
4c0eb30
feat: estimate number of items per page
RohitR311 Jan 16, 2025
bd25950
chore: include all files from src dir
RohitR311 Jan 17, 2025
8360d3d
feat: add worker pool to support parallelism for click next
RohitR311 Jan 17, 2025
38539d5
feat: add worker pool for parallelization
RohitR311 Jan 17, 2025
e0b52c1
feat: add worker types
RohitR311 Jan 17, 2025
b2e8332
feat: rm worker pool logic
RohitR311 Jan 17, 2025
7d0339a
feat: rm worker pool logic
RohitR311 Jan 17, 2025
c4c77e6
Merge branch 'parallelization' of https://github.com/RohitR311/maxun …
RohitR311 Jan 17, 2025
5c6f478
feat: add kafka config
RohitR311 Jan 18, 2025
5becc84
feat: add kafka manager to create topics
RohitR311 Jan 19, 2025
5e8a6d1
feat: add scraper to scrape data and store in kafka
RohitR311 Jan 19, 2025
73c2cc3
feat: add kafka util to consume task data and produce messages
RohitR311 Jan 20, 2025
6984401
feat: add parallel scraping support using kafka
RohitR311 Jan 20, 2025
7105749
feat: add initial kafka setup script
RohitR311 Jan 20, 2025
a931a13
feat: add start consumer kafka script
RohitR311 Jan 20, 2025
5411484
feat: add limit in task config
RohitR311 Jan 20, 2025
7650794
chore: add kafka services
RohitR311 Jan 20, 2025
10 changes: 10 additions & 0 deletions maxun-core/src/config/kafka.ts
@@ -0,0 +1,10 @@
export const kafkaConfig = {
clientId: 'maxun-scraper',
brokers: ['localhost:29092'],
topics: {
Comment on lines +1 to +4
⚠️ Potential issue

Use environment variables for Kafka configuration.

The broker address and client ID are hardcoded, which limits deployment flexibility and poses security risks. Consider using environment variables for configuration.

Apply this diff to make the configuration more flexible:

+import { config } from 'dotenv';
+
+config();
+
 export const kafkaConfig = {
-    clientId: 'maxun-scraper',
-    brokers: ['localhost:29092'],
+    clientId: process.env.KAFKA_CLIENT_ID || 'maxun-scraper',
+    brokers: (process.env.KAFKA_BROKERS || 'localhost:29092').split(','),

Also, consider adding SSL/authentication configuration for production environments:

    // Add these properties to kafkaConfig
    ssl: process.env.KAFKA_SSL === 'true',
    sasl: process.env.KAFKA_SASL === 'true' ? {
      mechanism: process.env.KAFKA_SASL_MECHANISM,
      username: process.env.KAFKA_USERNAME,
      password: process.env.KAFKA_PASSWORD,
    } : undefined,
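
For reference, a minimal sketch of how the env-driven config would be consumed by the kafkajs client already used in interpret.ts; the ssl and sasl fields assume the optional additions suggested above and are not part of this PR:

import { Kafka } from 'kafkajs';
import { kafkaConfig } from './config/kafka';

// Sketch only: ssl/sasl exist on kafkaConfig only if the optional additions above are applied.
const kafka = new Kafka({
  clientId: kafkaConfig.clientId,
  brokers: kafkaConfig.brokers,
  ssl: kafkaConfig.ssl,
  sasl: kafkaConfig.sasl,
});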

SCRAPING_TASKS: 'scraping-tasks',
SCRAPING_RESULTS: 'scraping-results',
SCRAPING_DLQ: 'scraping-dlq'
},
consumerGroup: 'scraping-group'
};
Comment on lines +9 to +10
🛠️ Refactor suggestion

Add environment separation for consumer groups.

The consumer group should be environment-aware to prevent cross-environment message processing.

Apply this diff:

-    consumerGroup: 'scraping-group'
+    consumerGroup: `${process.env.ENV || 'dev'}-scraping-group`

Comment on lines +1 to +10
💡 Codebase verification

Remove unused Kafka configuration

The Kafka configuration appears to be unused in the codebase. There are no imports of this configuration, no Kafka client instantiations, and no other Kafka-related code present. Consider removing this unused configuration to maintain a clean codebase.

🔗 Analysis chain

Verify Kafka configuration in the codebase.

Let's ensure the configuration is properly utilized across the codebase.

🏁 Scripts executed

The following scripts were executed for the analysis:

Script:

#!/bin/bash
# Description: Check for proper usage of Kafka configuration and potential security issues

# Check for hardcoded Kafka configurations
echo "Checking for hardcoded Kafka configurations..."
rg -g '!*.{json,md}' -i 'localhost:29092'

# Check for proper import usage
echo "Checking kafka config imports..."
rg "from ['|\"].*config/kafka" -A 3

# Check for potential security-sensitive Kafka operations
echo "Checking for direct Kafka operations..."
ast-grep --pattern 'new Kafka($$$)'

Length of output: 451


Script:

#!/bin/bash
# Search for Kafka-related code and configurations

# Check for usage of kafkaConfig
echo "Checking for kafkaConfig usage..."
rg "kafkaConfig"

# Check for usage of topic names
echo "Checking for topic name usage..."
rg "scraping-tasks|scraping-results|scraping-dlq"

# Check for any Kafka-related imports or code
echo "Checking for Kafka-related code..."
rg -i "kafka" -g "!*.{json,md,txt}" -A 3

Length of output: 844

297 changes: 296 additions & 1 deletion maxun-core/src/interpret.ts
@@ -16,6 +16,11 @@ import Concurrency from './utils/concurrency';
import Preprocessor from './preprocessor';
import log, { Level } from './utils/logger';

import { Kafka } from 'kafkajs';
import { kafkaConfig } from './config/kafka';

import os from 'os';

/**
* Extending the Window interface for custom scraping functions.
*/
@@ -39,6 +44,7 @@ declare global {
interface InterpreterOptions {
maxRepeats: number;
maxConcurrency: number;
maxWorkers: number;
⚠️ Potential issue

Add missing property to InterpreterOptions interface

The serializableCallback property is used in the code but not defined in the provided InterpreterOptions interface.

Ensure that serializableCallback is correctly defined in the interface to maintain type safety.

serializableCallback: (output: any) => (void | Promise<void>);
binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
debug: boolean;
@@ -68,13 +74,31 @@ export default class Interpreter extends EventEmitter {

private cumulativeResults: Record<string, any>[] = [];

private kafka: Kafka;

private producer: any;

private async initializeKafka() {
this.producer = this.kafka.producer({
allowAutoTopicCreation: true,
idempotent: true
});
await this.producer.connect();
}

Comment on lines +77 to +88
🛠️ Refactor suggestion

Improve Kafka producer initialization.

The current implementation has several issues:

  1. Hardcoded producer options
  2. Missing error handling
  3. No cleanup handling

Apply this diff to improve the implementation:

 private kafka: Kafka;
 private producer: any;
+private readonly producerConfig = {
+    allowAutoTopicCreation: true,
+    idempotent: true,
+    // Add other configurable options
+};
 
 private async initializeKafka() {
-    this.producer = this.kafka.producer({
-      allowAutoTopicCreation: true,
-      idempotent: true
-    });
-    await this.producer.connect();
+    try {
+        this.producer = this.kafka.producer(this.producerConfig);
+        await this.producer.connect();
+        console.log('Kafka producer connected successfully');
+    } catch (error) {
+        console.error('Failed to initialize Kafka producer:', error);
+        throw error;
+    }
 }
+
+private async cleanup() {
+    if (this.producer) {
+        try {
+            await this.producer.disconnect();
+            console.log('Kafka producer disconnected successfully');
+        } catch (error) {
+            console.error('Error disconnecting Kafka producer:', error);
+        }
+    }
+}

constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) {
super();
this.workflow = workflow.workflow;
this.initializedWorkflow = null;
this.kafka = new Kafka({
clientId: kafkaConfig.clientId,
brokers: kafkaConfig.brokers
});
this.initializeKafka();
this.options = {
maxRepeats: 5,
maxConcurrency: 5,
maxWorkers: Math.max(1, Math.min(os.cpus().length - 1, 4)),
Comment on lines +93 to +101
🛠️ Refactor suggestion

Improve constructor initialization.

The current implementation has several issues:

  1. Kafka initialization is not awaited
  2. maxWorkers calculation logic is embedded in the constructor
  3. Missing error handling for Kafka initialization

Apply this diff to improve the implementation:

+private calculateMaxWorkers(): number {
+    return Math.max(1, Math.min(os.cpus().length - 1, 4));
+}
+
 constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) {
     super();
     this.workflow = workflow.workflow;
     this.initializedWorkflow = null;
     this.kafka = new Kafka({
         clientId: kafkaConfig.clientId,
         brokers: kafkaConfig.brokers
     });
-    this.initializeKafka();
+    // Initialize Kafka asynchronously
+    this.initializeKafka().catch(error => {
+        console.error('Failed to initialize Kafka:', error);
+        throw error;
+    });
     this.options = {
         maxRepeats: 5,
         maxConcurrency: 5,
-        maxWorkers: Math.max(1, Math.min(os.cpus().length - 1, 4)),
+        maxWorkers: this.calculateMaxWorkers(),
         serializableCallback: (data) => { 
             log(JSON.stringify(data), Level.WARN);
         },

Committable suggestion skipped: line range outside the PR's diff.
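
A complementary sketch (not part of this PR's diff) that avoids fire-and-forget initialization altogether is a static async factory, so callers can await Kafka connectivity before running a workflow:

// Hypothetical factory; assumes the constructor above no longer calls initializeKafka() itself.
static async create(workflow: WorkflowFile, options?: Partial<InterpreterOptions>): Promise<Interpreter> {
  const interpreter = new Interpreter(workflow, options);
  await interpreter.initializeKafka();
  return interpreter;
}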

serializableCallback: (data) => {
log(JSON.stringify(data), Level.WARN);
},
@@ -451,7 +475,7 @@ export default class Interpreter extends EventEmitter {
const scrapeResults: Record<string, any>[] = await page.evaluate((cfg) => window.scrapeList(cfg), config);
await this.options.serializableCallback(scrapeResults);
} else {
const scrapeResults: Record<string, any>[] = await this.handlePagination(page, config);
const scrapeResults: Record<string, any>[] = await this.handleParallelPagination(page, config);
await this.options.serializableCallback(scrapeResults);
}
},
@@ -540,6 +564,276 @@ export default class Interpreter extends EventEmitter {
}
}

private async handleParallelPagination(page: Page, config: any): Promise<any[]> {
if (config.limit > 10000 && config.pagination.type === 'clickNext') {
console.time('parallel-scraping');

const workflowId = `workflow-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
console.log(`Starting workflow with ID: ${workflowId}`);

const numWorkers = Math.max(1, Math.min(os.cpus().length - 1, 4));
const batchSize = Math.ceil(config.limit / numWorkers);
const tasks = [];
const pageUrls: string[] = [];

let availableSelectors = config.pagination.selector.split(',');
let visitedUrls: string[] = [];

const { itemsPerPage, estimatedPages } = await page.evaluate(
({ listSelector, limit }) => {
const items = document.querySelectorAll(listSelector).length;
return {
itemsPerPage: items,
estimatedPages: Math.ceil(limit / items)
};
},
{ listSelector: config.listSelector, limit: config.limit }
);

console.log(`Items per page: ${itemsPerPage}`);
console.log(`Estimated pages needed: ${estimatedPages}`);

try {
while (true) {
pageUrls.push(page.url())

if (pageUrls.length >= estimatedPages) {
console.log('Reached estimated number of pages. Stopping pagination.');
break;
}

let checkButton = null;
let workingSelector = null;

for (let i = 0; i < availableSelectors.length; i++) {
const selector = availableSelectors[i];
try {
// Wait for selector with a short timeout
checkButton = await page.waitForSelector(selector, { state: 'attached' });
if (checkButton) {
workingSelector = selector;
break;
}
} catch (error) {
console.log(`Selector failed: ${selector}`);
}
}

if(!workingSelector) {
break;
}

const nextButton = await page.$(workingSelector);
if (!nextButton) {
break;
}

const selectorIndex = availableSelectors.indexOf(workingSelector!);
availableSelectors = availableSelectors.slice(selectorIndex);

const previousUrl = page.url();
visitedUrls.push(previousUrl);

try {
// Try both click methods simultaneously
await Promise.race([
Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
nextButton.click()
]),
Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
nextButton.dispatchEvent('click')
])
]);
} catch (error) {
// Verify if navigation actually succeeded
const currentUrl = page.url();
if (currentUrl === previousUrl) {
console.log("Previous URL same as current URL. Navigation failed.");
}
}

const currentUrl = page.url();
if (visitedUrls.includes(currentUrl)) {
console.log(`Detected navigation to a previously visited URL: ${currentUrl}`);

// Extract the current page number from the URL
const match = currentUrl.match(/\d+/);
if (match) {
const currentNumber = match[0];
// Use visitedUrls.length + 1 as the next page number
const nextNumber = visitedUrls.length + 1;

// Create new URL by replacing the current number with the next number
const nextUrl = currentUrl.replace(currentNumber, nextNumber.toString());

console.log(`Navigating to constructed URL: ${nextUrl}`);

// Navigate to the next page
await Promise.all([
page.waitForNavigation({ waitUntil: 'networkidle' }),
page.goto(nextUrl)
]);
}
}

await page.waitForTimeout(1000);
}
Comment on lines +596 to +682
🛠️ Refactor suggestion

Modularize the pagination navigation logic.

The navigation logic is complex and could benefit from being split into smaller, focused functions for better maintainability and testing.

Consider extracting these functionalities:

  1. URL collection logic
  2. Navigation handling
  3. URL construction for pagination

Example refactor for URL collection:

private async collectPageUrls(page: Page, config: any): Promise<string[]> {
  const pageUrls: string[] = [];
  const visitedUrls: Set<string> = new Set();
  
  while (true) {
    const currentUrl = page.url();
    if (visitedUrls.has(currentUrl)) {
      break;
    }
    
    pageUrls.push(currentUrl);
    visitedUrls.add(currentUrl);
    
    if (!await this.navigateToNextPage(page)) {
      break;
    }
  }
  
  return pageUrls;
}
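
The navigateToNextPage helper called above is not defined in the suggestion; a rough sketch (hypothetical, reusing the click-and-verify logic from the PR's loop, with this.paginationSelector as an assumed field holding the working selector) might look like:

// Hypothetical companion helper for the refactor above; not part of the PR.
private async navigateToNextPage(page: Page): Promise<boolean> {
  const previousUrl = page.url();
  const nextButton = await page.$(this.paginationSelector);
  if (!nextButton) {
    return false;
  }
  try {
    await Promise.all([
      page.waitForNavigation({ waitUntil: 'networkidle', timeout: 30000 }),
      nextButton.click()
    ]);
  } catch (error) {
    // Fall through and let the URL comparison decide whether navigation happened.
  }
  return page.url() !== previousUrl;
}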

} catch (error) {
console.error('Error collecting page URLs:', error);
}

console.log(`Collected ${pageUrls.length} unique page URLs`);

for (let i = 0; i < numWorkers; i++) {
const startIndex = i * batchSize;
const endIndex = Math.min((i + 1) * batchSize, config.limit);
const workerUrls = pageUrls.slice(
i * Math.ceil(pageUrls.length / numWorkers),
(i + 1) * Math.ceil(pageUrls.length / numWorkers)
);

const task = {
taskId: `${workflowId}-task-${i}`,
workflowId,
urls: workerUrls,
config: {
listSelector: config.listSelector,
fields: config.fields,
pagination: config.pagination,
batchSize: endIndex - startIndex,
startIndex,
endIndex
}
};

await this.producer.send({
topic: kafkaConfig.topics.SCRAPING_TASKS,
messages: [{
key: task.taskId,
value: JSON.stringify(task),
headers: {
'workflow-id': workflowId,
'retry-count': '0',
'total-tasks': numWorkers.toString()
}
}]
});

tasks.push(task);
}

console.log("TASKS SENT TO KAFKA (Not stringified)", tasks);

// Wait for results from Kafka
const results = await this.waitForScrapingResults(tasks);
console.timeEnd('parallel-scraping');
return results;
}

return this.handlePagination(page, config);
}
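
For a concrete sense of the task split above, a worked example with illustrative numbers (not taken from the PR):

// Illustrative arithmetic only, mirroring the batching logic above.
const limit = 20000;                             // config.limit
const numWorkers = 4;                            // Math.max(1, Math.min(os.cpus().length - 1, 4))
const batchSize = Math.ceil(limit / numWorkers); // 5000 items per task
// If 400 page URLs were collected, each worker receives Math.ceil(400 / 4) = 100 consecutive URLs,
// and task i covers startIndex = i * 5000 through endIndex = Math.min((i + 1) * 5000, limit).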

private async waitForScrapingResults(tasks: any[]): Promise<any[]> {
// Create a map to store our workflow's results
const resultsMap = new Map<string, any[]>();

// Extract the workflow ID from the first task - all tasks in this batch will share the same workflow ID
const workflowId = tasks[0].workflowId;
console.log(`Waiting for results from workflow: ${workflowId}`);

// Create a Set of task IDs for quick lookup - these are the only tasks we care about
const expectedTaskIds = new Set(tasks.map(task => task.taskId));

// Create a consumer specifically for this workflow
const resultConsumer = this.kafka.consumer({
groupId: `scraping-group-results-${workflowId}`,
maxWaitTimeInMs: 1000,
maxBytesPerPartition: 2097152 // 2MB
});

try {
await resultConsumer.connect();
console.log('Result consumer connected successfully');

await resultConsumer.subscribe({
topic: kafkaConfig.topics.SCRAPING_RESULTS,
fromBeginning: true
});
console.log('Result consumer subscribed to topic successfully');

return new Promise((resolve, reject) => {
let isRunning = true;

resultConsumer.run({
eachMessage: async ({ topic, partition, message }) => {
if (!isRunning) return;

try {
const result = JSON.parse(message.value!.toString());

// Verify both task ID and workflow ID match
if (result.workflowId === workflowId && expectedTaskIds.has(result.taskId)) {
// Store this task's results
if (!resultsMap.has(result.taskId)) {
resultsMap.set(result.taskId, result.data);
console.log(`Received results for task ${result.taskId}. ` +
`Got ${resultsMap.size} of ${tasks.length} tasks from workflow ${workflowId}`);
}

// Check if we have all our workflow's results
if (resultsMap.size === tasks.length) {
isRunning = false;

// Sort tasks by their numeric index (extract number from task ID)
const sortedTasks = [...tasks].sort((a, b) => {
const aIndex = parseInt(a.taskId.split('-').pop() || '0');
const bIndex = parseInt(b.taskId.split('-').pop() || '0');
return aIndex - bIndex;
});

// Combine results in the sorted task order
const allResults = sortedTasks
.map(task => {
const taskResults = resultsMap.get(task.taskId);
if (!taskResults) {
console.warn(`Missing results for task ${task.taskId} in workflow ${workflowId}`);
return [];
}
return taskResults;
})
.flat();

console.log(`Successfully collected all results from workflow ${workflowId}`);

resolve(allResults);
}
}
} catch (error) {
console.error(`Error processing message in workflow ${workflowId}:`, error);
reject(error);
}
}
});

// // Add a timeout to prevent hanging
// const timeout = setTimeout(() => {
// if (isRunning) {
// isRunning = false;
// console.error(`Timeout waiting for results from workflow ${workflowId}. ` +
// `Received ${resultsMap.size} of ${tasks.length} expected results.`);
// reject(new Error(`Timeout waiting for results from workflow ${workflowId}`));
// }
// }, 30000); // 30 second timeout
});

} catch (error) {
console.error(`Fatal error in waitForScrapingResults for workflow ${workflowId}:`, error);
throw error;
}
}

Comment on lines +738 to +836
⚠️ Potential issue

Improve result collection reliability.

The current implementation has several issues:

  1. Commented-out timeout code
  2. Missing proper consumer cleanup
  3. No handling of partial results

Apply this diff to improve the implementation:

+interface ScrapingResult {
+    workflowId: string;
+    taskId: string;
+    data: any[];
+}
+
+const RESULT_COLLECTION_TIMEOUT_MS = 30000;
+
 private async waitForScrapingResults(tasks: any[]): Promise<any[]> {
     const resultsMap = new Map<string, any[]>();
     const workflowId = tasks[0].workflowId;
     console.log(`Waiting for results from workflow: ${workflowId}`);
     
     const expectedTaskIds = new Set(tasks.map(task => task.taskId));
     
     const resultConsumer = this.kafka.consumer({ 
         groupId: `scraping-group-results-${workflowId}`,
         maxWaitTimeInMs: 1000,
         maxBytesPerPartition: 2097152
     });
 
     try {
         await resultConsumer.connect();
         console.log('Result consumer connected successfully');
         
         await resultConsumer.subscribe({ 
             topic: kafkaConfig.topics.SCRAPING_RESULTS,
             fromBeginning: true 
         });
         console.log('Result consumer subscribed to topic successfully');
 
         return new Promise((resolve, reject) => {
             let isRunning = true;
+            const timeout = setTimeout(() => {
+                if (isRunning) {
+                    isRunning = false;
+                    const error = new Error(
+                        `Timeout waiting for results from workflow ${workflowId}. ` +
+                        `Received ${resultsMap.size} of ${tasks.length} expected results.`
+                    );
+                    reject(error);
+                }
+            }, RESULT_COLLECTION_TIMEOUT_MS);
 
             resultConsumer.run({
                 eachMessage: async ({ topic, partition, message }) => {
                     if (!isRunning) return;
                     
                     try {
-                        const result = JSON.parse(message.value!.toString());
+                        const result = JSON.parse(message.value!.toString()) as ScrapingResult;
                         
                         if (result.workflowId === workflowId && expectedTaskIds.has(result.taskId)) {
                             if (!resultsMap.has(result.taskId)) {
                                 resultsMap.set(result.taskId, result.data);
                                 console.log(
                                     `Received results for task ${result.taskId}. ` +
                                     `Got ${resultsMap.size} of ${tasks.length} tasks from workflow ${workflowId}`
                                 );
                             }
 
                             if (resultsMap.size === tasks.length) {
                                 isRunning = false;
+                                clearTimeout(timeout);
                                 
                                 const sortedTasks = [...tasks].sort((a, b) => {
                                     const aIndex = parseInt(a.taskId.split('-').pop() || '0');
                                     const bIndex = parseInt(b.taskId.split('-').pop() || '0');
                                     return aIndex - bIndex;
                                 });
 
                                 const allResults = sortedTasks
                                     .map(task => resultsMap.get(task.taskId) || [])
                                     .flat();
 
                                 console.log(`Successfully collected all results from workflow ${workflowId}`);
                                 
+                                await resultConsumer.disconnect();
                                 resolve(allResults);
                             }
                         }
                     } catch (error) {
                         console.error(`Error processing message in workflow ${workflowId}:`, error);
+                        clearTimeout(timeout);
+                        await resultConsumer.disconnect();
                         reject(error);
                     }
                 }
             });
         });
     } catch (error) {
         console.error(`Fatal error in waitForScrapingResults for workflow ${workflowId}:`, error);
+        await resultConsumer.disconnect();
         throw error;
     }
 }

private async handlePagination(page: Page, config: { listSelector: string, fields: any, limit?: number, pagination: any }) {
let allResults: Record<string, any>[] = [];
let previousHeight = 0;
@@ -556,6 +850,7 @@ export default class Interpreter extends EventEmitter {
await page.waitForTimeout(2000);

const currentHeight = await page.evaluate(() => document.body.scrollHeight);
console.log(`Current scroll height: ${currentHeight}`);
if (currentHeight === previousHeight) {
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
23 changes: 23 additions & 0 deletions maxun-core/src/scripts/setup-kafka.ts
@@ -0,0 +1,23 @@
import { KafkaManager } from '../utils/kafka-manager';

async function setupKafka() {
const manager = new KafkaManager();

try {
console.log('Initializing Kafka manager...');
await manager.initialize();
console.log('Kafka setup completed successfully');

// Keep monitoring for a while to verify setup
setTimeout(async () => {
await manager.cleanup();
process.exit(0);
}, 10000);

} catch (error) {
console.error('Failed to setup Kafka:', error);
process.exit(1);
}
}
Comment on lines +3 to +21
🛠️ Refactor suggestion

Improve Kafka setup monitoring.

The current implementation has several issues:

  1. Hardcoded timeout duration
  2. Forced cleanup after timeout regardless of setup status
  3. Missing configuration validation

Apply this diff to improve the implementation:

+const KAFKA_SETUP_TIMEOUT_MS = 10000; // Move to config
+
 async function setupKafka() {
     const manager = new KafkaManager();
     
     try {
         console.log('Initializing Kafka manager...');
+        // Validate Kafka configuration
+        if (!manager.validateConfig()) {
+            throw new Error('Invalid Kafka configuration');
+        }
+
         await manager.initialize();
         console.log('Kafka setup completed successfully');
         
-        // Keep monitoring for a while to verify setup
-        setTimeout(async () => {
-            await manager.cleanup();
-            process.exit(0);
-        }, 10000);
+        // Monitor setup status
+        const setupTimeout = new Promise((_, reject) => {
+            setTimeout(() => reject(new Error('Kafka setup verification timeout')), 
+                KAFKA_SETUP_TIMEOUT_MS);
+        });
+
+        try {
+            await Promise.race([
+                manager.verifySetup(),
+                setupTimeout
+            ]);
+            console.log('Kafka setup verified successfully');
+            await manager.cleanup();
+            process.exit(0);
+        } catch (error) {
+            throw new Error(`Kafka setup verification failed: ${error.message}`);
+        }
     } catch (error) {
         console.error('Failed to setup Kafka:', error);
         process.exit(1);
     }
 }

Committable suggestion skipped: line range outside the PR's diff.
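
Note that validateConfig() and verifySetup() are not defined on KafkaManager in this PR; a rough sketch of what such helpers could check (hypothetical, assuming the manager keeps a kafkajs Admin client after initialize() and imports kafkaConfig) is:

// Hypothetical KafkaManager additions implied by the suggestion above; not part of the PR.
public validateConfig(): boolean {
  // A broker list and the expected topic names are the minimum needed to initialize.
  return kafkaConfig.brokers.length > 0 && Object.keys(kafkaConfig.topics).length > 0;
}

public async verifySetup(): Promise<void> {
  const existingTopics = await this.admin.listTopics();
  const expectedTopics = Object.values(kafkaConfig.topics);
  const missing = expectedTopics.filter(topic => !existingTopics.includes(topic));
  if (missing.length > 0) {
    throw new Error(`Missing Kafka topics: ${missing.join(', ')}`);
  }
}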


setupKafka().catch(console.error);