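"""Hybrid scraping helper.

Fetches pages with a plain requests session first and falls back to headless
Chrome via Selenium when needed, parses them with lxml XPath queries, and
stores the extracted records with pandas.
"""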
from multiprocessing import Pool
import os
from datetime import datetime
import lxml.html as html
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import warnings
import requests

warnings.filterwarnings("ignore")

class SeleniumScraper:
    def __init__(self, timeout=10):
        self.timeout = timeout
        self.reqSession = requests.Session()
        # Timestamp baked into output file names so repeated runs don't collide.
        self.stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        # Default storage location: the directory containing this script.
        self.storagePath = os.path.dirname(os.path.abspath(__file__))

        # Browser-like headers for plain HTTP requests.
        self.headers = {
            'authority': 'www.amazon.com',
            'pragma': 'no-cache',
            'cache-control': 'no-cache',
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'none',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-dest': 'document',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
        }

    def fetch_request_normal(self, url, params=None):
        """Fetch a URL with requests; return the HTML text, or None on failure."""
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
            }
            response = self.reqSession.get(
                url, headers=headers, params=params, timeout=self.timeout
            )

            if response.status_code == 200:
                return response.text

            if response.status_code == 301:
                # Follow the redirect target manually and retry once.
                response = self.reqSession.get(
                    response.headers['Location'], headers=headers, timeout=self.timeout
                )
                response.raise_for_status()
                return response.text

            if response.status_code == 503:
                # The site is blocking the request; let the caller fall back to Selenium.
                return None

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
        return None

    def get_xpath_link(self, doc, xpath, website):
        """Extract links via XPath and make relative hrefs absolute."""
        try:
            # xpath may arrive as a string or as a list of string fragments.
            links = doc.xpath("".join(xpath))
            return [website + link if link.startswith("/") else link for link in links]

        except Exception as e:
            print("Error in getting links for xpath {}: {}".format(xpath, e))
            return None

    def get_selenium_driver(self):
        """Build a headless Chrome driver tuned for fast, quiet scraping."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-extensions")
        chrome_options.add_argument("--disable-logging")
        chrome_options.add_argument("--log-level=3")
        chrome_options.add_argument("--silent")
        # Skip image downloads to speed up page loads.
        chrome_options.add_argument("--blink-settings=imagesEnabled=false")
        # Selenium 4 removed the chrome_options keyword; pass options instead.
        driver = webdriver.Chrome(options=chrome_options)
        return driver

    def fetch_request_selenium(self, url, waiting_time=1):
        """Render a URL in headless Chrome and return it as an lxml document."""
        driver = None
        try:
            driver = self.get_selenium_driver()
            driver.get(url)
            # Give client-side JavaScript time to render.
            time.sleep(waiting_time)
            return html.fromstring(driver.page_source)

        except Exception as e:
            print("Exception occurred for url: {} and exception: {}".format(url, e))
            return None
        finally:
            if driver is not None:
                # quit() shuts down the whole browser process, not just the tab.
                driver.quit()

    def get_xpath_data(self, doc, xpath):
        """Run an XPath query and return the matching nodes."""
        try:
            return doc.xpath(xpath)

        except Exception as e:
            print("Error in getting data for xpath {}: {}".format(xpath, e))
            return None

    def slow_page_scroll(self, driver, speed):
        """Scroll down in 1000px steps so lazy-loaded content has time to appear."""
        current_scroll_position = driver.execute_script("return window.pageYOffset;")
        while current_scroll_position < driver.execute_script(
            "return document.body.scrollHeight;"
        ):
            driver.execute_script(
                "window.scrollTo(0, arguments[0]);", current_scroll_position
            )
            current_scroll_position += 1000
            time.sleep(speed)

    def data_storage(self, df_list, unique_id, name, storageFormat, storagePath=None):
        """Combine partial DataFrames, drop duplicates, and write CSV or JSON."""
        storagePath = storagePath or self.storagePath
        df_combined = pd.concat(df_list, ignore_index=True)
        df_combined.drop_duplicates(subset=unique_id, inplace=True)
        if storageFormat == "csv":
            df_combined.to_csv(
                os.path.join(storagePath, "{}_{}.csv".format(name, self.stamp)),
                index=False,
            )
        elif storageFormat == "json":
            df_combined.to_json(
                os.path.join(storagePath, "{}_{}.json".format(name, self.stamp)),
                orient="records",
            )

    def cleanData(self, array):
        """Normalize scraped strings: trim, drop empties, strip non-ASCII and newlines."""
        array = [x.strip() for x in array]
        array = list(filter(None, array))
        array = [x.encode("ascii", "ignore").decode() for x in array]
        array = [x.replace("\n", "") for x in array]
        return array

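# Minimal usage sketch (not part of the class): the URL and XPath expressions
# below are hypothetical placeholders, shown only to illustrate the intended
# flow: plain requests first, Selenium as the fallback when the site blocks us.
if __name__ == "__main__":
    scraper = SeleniumScraper(timeout=10)

    page = scraper.fetch_request_normal("https://www.example.com/")
    if page is not None:
        doc = html.fromstring(page)
    else:
        # fetch_request_normal returns None on a 503 block; render with Chrome instead.
        doc = scraper.fetch_request_selenium("https://www.example.com/", waiting_time=2)

    if doc is not None:
        links = scraper.get_xpath_link(doc, "//a/@href", "https://www.example.com")
        titles = scraper.cleanData(scraper.get_xpath_data(doc, "//h1//text()"))
        df = pd.DataFrame({"title": titles})
        scraper.data_storage([df], unique_id="title", name="example", storageFormat="csv")
        print("Saved {} titles and found {} links".format(len(titles), len(links)))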