In [1]:
# Use the explicit %pip magic so the packages are installed into the
# environment of the running kernel even when IPython automagic is disabled.
%pip install pytesseract pillow beautifulsoup4 requests pandas


Defaulting to user installation because normal site-packages is not writeable
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import shutil
from io import BytesIO

import pandas as pd
import pytesseract
import requests
from bs4 import BeautifulSoup
from PIL import Image

# Resolve the Tesseract executable without assuming one machine's layout:
# an explicit TESSERACT_CMD environment variable wins, then whatever is found
# on PATH, and finally the previously hard-coded location as a fallback.
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get('TESSERACT_CMD')
    or shutil.which('tesseract')
    or r'/usr/local/bin/tesseract'
)

In [2]:
def extract_text_from_image(image_url, timeout=10):
    """Download an image and return the text Tesseract recognises in it.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the image.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail here on 4xx/5xx instead of handing an error page to PIL, which
    # would otherwise surface as a confusing "cannot identify image" error.
    response.raise_for_status()

    # The context manager releases the image's resources once OCR is done.
    with Image.open(BytesIO(response.content)) as img:
        return pytesseract.image_to_string(img)

# Function to scrape Flipkart based on the category URL
def scrape_flipkart(url, category, timeout=10):
    """Scrape one Flipkart listing page and OCR each product image.

    Args:
        url: Listing page URL to fetch.
        category: Label stored under the ``'Category'`` key of every result.
        timeout: Seconds to wait for the listing page before giving up, so a
            stalled connection cannot hang the notebook.

    Returns:
        A list of dicts with the keys ``'Category'``, ``'Product Name'``,
        ``'Price'``, ``'Rating'``, ``'Image URL'`` and ``'Extracted Text'``.
        The list is empty when the page answers with an HTTP error status or
        when no container yields a complete product.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}

    response = requests.get(url, headers=headers, timeout=timeout)
    if not response.ok:
        # An error/blocked page has no product markup; say so explicitly
        # instead of silently returning nothing.
        print(f"Request for category '{category}' failed with HTTP {response.status_code}; no products scraped")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    products = []

    # Modify the below class names and tags according to the actual structure of the Flipkart page.
    product_divs = soup.find_all('div', class_='_1AtVbE')  # Example class for product containers

    for product in product_divs:
        try:
            # Each find() returns None when the element is absent, so the
            # attribute/item access raises and the container is skipped below.
            product_name = product.find('a', class_='IRpwTa').text  # Example for product name
            product_price = product.find('div', class_='_30jeq3').text  # Example for product price
            product_rating = product.find('div', class_='_3LWZlK').text  # Example for product rating
            image_url = product.find('img', class_='_396cs4')['src']  # Example for product image URL

            # Extract text from image (OCR)
            ocr_text = extract_text_from_image(image_url)

            products.append({
                'Category': category,
                'Product Name': product_name,
                'Price': product_price,
                'Rating': product_rating,
                'Image URL': image_url,
                'Extracted Text': ocr_text
            })
        except Exception as e:
            # Skip any product without complete details (this also covers a
            # failed image download or OCR run for that product).
            print(f"Skipping a product due to missing data: {e}")

    return products

# Common details to be extracted for all categories
def extract_common_data(product):
    """Pick the category-independent fields out of a scraped product dict.

    Args:
        product: Mapping that may contain ``'Product Name'``, ``'Price'``,
            ``'Rating'``, ``'Image URL'`` and ``'Extracted Text'``.

    Returns:
        A new dict with those fields; any missing one defaults to ``''``. The
        OCR text is stored under the output key ``'Extracted Text (OCR)'``.
    """
    # (output column, key in the scraped product), in output order.
    column_sources = (
        ('Product Name', 'Product Name'),
        ('Price', 'Price'),
        ('Rating', 'Rating'),
        ('Image URL', 'Image URL'),
        ('Extracted Text (OCR)', 'Extracted Text'),
    )
    return {column: product.get(source, '') for column, source in column_sources}

# Category-specific data extraction (example for 'Fashion & Apparel')
def extract_fashion_data(product):
    """Return the extra columns for the 'Fashion & Apparel' category.

    Both values are currently fixed placeholders; ``product`` is not read yet.
    Replace them with real extraction logic once the details are available.
    """
    placeholder = 'To be extracted'
    return dict(Material=placeholder, Brand=placeholder)

# Category-specific data extraction (example for 'Electronics')
def extract_electronics_data(product):
    """Return the extra columns for the 'Electronics' category.

    Both values are currently fixed placeholders; ``product`` is not read yet.
    Replace them with real extraction logic once the details are available.
    """
    return dict.fromkeys(('Warranty', 'Specifications'), 'To be extracted')

# Function to save scraped data into a CSV
def save_to_csv(data, file_name):
    """Write ``data`` to ``file_name`` as CSV and print a confirmation.

    Args:
        data: Anything ``pd.DataFrame`` accepts, e.g. a list of row dicts.
        file_name: Destination path of the CSV file.
    """
    # index=False keeps the DataFrame's row index out of the file.
    pd.DataFrame(data).to_csv(file_name, index=False)
    print("Data saved to {}".format(file_name))

# Listing page to scrape for each category label. The label is stored on
# every scraped product and selects the category-specific extractor.
category_urls = {
    'Fashion & Apparel': (
        'https://www.flipkart.com/clothing-and-accessories/pr'
        '?sid=clo&otracker=categorytree'
        '&p%5B%5D=facets.ideal_for%255B%255D%3DMen'
        '&otracker=nmenu_sub_Men_0_Clothing'
    ),
    'Electronics': (
        'https://www.flipkart.com/mobile-phones-store'
        '?otracker=nmenu_sub_Electronics_0_Mobiles'
    ),
    'Beauty Products': (
        'https://www.flipkart.com/beauty-and-grooming/pr'
        '?sid=g9b&p[]=facets.serviceability%5B%5D%3Dtrue'
        '&otracker=categorytree'
        '&otracker=nmenu_sub_Women_0_Beauty%20%26%20Grooming'
    ),
}

def scrape_all_categories():
    """Scrape every entry in ``category_urls`` and save the rows to one CSV.

    Each row holds the category label, the common product fields and, where a
    category-specific extractor exists, that extractor's extra columns. The
    rows are written to ``flipkart_products_data.csv``.

    Returns:
        The list of row dicts that was written, so the results can be
        inspected in the notebook without re-reading the CSV.
    """
    # Category label -> extractor for its extra columns. Categories without
    # an entry contribute only the common fields.
    category_extractors = {
        'Fashion & Apparel': extract_fashion_data,
        'Electronics': extract_electronics_data,
        # Add more categories with specific extraction logic as needed
    }

    all_data = []
    for category, url in category_urls.items():
        print(f"Scraping category: {category}")
        extract_extra = category_extractors.get(category)

        for product in scrape_flipkart(url, category):
            # Keep the category on every row; all categories share one CSV,
            # and without it the rows cannot be told apart.
            row = {'Category': product.get('Category', category)}
            row.update(extract_common_data(product))
            if extract_extra is not None:
                row.update(extract_extra(product))
            all_data.append(row)

    # Save all scraped data to a CSV file
    save_to_csv(all_data, 'flipkart_products_data.csv')
    return all_data

