In [1]:
from urllib.parse import urljoin

import openpyxl
import requests
from bs4 import BeautifulSoup
from openpyxl.workbook import Workbook

In [2]:
# Base address of the site; the functions below fetch it and combine it with
# scraped hrefs to build full links.
url = "https://www.flipkart.com"

In [3]:
# Module-level scraper state, shared by the functions below through ``global``.
temp_data = []        # cells of the row that is currently being assembled
final_data = []       # finished rows waiting to be handed to upload_to_excel
is_first_call = True  # flipped to False by upload_to_excel after its first write

# Header row written at the top of every worksheet.
column_titles = [
    "Name",
    "Selling_price",
    "Colour",
    "ROM",
    "Memory",
    "Display",
    "Camera",
    "Battery",
    "Processor",
    "Others",
    "Product_ratings",
    "Camera_ratings",
    "Battery_ratings",
    "Display_ratings",
    "Value for Money",
    "Seller_Name",
    "Seller_Ratings",
    "Product_Link",
]

In [4]:
def get_categories_from_topbar(url):
    """Print the top-bar categories of a page and return the "Mobiles" link.

    Fetches ``url``, parses the response with BeautifulSoup and prints the
    text of every ``div`` with class ``eFQ30H``, each followed by a separator
    line.

    Args:
        url: Address of the page to fetch.

    Returns:
        The ``href`` of the first anchor inside the category whose text is
        exactly "Mobiles" (the last such category wins if there are several).
        An empty string is returned when no such category exists or when it
        has no anchor / no ``href``.
    """
    # Request to url page for data
    page = requests.get(url)

    # Html content parsing
    soup = BeautifulSoup(page.content, 'html.parser')

    # Fetch category tiles
    top_category_bar = soup.find_all('div', class_="eFQ30H")

    # Target category link to return
    mobile_category_link = ""

    print("List of categories at top-bar home page\n")

    # Each tile yields a title and, when present, its link
    for category in top_category_bar:
        category_name = category.text

        # A tile without an anchor, or an anchor without href, has no link.
        # Handled explicitly instead of through a catch-all ``except``.
        anchor = category.find('a')
        category_link = anchor.get('href', "") if anchor is not None else ""

        # Remember the mobile category link for the later extraction steps
        if category_name == "Mobiles":
            mobile_category_link = category_link

        print(f"{category_name}")
        print("----------------------------------")

    return mobile_category_link

In [5]:
# Print the home-page top-bar categories and keep the link of the "Mobiles" one
mobile_category_page_link = get_categories_from_topbar(url)

List of categories at top-bar home page

Mobiles
----------------------------------
Fashion
----------------------------------
Electronics
----------------------------------
Home
----------------------------------
Travel
----------------------------------
Appliances
----------------------------------
Furniture
----------------------------------
Beauty,Toys & more
----------------------------------
Grocery
----------------------------------


In [6]:
def upload_to_excel(final_data):
    """Write a header row followed by the given rows to the output workbook.

    The workbook ``output_flipkart_data.xlsx`` is loaded, or created when the
    file does not exist yet. While ``is_first_call`` is true the rows go to
    the workbook's active sheet and the flag is cleared; every later call
    writes to a newly created sheet. The workbook is saved back to the same
    file name.

    Args:
        final_data: Iterable of rows; each row is appended as one worksheet
            row below the ``column_titles`` header.
    """
    global is_first_call

    workbook_name = 'output_flipkart_data.xlsx'
    try:
        wb = openpyxl.load_workbook(workbook_name)
    except FileNotFoundError:
        # Fresh run: nothing has created the output file yet, so start one
        wb = openpyxl.Workbook()

    try:
        if is_first_call:
            # Write to default sheet during first call
            page = wb.active
            is_first_call = False
        else:
            # Creates new sheet for next mobiles list
            page = wb.create_sheet()

        # Write the headers to the first line
        page.append(column_titles)

        # Data to write
        for info in final_data:
            page.append(info)
        wb.save(workbook_name)
    finally:
        # Release the workbook even when appending or saving fails
        wb.close()


In [7]:
def add_list_contents(content_list):
    """Sort highlight entries into spec fields and append them to temp_data.

    Every element of ``content_list`` is classified by its ``.text``. The
    first matching rule, checked in the order memory ('RAM'), display
    ('Display'), camera ('camera', case-insensitive), battery ('Battery'),
    processor ('Processor'), claims the entry; a later entry for the same
    field replaces an earlier one. Entries matching no rule are collected as
    "other" text.

    Six values are appended to the global ``temp_data``: memory, display,
    camera, battery, processor and the stripped "other" text. Fields without
    a match are empty strings.
    """
    global temp_data

    # Order matters: an entry is assigned to the first rule it satisfies.
    rules = (
        ("memory", lambda text: 'RAM' in text),
        ("display", lambda text: 'Display' in text),
        ("camera", lambda text: 'camera' in text.lower()),
        ("battery", lambda text: 'Battery' in text),
        ("processor", lambda text: 'Processor' in text),
    )
    specs = {name: "" for name, _ in rules}
    leftovers = []

    for entry in content_list:
        text = entry.text
        matched = next((name for name, accepts in rules if accepts(text)), None)
        if matched is None:
            leftovers.append(text)
        else:
            specs[matched] = text

    temp_data.extend(specs[name] for name, _ in rules)
    # Unmatched entries are separated by two spaces, outer whitespace removed
    temp_data.append("  ".join(leftovers).strip())
    
    
    

In [8]:
def get_seller_info(link):
    """Append feature ratings and seller details of a product page to temp_data.

    Fetches ``link`` and appends six values to the global ``temp_data``, in
    this order: camera rating, battery rating, display rating, value-for-money
    rating, seller name, seller rating.

    Feature ratings come from the ``div`` elements with class ``_2a78PX``;
    a rating that is not present on the page stays an empty string. The
    seller name and seller rating are both "NA" when any part of the seller
    block is missing.

    Args:
        link: URL of the product detail page.
    """
    global temp_data

    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')

    try:
        seller = soup.find('div', {"id": "sellerName"}).find('span')
        seller_name = seller.find('span').text
        seller_ratings = seller.find('div', class_="_3LWZlK _1D-8OL").text
    except AttributeError:
        # find() returned None somewhere along the chain: no seller block
        seller_name = "NA"
        seller_ratings = "NA"

    # Keyword looked for in the tile label -> rating text found for it
    feature_ratings = {"Camera": "", "Battery": "", "Display": "", "Money": ""}

    for rating in soup.find_all('div', class_="_2a78PX"):
        rate_tag = rating.find('div', class_='_2aWUii')
        label_tag = rating.find('div', class_='_3npa3F')
        if rate_tag is None or label_tag is None:
            # Incomplete tile: skip it instead of aborting the whole scrape
            continue

        label = label_tag.text
        # A label may mention several keywords; each one gets the rating
        for keyword in feature_ratings:
            if keyword in label:
                feature_ratings[keyword] = rate_tag.text

    temp_data.append(feature_ratings["Camera"])
    temp_data.append(feature_ratings["Battery"])
    temp_data.append(feature_ratings["Display"])
    temp_data.append(feature_ratings["Money"])
    temp_data.append(seller_name)
    temp_data.append(seller_ratings)
    

In [9]:
def get_mobiles_data_list(mobile_company_page_link):
    """Scrape one mobile-listing page and hand its rows to upload_to_excel.

    Fetches the page and examines the first 24 result containers. For every
    container that holds a product (it has both a price and a detail link)
    one row is assembled in the global ``temp_data``:

    * name, selling price, colour and ROM taken from the tile (name, colour
      and ROM are "NA" when the title is missing or not in the
      ``Name (Colour, ROM)`` form),
    * the six fields appended by ``add_list_contents``,
    * the tile rating ("NA" when absent),
    * the six fields appended by ``get_seller_info`` for the product's own
      detail page,
    * the detail-page link.

    Finished rows are collected in the global ``final_data`` and passed to
    ``upload_to_excel`` once 24 rows are available; rows still pending when
    the page is exhausted are written as well, so nothing is left behind or
    carried into the next page.

    Args:
        mobile_company_page_link: URL of the listing page to scrape.
    """
    global temp_data
    global final_data

    mobile_company_page = requests.get(mobile_company_page_link)
    soup = BeautifulSoup(mobile_company_page.content, 'html.parser')

    mobile_divison = soup.find('div', class_="_1YokD2 _3Mn1Gg")
    if mobile_divison is None:
        print(f"No mobile listing found at {mobile_company_page_link}")
        return

    mobile_containers = mobile_divison.find_all('div', class_="_1AtVbE col-12-12")

    # Only the first 24 containers of a page are examined
    for mobile in mobile_containers[:24]:
        price_tag = mobile.find('div', class_="_30jeq3 _1_WHN1")
        link_tag = mobile.find('a', class_="_1fQZEK")
        href = link_tag.get('href') if link_tag is not None else None
        if price_tag is None or not href:
            # Container without price or detail link is not a product tile
            continue

        # Start every row from scratch so cells left over from a row that
        # was aborted midway cannot shift the columns of this one
        temp_data = []

        try:
            title = mobile.find('div', class_="_4rR01T").text
            name = title.split('(')[0].strip()
            details = title.split('(')[1]
            colour = details.split(',')[0].strip()
            rom = details.split(',')[1].split(')')[0]
        except (AttributeError, IndexError):
            # Title missing or not shaped like "Name (Colour, ROM)"
            name = "NA"
            colour = "NA"
            rom = "NA"

        temp_data.append(name)
        temp_data.append(price_tag.text)
        temp_data.append(colour)
        temp_data.append(rom.strip())

        add_list_contents(mobile.find_all('li', class_="rgWa7D"))

        rating_tag = mobile.find('div', class_="_3LWZlK")
        temp_data.append(rating_tag.text if rating_tag is not None else "NA")

        # Link of THIS product (not of the first container on the page)
        detail_link = urljoin(url, href)

        get_seller_info(detail_link)

        temp_data.append(detail_link)

        final_data.append(temp_data)

        if len(final_data) == 24:
            upload_to_excel(final_data)
            final_data = []

    # Pages with fewer than 24 products: write what was collected
    if final_data:
        upload_to_excel(final_data)
        final_data = []

In [10]:
def get_mobile_list_page_link(mobile_category_page_link):
    """Visit every company tile on the mobile category page and scrape it.

    Fetches ``mobile_category_page_link``, finds the ``div`` elements with
    class ``_3YgSsQ`` and, for each one that carries an anchor of class
    ``h1Fvn6`` with an ``href``, calls ``get_mobiles_data_list`` with that
    link resolved against the global ``url``. Tiles without such a link are
    skipped.

    Args:
        mobile_category_page_link: URL of the mobile category page.
    """
    page = requests.get(mobile_category_page_link)
    soup = BeautifulSoup(page.content, 'html.parser')

    for category in soup.find_all('div', class_="_3YgSsQ"):
        anchor = category.find('a', class_="h1Fvn6")
        href = anchor.get('href') if anchor is not None else None
        if not href:
            # Tile without a usable link: nothing to follow
            continue

        # urljoin copes with relative as well as already-absolute hrefs
        mobile_company_page_link = urljoin(url, href)
        get_mobiles_data_list(mobile_company_page_link)

In [11]:
# Run the scrape, starting from the "Mobiles" category link obtained earlier
get_mobile_list_page_link(mobile_category_page_link)