In [1]:
import requests
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns

In [2]:
def get_product_urls(search_query, num_pages, filters=None, timeout=30):
    """Collect product-page URLs from Flipkart search result pages.

    Parameters
    ----------
    search_query : str
        Value placed in the ``q=`` query parameter. It is inserted into the
        URL as-is (no URL-encoding is applied).
    num_pages : int
        Number of result pages to request; pages 1..num_pages are fetched.
    filters : str, optional
        Extra query-string fragment inserted between the query and the page
        number. When omitted, the module-level ``filter_parameter`` variable
        is used, which is what this function always read before the
        parameter existed.
    timeout : float, optional
        Seconds to wait for each HTTP response before giving up on a page.

    Returns
    -------
    list of str
        Absolute product URLs, in the order they were found. Pages that could
        not be retrieved are reported with ``print`` and skipped.
    """
    if filters is None:
        # Backward-compatible fallback to the notebook-level setting.
        filters = filter_parameter

    product_urls = []

    for page in range(1, num_pages + 1):
        search_url = f'https://www.flipkart.com/search?q={search_query}&{filters}&page={page}'

        try:
            # Without a timeout a stalled connection would block the notebook forever.
            response = requests.get(search_url, timeout=timeout)
        except requests.RequestException as exc:
            # Treat network errors like a non-200 response: report and move on.
            print(f'Failed to retrieve search results on page {page}: {exc}')
            continue

        if response.status_code != 200:
            print(f'Failed to retrieve search results on page {page}')
            continue

        soup = BeautifulSoup(response.text, 'html.parser')
        # Find all the product URLs on the search result page
        for link in soup.find_all('a', {'class': '_1fQZEK'}):
            href = link.get('href')
            # An anchor without an href would make the concatenation raise TypeError.
            if href:
                product_urls.append('https://www.flipkart.com' + href)

    return product_urls

# Search settings for the crawl.
# `filter_parameter` is read as a global inside get_product_urls, so it has to
# be assigned before the call below. It appears to be taken from the filter
# part of this Flipkart URL:
# https://www.flipkart.com/search?sid=tyy%2C4io&otracker=CLP_Filters&p%5B%5D=facets.brand%255B%255D%3DOnePlus
filter_parameter = 'Filters&p%5B%5D=facets.brand%255B%255D%3DOnePlus'
num_pages = 9
search_query = 'mobiles'

# Fetch the product links from result pages 1..num_pages
product_urls = get_product_urls(search_query=search_query, num_pages=num_pages)

In [3]:
# Function to scrape data from an individual product page
def scrape_product_page(product_url, timeout=30):
    """Scrape title and price information from one Flipkart product page.

    Parameters
    ----------
    product_url : str
        Absolute URL of the product page.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    tuple or None
        ``(title, price_without_discount, discount, price_after_discount)``
        holding the raw text of the matching elements, or ``None`` when the
        page could not be retrieved (a message is printed in that case).

        Missing elements are represented as follows:

        * ``title`` -> the string ``"NULL"``
        * ``price_without_discount`` -> whatever was found for the selling
          price (which is ``None`` if that element is missing too)
        * ``discount`` -> ``None``
        * ``price_after_discount`` -> the string ``"NULL"``
    """
    try:
        # Without a timeout a stalled connection would block the notebook forever.
        response = requests.get(product_url, timeout=timeout)
    except requests.RequestException:
        # Treat network errors like a non-200 response: report and return nothing.
        print('Failed to retrieve the product page:', product_url)
        return None

    if response.status_code != 200:
        print('Failed to retrieve the product page:', product_url)
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    def _text_of(tag_name, css_class):
        # Text of the first matching element, or None when the page has none.
        elem = soup.find(tag_name, {'class': css_class})
        return elem.text if elem else None

    # Extract the data you need from the HTML using BeautifulSoup
    title = _text_of('span', 'B_NuCI')
    price_without_discount = _text_of('div', '_3I9_wc _2p6lqe')
    discount = _text_of('div', '_3Ay6Sb _31Dcoz')
    price_after_discount = _text_of('div', '_30jeq3 _16Jk6d')

    if title is None:
        title = "NULL"
    if price_without_discount is None:
        # No struck-through list price on the page: reuse the selling price.
        price_without_discount = price_after_discount
    # A missing discount is deliberately left as None. The previous code tried
    # to replace it with "NULL" but assigned to a misspelled name, so None is
    # what has always been returned.
    if price_after_discount is None:
        price_after_discount = "NULL"

    return (title, price_without_discount, discount, price_after_discount)

In [4]:
# Visit every product page in turn and keep the tuples that were scraped
data_list = []

for url in product_urls:
    product_data = scrape_product_page(url)
    if not product_data:
        # Nothing usable came back for this URL, so there is no row to record
        continue
    data_list.append(product_data)

print(data_list)

[('OnePlus Nord CE 3 Lite 5G (Pastel Lime, 128 GB)\xa0\xa0(8 GB RAM)', '₹19,999', None, '₹19,999'), ('OnePlus Nord CE 3 Lite 5G (Pastel Lime, 256 GB)\xa0\xa0(8 GB RAM)', '₹21,999', None, '₹21,999'), ('OnePlus Nord CE 3 Lite 5G (Chromatic Gray, 128 GB)\xa0\xa0(8 GB RAM)', '₹19,999', None, '₹19,999'), ('OnePlus Nord CE 2 Lite 5G (Black Dusk, 128 GB)\xa0\xa0(6 GB RAM)', '₹19,999', '12% off', '₹17,565'), ('OnePlus Nord CE 3 Lite 5G (Chromatic Gray, 256 GB)\xa0\xa0(8 GB RAM)', '₹21,999', None, '₹21,999'), ('OnePlus 11R 5G (Sonic Black, 128 GB)\xa0\xa0(8 GB RAM)', '₹39,999', '1% off', '₹39,439'), ('OnePlus 11R 5G (Sonic Black, 256 GB)\xa0\xa0(16 GB RAM)', '₹44,999', '3% off', '₹43,645'), ('OnePlus 11R 5G (Galactic Silver, 128 GB)\xa0\xa0(8 GB RAM)', '₹39,999', None, '₹39,750'), ('OnePlus 11R 5G (Sonic Black, 128 GB)\xa0\xa0(8 GB RAM)', '₹39,999', '1% off', '₹39,463'), ('OnePlus 10R (Sierra Black, 128 GB)\xa0\xa0(8 GB RAM)', '₹38,999', '27% off', '₹28,300'), ('OnePlus Nord 3 5G (Misty Green, 

In [5]:
# Turn the scraped tuples into a table, one row per product page
flipkart_data = pd.DataFrame(
    data=data_list,
    columns=[
        'product_name',
        'price_without_discount',
        'discount_percent',
        'price_after_discount',
    ],
)

flipkart_data

Unnamed: 0,product_name,price_without_discount,discount_percent,price_after_discount
0,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, 128 GB...","₹19,999",,"₹19,999"
1,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, 256 GB...","₹21,999",,"₹21,999"
2,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, 128...","₹19,999",,"₹19,999"
3,"OnePlus Nord CE 2 Lite 5G (Black Dusk, 128 GB)...","₹19,999",12% off,"₹17,565"
4,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, 256...","₹21,999",,"₹21,999"
...,...,...,...,...
210,"OnePlus Nord 2T 5G (Gray Shadow, 256 GB) (12 ...","₹33,990",14% off,"₹29,198"
211,"OnePlus 6T (Speed Orange, 256 GB) (10 GB RAM)","₹39,999",,"₹39,999"
212,"OnePlus 6T (Mirror Black, 128 GB) (6 GB RAM)","₹30,999",,"₹30,999"
213,"OnePlus 6T (Midnight Black, 128 GB) (8 GB RAM)","₹41,999",47% off,"₹21,999"


In [10]:
def _rupees_to_int(prices):
    """Convert price strings such as '₹19,999' to integers (19999)."""
    return (
        prices
        # regex=False: treat the patterns as literal text on every pandas version
        .str.replace('₹', '', regex=False)
        .str.replace(',', '', regex=False)
        .astype(int)
    )


def _discount_to_int(discounts):
    """Convert discount labels such as '12% off' to integers; missing values become 0."""
    digits = (
        discounts
        .str.replace('%', '', regex=False)
        .str.replace(' off', '', regex=False)
    )
    # to_numeric keeps missing entries as NaN so they can be filled with 0;
    # it still raises on text that is not a number.
    return pd.to_numeric(digits).fillna(0).astype(int)


# Work on a copy so the raw scraped table stays available for re-runs
cleaned_data = flipkart_data.copy()

cleaned_data['price_without_discount'] = _rupees_to_int(cleaned_data['price_without_discount'])
cleaned_data['price_after_discount'] = _rupees_to_int(cleaned_data['price_after_discount'])

# NOTE: an earlier version called sort_values(by='price_after_discount') here
# without keeping the result, which had no effect. The call was removed, so the
# rows stay in scraped order exactly as before.

cleaned_data['total_savings'] = cleaned_data['price_without_discount'] - cleaned_data['price_after_discount']

cleaned_data['discount_percent'] = _discount_to_int(cleaned_data['discount_percent'])

cleaned_data

Unnamed: 0,product_name,price_without_discount,discount_percent,price_after_discount,total_savings
0,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, 128 GB...",19999,0,19999,0
1,"OnePlus Nord CE 3 Lite 5G (Pastel Lime, 256 GB...",21999,0,21999,0
2,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, 128...",19999,0,19999,0
3,"OnePlus Nord CE 2 Lite 5G (Black Dusk, 128 GB)...",19999,12,17565,2434
4,"OnePlus Nord CE 3 Lite 5G (Chromatic Gray, 256...",21999,0,21999,0
...,...,...,...,...,...
210,"OnePlus Nord 2T 5G (Gray Shadow, 256 GB) (12 ...",33990,14,29198,4792
211,"OnePlus 6T (Speed Orange, 256 GB) (10 GB RAM)",39999,0,39999,0
212,"OnePlus 6T (Mirror Black, 128 GB) (6 GB RAM)",30999,0,30999,0
213,"OnePlus 6T (Midnight Black, 128 GB) (8 GB RAM)",41999,47,21999,20000


In [12]:
# Write the cleaned table to disk as CSV, leaving out the DataFrame index
output_csv_path = r"D:\Data Analytics Projects\Flipkart_Project\flipkart_webscrapped_data.csv"
cleaned_data.to_csv(output_csv_path, index=False)