Scraping product links and prices for 5 mobile brands

In [6]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Scrape product links, names, and prices for one mobile brand.
# NOTE: despite its name, this function does not collect reviews.
def scrape_reviews(brand_xpath, max_items=20):
    """Collect up to ``max_items`` product listings for one brand filter.

    Relies on the module-level ``driver`` and ``url``: opens the search page,
    dismisses the login popup if one shows up, clicks the brand filter found
    at ``brand_xpath``, tries to pick a minimum price from the price dropdown,
    and then walks the result pages until enough products were gathered or no
    further page can be opened.

    Args:
        brand_xpath: XPath of the brand filter element to click.
        max_items: Maximum number of products to collect (default 20).

    Returns:
        A list of ``[link, name, price]`` lists. The list is empty when the
        brand filter cannot be clicked, so callers can always iterate over
        (or write out) the result.
    """
    # Open Flipkart URL
    driver.get(url)

    # Close the login popup if it appears
    try:
        close_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(),'✕')]"))
        )
        close_button.click()
    except Exception as e:
        print(f"Error closing login popup: {e}")

    # Wait for the filter section; the element itself is not needed, the wait
    # only gives the sidebar time to render before the brand is clicked.
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="container"]/div/div[3]/div[1]/div[1]'))
        )
    except Exception as e:
        print(f"Error locating filter section: {e}")

    # Click on the mobile brand
    try:
        mobile_brand = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, brand_xpath))
        )
        mobile_brand.click()
        time.sleep(10)  # Wait for the page to reload if the click is successful
    except Exception as e:
        print(f"Error clicking on mobile brand: {e}")
        # Return an empty list (not None) so the caller can still pass the
        # result straight to csv.writer.writerows().
        return []

    # Set minimum price filter to ₹20,000
    try:
        min_price = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="container"]/div/div[3]/div[1]/div[1]/div/div[1]/div/section[2]/div[4]/div[1]'))
        )
        min_price.click()

        # Select ₹20,000 from the dropdown
        min_price_option = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="container"]/div/div[3]/div[1]/div[1]/div/div[1]/div/section[2]/div[4]/div[1]/select/option[4]'))
        )
        min_price_option.click()
        time.sleep(1)  # Allow time for the selection to process
    except Exception as e:
        print(f"Error setting minimum price filter: {e}")

    # Scroll to the bottom and back to the top of the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(1)
    driver.execute_script("window.scrollTo(0, 0);")
    time.sleep(1)

    product_details = []

    # Scrape the current page, then follow 'Next' until enough items are collected
    while len(product_details) < max_items:
        time.sleep(3)  # Allow time for page to load

        try:
            product_divs = driver.find_elements(By.CSS_SELECTOR, "div._75nlfW")
            product_names = driver.find_elements(By.CSS_SELECTOR, "div.KzDlHZ")
            product_prices = driver.find_elements(By.CSS_SELECTOR, "div.Nx9bqj._4b5DiR")

            # NOTE(review): names and prices are paired with cards purely by
            # position; if a card has no price element the later prices shift
            # by one. Confirm against the live page layout.
            for i, card in enumerate(product_divs):
                if len(product_details) >= max_items:
                    break

                try:
                    link = card.find_element(By.TAG_NAME, "a").get_attribute('href')
                    name = product_names[i].text if i < len(product_names) else "N/A"
                    price = product_prices[i].text if i < len(product_prices) else "N/A"

                    print(f'Product Link: {link}, Product Name: {name}, Price: {price}')
                    product_details.append([link, name, price])
                except Exception as e:
                    # Report the failure and move on to the next card
                    print(f"Error extracting product details: {e}")

            # Go to the next page only if more products are still needed
            if len(product_details) < max_items:
                try:
                    next_button = WebDriverWait(driver, 10).until(
                        EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'Next')]"))
                    )
                    driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
                    time.sleep(1)
                    next_button.click()
                    time.sleep(3)  # Wait for the next page to load
                except Exception as e:
                    print(f"Error clicking next button: {e}")
                    break  # No more pages or an error occurred

        except Exception as e:
            print(f"Error scraping page: {e}")
            break  # Could not find phones or scrape links

    return product_details


# Search page that every brand scrape starts from
url = "https://www.flipkart.com/search?q=mobile+phones+&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&as-pos=1&as-type=HISTORY"

# Mobile brands and the XPath of their checkbox in the brand filter.
# NOTE(review): these XPaths are purely positional (div[3]..div[7]); the brand
# labels are assumed to match that order on the live page - confirm before use.
brands = {
    "Samsung": {
        "brand_xpath": '//*[@id="container"]/div/div[3]/div/div[1]/div/div[1]/div/section[3]/div[2]/div[1]/div[3]',
    },
    "Google": {
        "brand_xpath": '//*[@id="container"]/div/div[3]/div/div[1]/div/div[1]/div/section[3]/div[2]/div[1]/div[4]',
    },
    "Motorola": {
        "brand_xpath": '//*[@id="container"]/div/div[3]/div/div[1]/div/div[1]/div/section[3]/div[2]/div[1]/div[5]',
    },
    "Vivo": {
        "brand_xpath": '//*[@id="container"]/div/div[3]/div/div[1]/div/div[1]/div/section[3]/div[2]/div[1]/div[6]',
    },
    "Oppo": {
        "brand_xpath": '//*[@id="container"]/div/div[3]/div/div[1]/div/div[1]/div/section[3]/div[2]/div[1]/div[7]',
    },
}

csv_file_path = "mobile_Links.csv"

# Initialize Chrome driver
driver = webdriver.Chrome()

# try/finally guarantees the browser is closed even if scraping or file
# writing raises part-way through.
try:
    # Open Flipkart URL
    driver.get(url)

    with open(csv_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Product Link", "Product Name", "Price"])  # Write headers

        # Scrape each brand and write its rows as soon as they are available
        for brand, xpaths in brands.items():
            print(f"Scraping reviews for {brand}...")
            product_details = scrape_reviews(xpaths["brand_xpath"])
            # A failed brand may yield no result at all; write nothing for it
            # rather than crashing on a non-iterable value.
            writer.writerows(product_details or [])
finally:
    # Close the browser
    driver.quit()


Scraping reviews for Samsung...
Error closing login popup: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF6B818B095+29557]
	(No symbol) [0x00007FF6B80FFA50]
	(No symbol) [0x00007FF6B7FBB56A]
	(No symbol) [0x00007FF6B800F695]
	(No symbol) [0x00007FF6B800F8EC]
	(No symbol) [0x00007FF6B805B777]
	(No symbol) [0x00007FF6B80371CF]
	(No symbol) [0x00007FF6B805851C]
	(No symbol) [0x00007FF6B8036F33]
	(No symbol) [0x00007FF6B800116F]
	(No symbol) [0x00007FF6B80022D1]
	GetHandleVerifier [0x00007FF6B84BC96D+3378253]
	GetHandleVerifier [0x00007FF6B8508497+3688311]
	GetHandleVerifier [0x00007FF6B84FD1CB+3642539]
	GetHandleVerifier [0x00007FF6B824A6B6+813462]
	(No symbol) [0x00007FF6B810AB5F]
	(No symbol) [0x00007FF6B8106B74]
	(No symbol) [0x00007FF6B8106D10]
	(No symbol) [0x00007FF6B80F5C1F]
	BaseThreadInitThunk [0x00007FFCEFE3257D+29]
	RtlUserThreadStart [0x00007FFCF0D2AF08+40]

Product Link: https://www.flipkart.com/samsung-galaxy-s23-5g-cream-128-gb/p/itmc77ff94cdf044?pid=MOBGMFFX5XYE8MZN&li

In [7]:
import pandas as pd

# Clean the scraped link list: drop duplicates, rows without a usable link,
# products priced above 40000 (or with no parseable price), and refurbished
# listings, then save the result for the review-scraping step.

# Load the dataset
file_path = "mobile_Links.csv"
df = pd.read_csv(file_path)

# Display original data shape
print(f"Original dataset size: {df.shape}")

# Remove duplicates
df = df.drop_duplicates()

# Rows without a link cannot be opened by the review scraper, so drop them
df = df.dropna(subset=['Product Link'])

# Clean the 'Price' column: strip the currency symbol '₹' and commas, then
# convert to a number. astype(str) keeps this working even if pandas already
# parsed the column as numeric; errors='coerce' turns unparseable prices into
# NaN instead of raising, and the price filter below removes those rows.
price_text = (
    df['Price']
    .astype(str)
    .str.replace('₹', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.strip()
)
df['Price'] = pd.to_numeric(price_text, errors='coerce')

# Remove rows with price greater than 40000 (NaN prices fail the comparison
# and are removed as well)
df = df[df['Price'] <= 40000]

# Remove rows where the 'Product Link' contains the word 'refurbished'.
# na=False keeps the mask strictly boolean so that '~' cannot fail.
is_refurbished = df['Product Link'].str.contains("refurbished", case=False, na=False)
df = df[~is_refurbished]

# Display cleaned data shape
print(f"Cleaned dataset size: {df.shape}")

# Save the cleaned dataset to a new CSV file
output_file_path = "Cleaned_mobile_Links.csv"
df.to_csv(output_file_path, index=False)

print(f"Cleaned dataset saved to {output_file_path}")


Original dataset size: (100, 3)
Cleaned dataset size: (70, 3)
Cleaned dataset saved to Cleaned_mobile_Links.csv


Scraping reviews

In [8]:
import time
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup

# Function to scrape a product's name, price, ratings and reviews
def scrape_product_details(product_link, max_pages=25):
    """Scrape name, price and paginated ratings/reviews for one product.

    Uses the module-level ``driver``. The product page is loaded, the name
    and price are read from the page source, the 'All Reviews' section is
    opened, and review pages are then walked via the 'Next' button.

    Args:
        product_link: URL of the product page to open.
        max_pages: Upper bound on the number of review pages to scrape, so a
            broken 'Next' button cannot cause an endless loop (default 25).

    Returns:
        A list of ``[product_link, product_name, price, rating, review]``
        lists, one per review. The list is empty when the 'All Reviews'
        section cannot be opened or no reviews are found.
    """
    # Load the product link in the current tab
    driver.get(product_link)
    driver.refresh()  # Refresh the page to ensure it loads properly

    # Get page source and parse with Beautiful Soup
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Scrape the product name (select_one returns None when nothing matches)
    name_tag = soup.select_one("h1._6EBuvT")
    if name_tag is not None:
        product_name = name_tag.get_text(strip=True)
        print(f'Product Name: {product_name}')
    else:
        print("Could not find the product name.")
        product_name = "N/A"

    # Scrape the product price
    price_tag = soup.select_one("div.Nx9bqj.CxhGGd")
    if price_tag is not None:
        price = price_tag.get_text(strip=True)
        print(f'Product Price: {price}')
    else:
        print("Could not find the product price.")
        price = "N/A"

    # Click the 'All Reviews' section
    try:
        all_reviews_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "div._23J90q.RcXBOT"))
        )
        all_reviews_button.click()
    except Exception:
        print("Could not find or click on the 'All Reviews' button.")
        return []

    # Wait for reviews to load
    time.sleep(5)  # Adjust this time if needed

    # List to store scraped review data
    review_data = []

    # Scrape ratings and reviews across at most max_pages pages
    for page_number in range(1, max_pages + 1):
        time.sleep(5)  # Allow time for reviews to load

        # Ratings come from a parsed snapshot of the page, reviews from live
        # elements so that their 'Read More' buttons can be clicked.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        ratings = soup.select("div.XQDdHH.Ga3i8K")
        reviews = driver.find_elements(By.CSS_SELECTOR, "div.ZmyHeo")

        if not ratings or not reviews:
            print("No ratings or reviews found. Exiting...")
            break

        # NOTE(review): ratings and reviews are paired by position and zip()
        # silently stops at the shorter list - confirm the two selectors
        # always match the same number of nodes.
        for rating, review in zip(ratings, reviews):
            # Expand the review first when it has a 'Read More' button.
            # find_elements returns an empty list when there is none, so a
            # missing button is the normal path rather than an exception.
            try:
                read_more_buttons = review.find_elements(By.CSS_SELECTOR, "span.b4x-fr")
                if read_more_buttons:
                    driver.execute_script("arguments[0].click();", read_more_buttons[0])
                    time.sleep(1)  # Allow time for the review to expand
            except Exception as e:
                # Best effort: fall back to whatever text is currently shown
                print(f"Could not expand review: {e}")

            # Flatten the review text onto a single line
            full_review_text = review.text.replace('\n', ' ').replace('\r', '').strip()

            print(f'Rating: {rating.text}, Review: {full_review_text}')
            review_data.append([product_link, product_name, price, rating.text, full_review_text])

        # Page limit reached: do not navigate to a page that will not be scraped
        if page_number >= max_pages:
            break

        # Check for the next button and move to the next page if available
        try:
            next_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//span[contains(text(),'Next')]"))
            )
            driver.execute_script("arguments[0].scrollIntoView(true);", next_button)
            time.sleep(1)  # Allow some time for scrolling
            driver.execute_script("arguments[0].click();", next_button)
            time.sleep(5)  # Wait for the next page to load
        except Exception as e:
            print("No more pages or an error occurred:", e)
            break

    return review_data  # Return the collected review data

# Read product links from the CSV file first, so that a missing or malformed
# input file fails before a browser window has been opened.
input_csv_file_path = "Cleaned_mobile_Links.csv"
with open(input_csv_file_path, mode='r', encoding='utf-8') as file:
    product_links = [row['Product Link'] for row in csv.DictReader(file)]

# Start the output CSV file with its header row
output_csv_file_path = "Mobile_Phone_Data.csv"
with open(output_csv_file_path, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Product Link", "Product Name", "Price", "Rating", "Review"])

# Initialize Chrome driver
driver = webdriver.Chrome()

# try/finally guarantees the browser is closed even if a product page raises
try:
    # Loop through each product link and scrape details
    for link in product_links:
        if not link:
            continue  # Nothing to open for a blank link

        review_data = scrape_product_details(link)
        if review_data:  # Only save if there's data to save
            # Append after every product so that results collected so far
            # are already on disk if a later product fails.
            with open(output_csv_file_path, mode='a', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                writer.writerows(review_data)

        # Reuse the same tab for the next product
        driver.get("about:blank")  # Clear the page before loading the next URL
finally:
    # Quit the browser after all reviews have been scraped (or on error)
    driver.quit()

Product Name: SAMSUNG Galaxy S23 5G (Cream, 128 GB)  (8 GB RAM)
Product Price: ₹39,999
Rating: 5, Review: Superb phone....
Rating: 5, Review: Amazing camera and Good performance..best choice
Rating: 5, Review: This time Samsung does very well. Currently Best Premium Smartphone in market.
Rating: 5, Review: Amazing Product, Light Weight, Best Camera, Handy, Compact Premium Smart Phone.
Rating: 5, Review: Portrait mode is excellent. Night mode performance is very good.pictures are clear and sharp. Charging speed could have been better but yes battery is optimised very well so easily lasts for a day.
Rating: 5, Review: Camera quality is like dslr. Compact phone with powerful features. Battery is also good which last a entire day. Overall happy to choose this against iPhone 14
Rating: 5, Review: This is the best Android device available to those who don't like big phones but want everything that a big phone can do.  Everything is super smooth here.  Great camera quality and great video rec

In [1]:
import pandas as pd

# Step 1: Load the Data
# The review scraper writes "Mobile_Phone_Data.csv"; the name is spelled
# identically here because file names are case-sensitive on many systems.
file_path = 'Mobile_Phone_Data.csv'
data = pd.read_csv(file_path)

# Step 2: Inspect the Data
print(data.head())  # Show the first few rows
data.info()  # Prints the summary itself; wrapping it in print() would add a stray "None"

# Step 3: Handle Missing Values
# Check for missing values
missing_values = data.isnull().sum()
print("Missing values per column:\n", missing_values)

# Option 1: Remove rows with missing values
data = data.dropna()

# Option 2: Fill missing values (for example, with the mean for numerical columns)
# data['column_name'].fillna(data['column_name'].mean(), inplace=True)

# Step 4: Remove Duplicates
data = data.drop_duplicates()

# Step 5: Standardize Data Types
# Convert columns to appropriate data types (example for a numerical column)
# data['price'] = pd.to_numeric(data['price'], errors='coerce')

# Step 6: Clean Text Data
# Remove extra whitespace and convert text to lower case
data['Review'] = data['Review'].str.strip().str.lower()

# Remove special characters (optional): everything except ASCII letters,
# digits and whitespace is deleted
data['Review'] = data['Review'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

# Step 7: Normalize Categorical Data
# Example: Standardizing a categorical column
# data['brand'] = data['brand'].str.lower().str.strip()

# Step 8: Feature Engineering (if needed)
# Example: Create a new feature based on existing columns
# data['price_per_feature'] = data['price'] / data['features_count']

# Save the cleaned data to a new CSV file
data.to_csv('cleaned_mobile_phone_data.csv', index=False)

# Display the cleaned data
print(data.head())


                                        Product Link  \
0  https://www.flipkart.com/samsung-galaxy-s23-5g...   
1  https://www.flipkart.com/samsung-galaxy-s23-5g...   
2  https://www.flipkart.com/samsung-galaxy-s23-5g...   
3  https://www.flipkart.com/samsung-galaxy-s23-5g...   
4  https://www.flipkart.com/samsung-galaxy-s23-5g...   

                                        Product Name    Price  Rating  \
0  SAMSUNG Galaxy S23 5G (Cream, 128 GB)  (8 GB RAM)  ₹39,999       5   
1  SAMSUNG Galaxy S23 5G (Cream, 128 GB)  (8 GB RAM)  ₹39,999       5   
2  SAMSUNG Galaxy S23 5G (Cream, 128 GB)  (8 GB RAM)  ₹39,999       5   
3  SAMSUNG Galaxy S23 5G (Cream, 128 GB)  (8 GB RAM)  ₹39,999       5   
4  SAMSUNG Galaxy S23 5G (Cream, 128 GB)  (8 GB RAM)  ₹39,999       5   

                                              Review  
0                                   Superb phone....  
1   Amazing camera and Good performance..best choice  
2  This time Samsung does very well. Currently Be...  
3  A