In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd

# URL of the website
url = "https://www.flipkart.com/search?q=iqoo&otracker=search&otracker1=search&marketplace=FLIPKART&as-show=on&as=off&otracker=nmenu_sub_Electronics_0_IQOO+3&p%5B%5D=facets.price_range.from%3D20000&p%5B%5D=facets.price_range.to%3DMax"
def initialize_driver():
    driver = webdriver.Chrome()  # Ensure ChromeDriver is in your PATH
    driver.maximize_window()
    return driver

def load_page(driver, url):
    driver.get(url)
    time.sleep(5)  # Wait for the page to load

# Function to scrape product names, links, and prices
def scrape_product_data(driver):
    product_names = [name.text for name in driver.find_elements(By.CLASS_NAME, 'KzDlHZ')]
    product_links = [link.get_attribute('href') for link in driver.find_elements(By.XPATH, '//a[@class="CGtC98"]')]
    product_prices = [price.text for price in driver.find_elements(By.CLASS_NAME, 'hl05eU')]  # Adjusted class name for prices
    
    # Return product data
    return product_names, product_links, product_prices

# Function to scrape multiple pages
def scrape_multiple_pages(driver, base_url, num_pages):
    all_product_names = []
    all_product_links = []
    all_product_prices = []
    
    for page in range(1, num_pages + 1):
        load_page(driver, f"{base_url}&page={page}")  # Update the URL to include the page number
        product_names, product_links, product_prices = scrape_product_data(driver)
        
        # Ensure the lists are of the same length before extending
        if len(product_names) == len(product_links) == len(product_prices):
            all_product_names.extend(product_names)
            all_product_links.extend(product_links)
            all_product_prices.extend(product_prices)
        else:
            print(f"Warning: Mismatched data on page {page}. Names: {len(product_names)}, Links: {len(product_links)}, Prices: {len(product_prices)}")

        time.sleep(5)  # Wait before loading the next page
    
    return all_product_names, all_product_links, all_product_prices

# Initialize WebDriver and scrape multiple pages
driver = initialize_driver()
all_product_names, all_product_links, all_product_prices = scrape_multiple_pages(driver, url, 2)  # Adjust number of pages as needed

# Close the driver
driver.quit()

# Create a DataFrame to store the results
df = pd.DataFrame({
    'Product_Name': all_product_names,
    'Product_Link': all_product_links,
    'Product_Price': all_product_prices  # Updated to include product prices
})

# Display or save the scraped data
df.head()  # Display the DataFrame
Output_path = "F:\\Projects\\Flipkart\\flipkart_scrape_iq.csv"
df.to_csv(Output_path, index=False)
#df.to_csv('flipkart_scrape_redmi.csv', index=False)  # Save the scraped data to a CSV file

In [6]:
import pandas as pd
df = pd.read_csv("F:\\Projects\\Flipkart\\flipkart_scrape_iq.csv")

In [7]:
import re

# Function to extract and clean the price (remove rupee symbol and commas)
def extract_clean_price(price_string):
    # Find the rupee symbol followed by the price
    match = re.search(r'₹(\d[\d,]*)', price_string)
    if match:
        # Remove the rupee symbol and commas, and convert to an integer
        return int(match.group(1).replace(',', ''))
    return None

# Apply the function to the 'Price' column
df['Product_Price'] = df['Product_Price'].apply(extract_clean_price)

In [8]:
df = df[df['Product_Price'] <= 40000]

In [9]:
Output_path = "F:\\Projects\\Flipkart\\flipkart_cleaned_iq.csv"
df.to_csv(Output_path, index=False)

In [1]:
import pandas as pd
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException

# Load the CSV file containing product links
file_path = r"F:\Projects\Flipkart\flipkart_cleaned_iq.csv"
df = pd.read_csv(file_path)

# Initialize the Chrome driver
driver = webdriver.Chrome()

# Function to extract reviews and ratings from a product page
def extract_reviews_and_ratings(driver):
    reviews = []
    ratings = []

    # Wait for the reviews section to load
    wait = WebDriverWait(driver, 10)
    try:
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'ZmyHeo')))
    except TimeoutException:
        print("Timed out waiting for reviews to load.")
        return reviews, ratings  # Return empty lists if timed out

    # Extract reviews
    review_elements = driver.find_elements(By.CLASS_NAME, "ZmyHeo")
    for element in review_elements:
        try:
            # Click "Read More" if available
            read_more = element.find_elements(By.CLASS_NAME, "b4x-fr")
            if read_more:
                driver.execute_script("arguments[0].click();", read_more[0])
                time.sleep(1)  # Wait for the full review to load
            
            reviews.append(element.text)
        except StaleElementReferenceException:
            continue

    # Extract star ratings
    rating_elements = driver.find_elements(By.CLASS_NAME, "XQDdHH.Ga3i8K")
    for i in range(max(len(reviews), len(rating_elements))):
        # Append ratings or None if not available
        if i < len(rating_elements):
            ratings.append(rating_elements[i].text)
        else:
            ratings.append(None)  # Placeholder for missing ratings

    return reviews, ratings

# Function to load the page with the correct page number in the URL
def load_page(driver, url):
    driver.get(url)
    wait = WebDriverWait(driver, 10)
    try:
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'ZmyHeo')))
    except TimeoutException:
        print("Timed out waiting for reviews page to load.")

# Scrape reviews and ratings for all product links
all_data = []

num_pages_reviews = 20  # Number of review pages to scrape per product

# Loop through each product link in the DataFrame
for index, product_link in df['Product_Link'].items():
    print(f"Scraping product {index + 1}/{len(df)}: {product_link}")
    
    # Scrape reviews for the current product
    driver.get(product_link)
    time.sleep(5)  # Wait for the product page to load

    # Click on the 'All Reviews' button if it exists
    try:
        wait = WebDriverWait(driver, 10)
        all_reviews_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, '_23J90q.RcXBOT')))
        all_reviews_button.click()
        time.sleep(5)  # Wait for the reviews page to load
    except TimeoutException:
        print(f"Warning: Could not find 'All Reviews' button for {product_link}. Continuing to next product.")
        continue  # Skip to the next product if no reviews found

    # Scrape reviews and ratings across multiple review pages
    for page in range(1, num_pages_reviews + 1):
        if page > 1:
            # Update the URL to navigate to the next page of reviews
            page_url = f"{driver.current_url}&page={page}"
            load_page(driver, page_url)
            time.sleep(8)

        reviews, ratings = extract_reviews_and_ratings(driver)
        
        # Check if reviews are empty, and if so, stop scraping further pages
        if not reviews:
            print(f"Warning: No more reviews found on page {page}")
            continue

        # Append reviews and ratings to the all_data list along with the product link
        for review, rating in zip(reviews, ratings):
            all_data.append({
                'Product_Link': product_link,
                'Review': review,
                'Rating': rating
            })

        time.sleep(5)  # Wait before loading the next reviews page

# Close the driver after scraping is complete
driver.quit()

# Convert the collected data into a DataFrame and save it as a CSV file
result_df = pd.DataFrame(all_data)

# Save to CSV
output_file = r"F:\Projects\Flipkart\iq_reviews_ratings_all_products.csv"
result_df.to_csv(output_file, index=False)

print(f"Scraping complete. Data saved to {output_file}")

Scraping product 1/41: https://www.flipkart.com/iqoo-z7-pro-5g-graphite-matte-256-gb/p/itm5f318d2e3e6d3?pid=MOBHFTYM5FMDHS6U&lid=LSTMOBHFTYM5FMDHS6UPHIUP5&marketplace=FLIPKART&q=iqoo&store=tyy%2F4io&srno=s_1_1&otracker=search&otracker1=search&iid=bf967b75-2300-486a-b9c6-f6c700b7d2f5.MOBHFTYM5FMDHS6U.SEARCH&ssid=w11yhvq5eo0000001729620319531&qH=1cc40088ec90b2d2
Timed out waiting for reviews page to load.
Timed out waiting for reviews to load.
Timed out waiting for reviews page to load.
Timed out waiting for reviews to load.
Timed out waiting for reviews page to load.
Timed out waiting for reviews to load.
Timed out waiting for reviews page to load.
Timed out waiting for reviews to load.
Timed out waiting for reviews page to load.
Timed out waiting for reviews to load.
Timed out waiting for reviews page to load.
Timed out waiting for reviews to load.
Timed out waiting for reviews page to load.
Timed out waiting for reviews to load.
Timed out waiting for reviews page to load.
Timed out wa

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("F:\Projects\Flipkart\iq_reviews_ratings_all_products.csv")

In [4]:
df1 = pd.read_csv("F:\\Projects\\Flipkart\\flipkart_cleaned_iq.csv")

In [5]:
df = pd.merge(df, df1, on = 'Product_Link', how='inner')

In [6]:
output_file = r"F:\Projects\Flipkart\iq_merged.csv"
df.to_csv(output_file, index=False)

In [1]:
import pandas as pd
df = pd.read_csv("F:\Projects\Flipkart\iq_merged.csv")

In [2]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Cleaning function
def clean_text(text):
    
    # Replace curly apostrophe ’ with straight apostrophe '
    text = text.replace("’", "'")
    
    # Remove bullet points and other unnecessary symbols, keeping only regular apostrophes
    text = re.sub(r"[^a-zA-Z\s']", '', text)

    # Remove unnecessary apostrophes: e.g., at the end of words like "best'"
    text = re.sub(r"\b'\b|'\B|\B'", '', text)
    
    # Lowercase the text
    text = text.lower()
    
    # Remove numbers (optional, keep this if you don't want numbers)
    text = re.sub(r'\d+', '', text)
    
    # Tokenize the text
    tokens = text.split()
    
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

# Apply the cleaning function to each review
df['Review'] = df['Review'].apply(clean_text)

In [3]:
import re

# Function to remove emojis
def remove_emojis(text):
    # Regular expression to match emojis
    
    emoji_pattern = re.compile(
        "[\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002700-\U000027BF"  # Dingbats
        "\U000024C2-\U0001F251" 
        "]+", flags=re.UNICODE)
    
    return emoji_pattern.sub(r'', text)  # Remove emojis

# Apply the function to the 'Review' column
df['Review'] = df['Review'].apply(remove_emojis)

In [4]:
from autocorrect import Speller

# Initialize the spell checker for English
spell = Speller(lang='en')  # Specify English language

# Function to correct spelling in a review
def correct_spelling(text):
    # Check if the input is a string
    if isinstance(text, str):
        return ' '.join([spell(word) for word in text.split()])
    return text  # Return as is for non-string inputs


# Apply the spelling correction function to the 'Review' column
df['Review'] = df['Review'].apply(correct_spelling)

In [5]:
import pandas as pd
import re

# Custom dictionary of words to remove or replace
custom_dict =  {
    'w': 'watt',  "havent": "have not", "hasnt": "has not", 'u': "you", 'r': "are", "mp": 'million pixels', 
    "ai": "artificial intelligence", "ui": "user interface", "io": "iphone operating system",  "sot": "special operation team", "le": "less", 
    "ffd": "full high definition", "dont": "do not", "mb": "megabyte", "cam": "camera", "came": "camera", "avg": "average", "min": 'minutes',  
    "yea": "yeah", "lil": "little", "its": "it is", "very": "very", "n": "and", "cant": "cannot", "dis": "this", "v": "we", "gen": "generation", 
    "hdr": 'high dynamic range', "didnt": "did not", "ive": "i have", "ur": "your", "wont": 'will not', 'issuehrs': "issue hours", 'doomed': "zoomed", 
    'fps': "frames per second", 'ois': "optical image stabilization", 'theyll': "they will", 'ig': "instagram", 'bbd': "bigger better deal",
    "hd": 'high definition', 'cleanui': "clean user interface", "tatics": "haptics", "sd": "secure digital", 'denmark': 'dxomark',
    "usp": 'unique selling proposition', 'degc': "degree celsius", "tatic": "haptic", "unbuilt": "inbuilt", 'xiomi': "xiaomi", 'thik': "think", "tooo": "too",
    'regreating': "regretting", 'fyi': "for your information", 'cemra': "camera", 'fastly': "fast", "optimise": 'optimize', 'osum': 'awesome', 
    'vi': "vodafone", 'upi': "unified payments interface", 'eyeturner': "eye turner", 'banger': "banger", 're': "resolution", 'goddamn': "goddamn", 
    'aint': "am not", 'plesently': "pleasantly", "uisvery": 'user interface very', 'gif': "graphics interchange format", 'siz': "six", 'iphones': "iphone",
    'youre': 'you are', 'doubtbut': "doubt but", 'phome': "phone", 'red': "redmi", 'okif': "okay if", 'pic': "picture", 'smatter': 'smarter', 'kkk': 'okay',
    'membrane': "ambrane", "holdnew": 'hold new', 'swine': "swipe", 'pixelated': "pixel related", 'cameraai': "camera artificial intelligence", 
    'oppos': 'oppo', 'amaze': "amazing", 'daytona': "day to day", 'offmy': "off my", 'laggy': "lag", 'slowlike': "slow like", 'opp': "oppo", 
    'wholeday': "full day", 'hdfc': "hdfc", 'wil': "will", 'commendable': "recommendable", "kinda": 'kind of', 'baku': "vaku", 'onto': 'into',
    'beat': "best", 'surfed': "suffered", 'bmi': "battle ground mobile india", 'isnt': "is not", "oplus": "oneplus", 'approx hr sot': 'approximately hour screen on time',
    'flickering': "flickering", 'least': "at least", 'doun': "down", 'thats': "that is", 'ill': "i will", 'bout': "about", 
    'overrated': "overrated", 'butter': "better", 'emi': "equated monthly installment", 'prefect': "perfect", 'ketone': "keyone", 
    'ie': "in other words", 'cuz': "because", 'mah': "milliampere hour", 'asus': "asus", "youve": "you have", 'ott': "over the top", 
    'oct': "october", 'ip': "iphone", 'nowhope': "now hope", 'eraserunblur': "eraser focus", "wifi": "wireless fidelity", 'suppppppb': 'super',
    'ok': "okay", 'hiccup': "hiccup", 'slowness': "slowness", 'janso': "january so", 'mahmaybe': "milliampere hour may be", 'luv': 'love',
    'lovable': "lovable", 'etc': "et cetera", 'miami': 'xiaomi', 'realm': 'realme', 'frm': 'from', 'mob':'mobile', 'approx': 'approximately', 'ok': "okay", 
    'oplus': 'oneplus', 'amp': "ampere", "gen": "generation", 'onepluss': 'oneplus', 'vary':'very', 'barry': 'battery', 'doesnt': "does not", 'supper': 'super', 
    'lvl': 'level',  'sup': 'super', 'pr': 'product', 'bos': 'range boost', 'beast': 'best', 'dlr': 'digital single lens reflex', 'batter': 'better', 'cod': 'call of duty', 
    'nyc': 'nice', 'extent': 'excellent','nice hoon': 'nice phone','gr': 'great','math': 'match','bt': 'but', 'ph': 'phone', 'op': 'oneplus','assume': 'awesome',
    'uv': 'ultra violet', 'vry': 'very', 'easilllyyy': 'easily', 'yr': 'year',  'dslr': 'digital single lens reflex', 'mic failed': 'microphone failed','premium paper': 'premium feel',
    'flipcard': 'flipkart', 'wiki': 'wikipedia', 'mobil': 'mobile', 'prod': 'production', 'wifi': 'wireless fidelity', 'osm dolly atoms': 'awesome dolby atmos', 
    'nfc': "near field communication", 'plz': "please", 'perpomence': "performance", 'phn': "phone", 'camara': 'camera', 'fps': 'frames per second',
    'sd gen': 'snapdragon generation', 'superb': 'super', 'professor': 'processor', 'sound college': 'sound quality', 'dam': 'damn', 'opp': 'oppo', 'aws': 'awesome',
    'vry gd': 'very good', 'supppoppp': 'super', 'prosesar': 'processor','hr': 'hour', 'gonna': 'going to','pub': 'pubg','gun': 'good','jus': 'just',
    }

# List of words to remove
remove_list = ['pm', 'f', 'tho', 'h', 'th', 'gn', 'xr', 'xl', 'am', 'tg', 'p', 'z', 'cc', 'g', 'hz', 'karma', 'nts', 'se', 'pro',
               'hijab','ft', 'emu', 'gpu', 'fr', 'nit', 'c', 'ly', 'pg', 'sec', 'um', 'fei', 'era', 'yep', 'q', 'c', 'sw', 'b', 'kd', 'ofc','sar',
              'ly', 'oo', 'hh', 'va', 'eis', 'bl', '°c', 'lol', '₹', 'lg', 'al', 'sm', 'rsk', 'gh', 'nu', 'ota', 'et', 'rgb', 'dts', 'nt', 'iu', 'vr',
              'fm', 'fp','ota', 'ordo', 'gre', '•', 'eg', 'pl', 'eis', 'haha', 'bd', 'tad' ]
              
# Function to clean the text
def clean_text(text, custom_dict, remove_list):
    # Remove specified words from remove_list
    pattern_remove = r'\b(' + '|'.join(re.escape(word) for word in remove_list) + r')\b'
    cleaned_text = re.sub(pattern_remove, '', text, flags=re.IGNORECASE)

    # Replace words according to custom_dict
    for word, replacement in custom_dict.items():
        cleaned_text = re.sub(r'\b' + re.escape(word) + r'\b', replacement, cleaned_text, flags=re.IGNORECASE)
    
    return cleaned_text.strip()

# Assuming 'df' is your DataFrame with a 'Review' column
df['Review'] = df['Review'].apply(lambda x: clean_text(x, custom_dict, remove_list))

In [6]:
output_file = r"F:\Projects\Flipkart\iq_text_cleaned.csv"
df.to_csv(output_file, index=False)