In [2]:
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class IndiaMartScraper:
    """Selenium-driven scraper that collects listing names and addresses from search results.

    Scraped rows accumulate in ``self.df``, a DataFrame with the columns
    ``Category``, ``Name`` and ``Address``. One instance owns one Chrome
    session; ``run`` always closes that session when it finishes.
    """

    # Ids of the four single-character OTP input boxes, in typing order.
    _OTP_FIELD_IDS = ('first', 'second', 'third', 'fourth')

    def __init__(self, driver_path):
        """Start a Chrome session.

        Args:
            driver_path: Filesystem path to the chromedriver executable.
        """
        self.driver_path = driver_path
        # The ``executable_path`` keyword of ``webdriver.Chrome`` is deprecated
        # (and removed in newer Selenium 4 releases); the driver location is
        # passed through a Service object instead.
        self.driver = webdriver.Chrome(service=Service(executable_path=self.driver_path))
        # Shared explicit wait: every ``self.wait.until`` call gives up after 20 s.
        self.wait = WebDriverWait(self.driver, 20)
        self.df = pd.DataFrame(columns=['Category', 'Name', 'Address'])

    def open_website(self, url):
        """Navigate to ``url`` and maximize the browser window."""
        self.driver.get(url)
        self.driver.maximize_window()

    def search_product(self, search_text, id):
        """Type ``search_text`` into the input whose element id is ``id`` and press Enter.

        Args:
            search_text: Text to search for.
            id: Element id of the search box (callers in this class pass
                'search-input' or 'search_string').
        """
        search_box = self.driver.find_element(By.ID, f'{id}')
        search_box.clear()
        search_box.send_keys(search_text)
        search_box.send_keys(Keys.RETURN)

    def sign_in(self, phone_number):
        """Click the 'user_sign_in' element, then submit ``phone_number`` in the 'mobile' field."""
        button = self.wait.until(EC.element_to_be_clickable((By.ID, 'user_sign_in')))
        button.click()
        mobile_field = self.wait.until(EC.element_to_be_clickable((By.ID, 'mobile')))
        mobile_field.send_keys(phone_number)
        mobile_field.send_keys(Keys.RETURN)

    def generate_otp(self):
        """Click the 'passwordbtn1' element after a fixed 20 s pause, then pause 5 s more.

        NOTE(review): presumably this button requests the OTP - confirm
        against the live page.
        """
        time.sleep(20)  # Wait for the OTP to be generated
        button = self.driver.find_element(By.ID, 'passwordbtn1')
        button.click()
        time.sleep(5)

    def enter_otp(self, otp):
        """Type the OTP one character per input box.

        Surrounding whitespace is stripped first. An OTP shorter than four
        characters fills only the leading boxes.

        Args:
            otp: The one-time password, at most four characters after stripping.

        Raises:
            ValueError: If the OTP has more characters than there are input
                boxes. Raised before anything is typed.
        """
        digits = str(otp).strip()
        if len(digits) > len(self._OTP_FIELD_IDS):
            raise ValueError(
                f"OTP must have at most {len(self._OTP_FIELD_IDS)} characters, got {len(digits)}"
            )
        for field_id, char in zip(self._OTP_FIELD_IDS, digits):
            xpath = f"//input[@id='{field_id}' and contains(@class, 'mobbox1 f1 border_black1')]"
            element = self.wait.until(EC.visibility_of_element_located((By.XPATH, xpath)))
            element.send_keys(char)

    def scroll_to_bottom(self):
        """Scroll down repeatedly until the page height stops growing."""
        last_height = self.driver.execute_script("return document.body.scrollHeight")
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # give lazily loaded content time to extend the page
            new_height = self.driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

    def click_show_more_results(self):
        """Scroll the 'Show more results' button into view and click it via JavaScript.

        The wait raises a timeout error if the button does not appear within
        the shared wait period.
        """
        element = self.wait.until(EC.presence_of_element_located((By.XPATH, "//div[@class='showmoreresultsdiv']/button[text()='Show more results']")))
        self.driver.execute_script("arguments[0].scrollIntoView();", element)
        self.driver.execute_script("arguments[0].click();", element)

    def _load_more_results(self, max_clicks=10, pause=5):
        """Click 'Show more results' up to ``max_clicks`` times, scrolling after each click.

        Stops early when the button can no longer be found, so a search with
        fewer pages than ``max_clicks`` does not abort the caller.

        Args:
            max_clicks: Maximum number of clicks to attempt.
            pause: Seconds to sleep before each click and before each scroll.

        Returns:
            The number of successful clicks.
        """
        clicks = 0
        for _ in range(max_clicks):
            time.sleep(pause)
            try:
                self.click_show_more_results()
            except TimeoutException:
                # No button left to click: every available result is loaded.
                break
            clicks += 1
            time.sleep(pause)
            self.scroll_to_bottom()
        return clicks

    def scrape_data(self, search_text):
        """Append one row per (name, address) pair found on the current page to ``self.df``.

        Names come from the text of elements with class 'cardlinks'; addresses
        from the ``textContent`` of 'span.db.to-txt-area.lh16.tal p.tac.wpw'
        elements. The two lists are paired by position and the longer one is
        truncated.

        NOTE(review): the captured output suggests 'cardlinks' matches both
        product titles and company names, which would misalign names and
        addresses - verify the selector against the page.

        Args:
            search_text: Value stored in the ``Category`` column of every new row.
        """
        name_elements = self.wait.until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'cardlinks')))
        address_elements = self.wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'span.db.to-txt-area.lh16.tal p.tac.wpw')))

        names = [el.text for el in name_elements]
        addresses = [el.get_attribute('textContent') for el in address_elements]

        records = [
            {'Category': search_text, 'Name': name, 'Address': address}
            for name, address in zip(names, addresses)
        ]
        if not records:
            return

        # Build the new rows once and concatenate once: DataFrame.append no
        # longer exists in pandas 2.x and row-by-row growth is quadratic.
        new_rows = pd.DataFrame(records, columns=['Category', 'Name', 'Address'])
        if self.df.empty:
            self.df = new_rows
        else:
            self.df = pd.concat([self.df, new_rows], ignore_index=True)

    def close_browser(self):
        """Quit the browser session."""
        self.driver.quit()

    def Get_Data(self):
        """Search every seller listed in 'Product-Category.xlsx' and scrape the results.

        Reads the ``Seller`` column and processes rows from index 1 onward.
        NOTE(review): row 0 is skipped - presumably because ``run`` already
        handled it; confirm this is intended.
        """
        data = pd.read_excel('Product-Category.xlsx')
        sellers = data['Seller']
        for i in range(1, len(sellers)):
            search_text = sellers[i]
            self.driver.execute_script("window.scrollTo(0, 0);")
            self.search_product(search_text, 'search_string')
            # Load all result pages first, then scrape once; scraping inside
            # the load loop would re-append every already-loaded card.
            self._load_more_results()
            self.scrape_data(search_text)

    def run(self, url, search_text, phone_number):
        """Run one full session: search, sign in with an OTP, load results, scrape and save.

        The scraped frame is printed and written to '<search_text>.xlsx'. The
        browser is closed whether or not the session succeeds.

        Args:
            url: Site URL to open.
            search_text: Search term; also used as the category label and
                the output file stem.
            phone_number: Phone number submitted in the sign-in form.
        """
        try:
            self.open_website(url)
            self.search_product(search_text, 'search-input')
            self.sign_in(phone_number)
            self.scroll_to_bottom()
            self.click_show_more_results()

            self.generate_otp()
            otp = input('Enter OTP\n')
            self.enter_otp(otp)
            self._load_more_results()
            self.driver.execute_script("window.scrollTo(0, 0);")
            self.scrape_data(search_text)
            print(self.df)
            self.df.to_excel(f'{search_text}.xlsx')
        finally:
            self.close_browser()


if __name__ == "__main__":
    # Session settings shared by every search below.
    driver_path = 'C:/web_driver/chromedriver.exe'
    url = 'https://www.indiamart.com/'
    excel_file = pd.read_excel('Product-Category.xlsx')
    phone_number = '9871308613'

    # One fresh browser session per seller whose Status is not "Scrapped".
    # NOTE: Status is only read here; nothing in this cell writes "Scrapped"
    # back to the sheet after a successful run.
    for i in range(len(excel_file)):
        if excel_file['Status'][i] != "Scrapped":
            search_text = excel_file['Seller'][i]
            scraper = IndiaMartScraper(driver_path)
            scraper.run(url, search_text, phone_number)
        
    

  self.driver = webdriver.Chrome(executable_path=self.driver_path)


Enter OTP
4820
    Category                                    Name  \
0       TARP  Blue Woven Truck Waterproof Tarpaulins   
1       TARP                      Lamifab Industries   
2       TARP                  Plain Truck Tarpaulins   
3       TARP        Gujarat Craft Industries Limited   
4       TARP              Waterproof Truck Tarpaulin   
..       ...                                     ...   
168     TARP                      Blue PVC Tarpaulin   
169     TARP                     Gulati Canvas Store   
170     TARP   Modern Car Tarpaulins, For Industrial   
171     TARP       Rainproof Exports Private Limited   
172     TARP                PVC Coated Yellow Tirpal   

                                               Address  
0    Plot No. 4702,4703,4704,4705& 4604, 4605 , Pla...  
1    No. 431, Santej-Vadsar Road, Tal. Kalol, Kalol...  
2    Plot No 401 402 403 404 Paraj Station Road Mah...  
3    gala No 5 Shed No 2 Bombay Agra Road Hanuman S...  
4    Survey No. 79, N. H. 2

# Aggregator

In [1]:
import pandas as pd
import os

def aggregate_specific_excel_files(input_directory, output_file, file_list):
    """Concatenate selected Excel files from a directory into one Excel file.

    Only files that are both present in ``input_directory`` and named in
    ``file_list`` are read; names in ``file_list`` with no matching file are
    ignored. Files are combined in sorted filename order so the row order of
    the output is reproducible. If nothing matches, an empty workbook is
    written.

    Args:
        input_directory: Directory that holds the source Excel files.
        output_file: Path of the Excel file to write (written without the index).
        file_list: Iterable of filenames (not paths) to include.
    """
    wanted = set(file_list)

    # Read every requested file, then concatenate once; concatenating inside
    # the loop re-copies all earlier rows on each iteration.
    frames = [
        pd.read_excel(os.path.join(input_directory, filename))
        for filename in sorted(os.listdir(input_directory))
        if filename in wanted
    ]

    # pd.concat raises on an empty list, so fall back to an empty frame.
    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    combined_df.to_excel(output_file, index=False)

if __name__ == "__main__":
    # Directory containing the per-category Excel files (replace with your own).
    input_directory = r'C:\Users\user\Desktop\Data\Personal\Portfolio-Projects\Indiamart'
    # Name of the combined workbook to write.
    output_file = 'Indiamart-28-Jun.xlsx'
    # Specific files to aggregate (replace with your own list).
    file_list = [
        'TOY_MODEL_VEHICLE_TRACK.xlsx',
        'TOY_VEHICLE_SET.xlsx',
        'Travel Pillows.xlsx',
        'TWO WAY RADIO.xlsx',
        'UNDERPANTS.xlsx',
        'Vehicle Parts.xlsx',
        'WINDOW_FILM.xlsx',
        'WINDOW_SHADE.xlsx',
        'Noodles brand.xlsx',
        'Maggi.xlsx',
        'Sunfeast Yippee Noodles.xlsx',
        'Knorr Soupy Noodles.xlsx',
        'Top Ramen.xlsx',
        'Ching’s Secret Instant Noodles.xlsx',
        'Patanjali Atta Noodles.xlsx',
        'Wai Wai Noodles.xlsx',
        'Horlicks Foodles.xlsx',
        'Biscuit Brand.xlsx',
        'Parle-G.xlsx',
        'Britannia Biscuits.xlsx',
        'Sunfeast Biscuits.xlsx',
        'Parle Hide & Seek Biscuits.xlsx',
        'Mcvities Biscuit.xlsx',
        'Parle Krack Jack Biscuits.xlsx',
        'Oreo.xlsx',
    ]

    aggregate_specific_excel_files(input_directory, output_file, file_list)
    print(f"Specified Excel files have been aggregated into {output_file}")


Specified Excel files have been aggregated into Indiamart-28-Jun.xlsx
